Skip to content

Commit 9cafa2c

Browse files
authored
Create IndexParallelTaskSpec (#58)
* Create IndexParallelTaskSpec; also clean up the folder structure and sync it with the Druid project structure
* Clean up DimensionsSpec
* KinesisIndexTaskIOConfig
* DruidInputSource
* DimensionsSpec
* Update tests
* Fix test
* Fix tests for real
1 parent 14a5854 commit 9cafa2c

19 files changed

Lines changed: 311 additions & 32 deletions

File tree

Sources/DataTransferObjects/Supervisor/InputFormat.swift renamed to Sources/DataTransferObjects/Druid/data/input/InputFormat.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
/// https://druid.apache.org/docs/latest/ingestion/data-formats/#input-format
2+
///
3+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/InputFormat.java
24
public struct InputFormat: Codable, Hashable, Equatable {
35
public init(type: InputFormat.InputFormatType, keepNullColumns: Bool? = nil) {
46
self.type = type
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/// Where a native-batch ingestion task reads its input data from.
///
/// Serialized with a `type` discriminator key, matching Druid's polymorphic JSON
/// layout. Only the `druid` input source is currently supported; unknown types
/// fail decoding with a `DecodingError`.
///
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/InputSource.java#L61
public indirect enum InputSource: Codable, Hashable, Equatable {
    case druid(DruidInputSource)

    enum CodingKeys: String, CodingKey {
        case type
    }

    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let type = try values.decode(String.self, forKey: .type)

        switch type {
        case "druid":
            // The payload fields live alongside `type`, so the associated value
            // decodes from the same container rather than a nested one.
            self = try .druid(DruidInputSource(from: decoder))

        default:
            // Decoding failures must surface as DecodingError (not EncodingError)
            // so callers catching decode errors see the correct error family.
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: values,
                debugDescription: "Unknown InputSource type '\(type)'"
            )
        }
    }

    public func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)

        switch self {
        case let .druid(spec):
            try container.encode("druid", forKey: .type)
            // Flatten the associated value into the same top-level object.
            try spec.encode(to: encoder)
        }
    }
}

Sources/DataTransferObjects/DataSchema/IngestionDimensionSpec.swift renamed to Sources/DataTransferObjects/Druid/data/input/impl/DimensionsSpec.swift

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
/// https://druid.apache.org/docs/latest/ingestion/ingestion-spec/#dimensionsspec
2-
public struct IngestionDimensionSpec: Codable, Hashable, Equatable {
2+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java
3+
public struct DimensionsSpec: Codable, Hashable, Equatable {
34
public init(
4-
dimensions: [IngestionDimensionSpecDimension],
5+
dimensions: [IngestionDimensionSpecDimension]? = nil,
56
dimensionExclusions: [String]? = nil,
67
spatialDimensions: [IngestionDimensionSpecSpatialDimension]? = nil,
78
includeAllDimensions: Bool? = nil,
@@ -23,7 +24,7 @@ public struct IngestionDimensionSpec: Codable, Hashable, Equatable {
2324
///
2425
/// As a best practice, put the most frequently filtered dimensions at the beginning of the dimensions list. In this case, it
2526
/// would also be good to consider partitioning by those same dimensions.
26-
public let dimensions: [IngestionDimensionSpecDimension]
27+
public let dimensions: [IngestionDimensionSpecDimension]?
2728

2829
/// The names of dimensions to exclude from ingestion. Only names are supported here, not objects.
2930
///

Sources/DataTransferObjects/DataSchema/TimestampSpec.swift renamed to Sources/DataTransferObjects/Druid/data/input/impl/TimestampSpec.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
/// You can also set the expression name to __time to replace the value of the timestamp.
88
///
99
/// Treat __time as a millisecond timestamp: the number of milliseconds since Jan 1, 1970 at midnight UTC.
10+
///
11+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/impl/TimestampSpec.java
1012
public struct TimestampSpec: Codable, Hashable, Equatable {
1113
public init(
1214
column: String? = nil,

Sources/DataTransferObjects/DataSchema/GranularitySpec.swift renamed to Sources/DataTransferObjects/Druid/indexer/granularity/GranularitySpec.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/// https://druid.apache.org/docs/latest/ingestion/ingestion-spec/#granularityspec
2+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/indexer/granularity/GranularitySpec.java
23
public struct GranularitySpec: Codable, Hashable, Equatable {
34
public init(
45
type: GranularitySpec.GranularitySpecType? = nil,
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/// IO configuration for a native-batch parallel indexing task: where the input
/// comes from, how it is parsed, and how the resulting segments relate to any
/// existing segments.
///
/// https://druid.apache.org/docs/latest/ingestion/native-batch#ioconfig
/// https://github.com/apache/druid/blob/master/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexIOConfig.java
public struct ParallelIndexIOConfig: Codable, Hashable, Equatable {
    /// inputFormat to specify how to parse input data.
    public let inputFormat: InputFormat

    public let inputSource: InputSource?

    /// Creates segments as additional shards of the latest version
    ///
    /// effectively appending to the segment set instead of replacing it. This means that you can append new segments to any
    /// datasource regardless of its original partitioning scheme. You must use the dynamic partitioning type for the appended
    /// segments. If you specify a different partitioning type, the task fails with an error.
    public let appendToExisting: Bool?

    /// If true and appendToExisting is false and the granularitySpec contains an interval, then the ingestion task replaces
    /// all existing segments fully contained by the specified interval when the task publishes new segments. If ingestion
    /// fails, Druid doesn't change any existing segments. In the case of misconfiguration where either appendToExisting is
    /// true or interval isn't specified in granularitySpec, Druid doesn't replace any segments even if dropExisting is true.
    ///
    /// WARNING: this feature is still experimental.
    public let dropExisting: Bool?

    public init(inputFormat: InputFormat, inputSource: InputSource? = nil, appendToExisting: Bool? = nil, dropExisting: Bool? = nil) {
        self.inputFormat = inputFormat
        self.inputSource = inputSource
        self.appendToExisting = appendToExisting
        self.dropExisting = dropExisting
    }
}

Sources/DataTransferObjects/Supervisor/SupervisorSpec.swift renamed to Sources/DataTransferObjects/Druid/indexing/common/task/batch/parallel/ParallelIndexIngestionSpec.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/// The container object for the supervisor configuration.
2-
public struct SupervisorSpec: Codable, Hashable, Equatable {
2+
public struct ParallelIndexIngestionSpec: Codable, Hashable, Equatable {
33
public init(ioConfig: IoConfig? = nil, tuningConfig: TuningConfig? = nil, dataSchema: DataSchema? = nil) {
44
self.ioConfig = ioConfig
55
self.tuningConfig = tuningConfig
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/// The Druid input source is to support reading data directly from existing Druid segments, potentially using a new schema
/// and changing the name, dimensions, metrics, rollup, etc. of the segment. The Druid input source is splittable and can be
/// used by the parallel task. This input source has a fixed input format for reading from Druid segments; no inputFormat
/// field needs to be specified in the ingestion spec when using this input source.
///
/// https://github.com/apache/druid/blob/master/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java
public struct DruidInputSource: Codable, Hashable, Equatable {
    /// A String defining the Druid datasource to fetch rows from
    public let dataSource: String

    /// A String representing an ISO-8601 interval, which defines the time range to fetch the data over.
    public let interval: QueryTimeInterval

    /// Only rows that match the filter, if specified, will be returned.
    public let filter: Filter?

    public init(dataSource: String, interval: QueryTimeInterval, filter: Filter? = nil) {
        self.dataSource = dataSource
        self.interval = interval
        self.filter = filter
    }
}

Sources/DataTransferObjects/Supervisor/ioConfig/KinesisIOConfig.swift renamed to Sources/DataTransferObjects/Druid/indexing/kinesis/KinesisIndexTaskIOConfig.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/// https://druid.apache.org/docs/latest/ingestion/supervisor/#io-configuration
22
/// https://druid.apache.org/docs/latest/ingestion/kinesis-ingestion#io-configuration
3-
public struct KinesisIOConfig: Codable, Hashable, Equatable {
3+
/// https://github.com/apache/druid/blob/master/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskIOConfig.java
4+
public struct KinesisIndexTaskIOConfig: Codable, Hashable, Equatable {
45
public init(
56
stream: String,
67
inputFormat: InputFormat,
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/// The parallel task type index_parallel is a task for multi-threaded batch indexing. Parallel task indexing only relies on Druid
/// resources. It doesn't depend on other external systems like Hadoop.
///
/// The index_parallel task is a supervisor task that orchestrates the whole indexing process. The supervisor task splits the input
/// data and creates worker tasks to process the individual portions of data.
///
/// Druid issues the worker tasks to the Overlord. The Overlord schedules and runs the workers on Middle Managers or Indexers. After a
/// worker task successfully processes the assigned input portion, it reports the resulting segment list to the Supervisor task.
///
/// The Supervisor task periodically checks the status of worker tasks. If a task fails, the Supervisor retries the task until the number
/// of retries reaches the configured limit. If all worker tasks succeed, it publishes the reported segments at once and finalizes
/// ingestion.
///
/// The detailed behavior of the parallel task is different depending on the partitionsSpec. See partitionsSpec for more details.
///
/// NOTE(review): Druid's task JSON also carries a `"type": "index_parallel"` discriminator;
/// presumably the caller adds it when submitting — confirm before using this for task submission.
///
/// https://druid.apache.org/docs/latest/ingestion/native-batch
public struct IndexParallelTaskSpec: Codable, Hashable, Equatable {
    // A public init is required: the synthesized memberwise init is internal,
    // which would make this DTO unconstructible by package consumers. Matches
    // the explicit public inits of the sibling DTOs in this module.
    public init(id: String? = nil, spec: ParallelIndexIngestionSpec) {
        self.id = id
        self.spec = spec
    }

    /// The task ID. If omitted, Druid generates the task ID using the task type, data source name, interval, and date-time stamp.
    public let id: String?

    /// The ingestion spec that defines the data schema, IO config, and tuning config.
    public let spec: ParallelIndexIngestionSpec
}

0 commit comments

Comments
 (0)