Skip to content

Commit 9cafa2c

Browse files
authored
Create IndexParallelTaskSpec (#58)
* Create IndexParallelTaskSpec; also clean up the folder structure and sync it with the Druid project structure
* Clean up DimensionsSpec
* KinesisIndexTaskIOConfig
* DruidInputSource
* DimensionsSpec
* Update tests
* Fix test
* Fix tests for real
1 parent 14a5854 commit 9cafa2c

19 files changed

Lines changed: 311 additions & 32 deletions

File tree

Sources/DataTransferObjects/Supervisor/InputFormat.swift renamed to Sources/DataTransferObjects/Druid/data/input/InputFormat.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
/// https://druid.apache.org/docs/latest/ingestion/data-formats/#input-format
2+
///
3+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/InputFormat.java
24
public struct InputFormat: Codable, Hashable, Equatable {
35
public init(type: InputFormat.InputFormatType, keepNullColumns: Bool? = nil) {
46
self.type = type
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/// Where a native-batch ingestion task reads its input data from.
///
/// Serialized with a `type` discriminator key, matching Druid's polymorphic JSON
/// layout. Only the `druid` input source is currently supported; unknown types
/// fail decoding with a `DecodingError`.
///
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/InputSource.java#L61
public indirect enum InputSource: Codable, Hashable, Equatable {
    case druid(DruidInputSource)

    enum CodingKeys: String, CodingKey {
        case type
    }

    public init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        let type = try values.decode(String.self, forKey: .type)

        switch type {
        case "druid":
            // The payload fields live alongside `type`, so the associated value
            // decodes from the same container rather than a nested one.
            self = try .druid(DruidInputSource(from: decoder))

        default:
            // Decoding failures must surface as DecodingError (not EncodingError)
            // so callers catching decode errors see the correct error family.
            throw DecodingError.dataCorruptedError(
                forKey: .type,
                in: values,
                debugDescription: "Unknown InputSource type '\(type)'"
            )
        }
    }

    public func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)

        switch self {
        case let .druid(spec):
            try container.encode("druid", forKey: .type)
            // Flatten the associated value into the same top-level object.
            try spec.encode(to: encoder)
        }
    }
}

Sources/DataTransferObjects/DataSchema/IngestionDimensionSpec.swift renamed to Sources/DataTransferObjects/Druid/data/input/impl/DimensionsSpec.swift

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
/// https://druid.apache.org/docs/latest/ingestion/ingestion-spec/#dimensionsspec
2-
public struct IngestionDimensionSpec: Codable, Hashable, Equatable {
2+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/impl/DimensionsSpec.java
3+
public struct DimensionsSpec: Codable, Hashable, Equatable {
34
public init(
4-
dimensions: [IngestionDimensionSpecDimension],
5+
dimensions: [IngestionDimensionSpecDimension]? = nil,
56
dimensionExclusions: [String]? = nil,
67
spatialDimensions: [IngestionDimensionSpecSpatialDimension]? = nil,
78
includeAllDimensions: Bool? = nil,
@@ -23,7 +24,7 @@ public struct IngestionDimensionSpec: Codable, Hashable, Equatable {
2324
///
2425
/// As a best practice, put the most frequently filtered dimensions at the beginning of the dimensions list. In this case, it
2526
/// would also be good to consider partitioning by those same dimensions.
26-
public let dimensions: [IngestionDimensionSpecDimension]
27+
public let dimensions: [IngestionDimensionSpecDimension]?
2728

2829
/// The names of dimensions to exclude from ingestion. Only names are supported here, not objects.
2930
///

Sources/DataTransferObjects/DataSchema/TimestampSpec.swift renamed to Sources/DataTransferObjects/Druid/data/input/impl/TimestampSpec.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
/// You can also set the expression name to __time to replace the value of the timestamp.
88
///
99
/// Treat __time as a millisecond timestamp: the number of milliseconds since Jan 1, 1970 at midnight UTC.
10+
///
11+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/data/input/impl/TimestampSpec.java
1012
public struct TimestampSpec: Codable, Hashable, Equatable {
1113
public init(
1214
column: String? = nil,

Sources/DataTransferObjects/DataSchema/GranularitySpec.swift renamed to Sources/DataTransferObjects/Druid/indexer/granularity/GranularitySpec.swift

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/// https://druid.apache.org/docs/latest/ingestion/ingestion-spec/#granularityspec
2+
/// https://github.com/apache/druid/blob/master/processing/src/main/java/org/apache/druid/indexer/granularity/GranularitySpec.java
23
public struct GranularitySpec: Codable, Hashable, Equatable {
34
public init(
45
type: GranularitySpec.GranularitySpecType? = nil,
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/// IO configuration for a native-batch parallel indexing task: where the input
/// comes from, how it is parsed, and how the resulting segments relate to any
/// existing segments.
///
/// https://druid.apache.org/docs/latest/ingestion/native-batch#ioconfig
/// https://github.com/apache/druid/blob/master/indexing-service/src/main/java/org/apache/druid/indexing/common/task/batch/parallel/ParallelIndexIOConfig.java
public struct ParallelIndexIOConfig: Codable, Hashable, Equatable {
    /// inputFormat to specify how to parse input data.
    public let inputFormat: InputFormat

    public let inputSource: InputSource?

    /// Creates segments as additional shards of the latest version
    ///
    /// effectively appending to the segment set instead of replacing it. This means that you can append new segments to any
    /// datasource regardless of its original partitioning scheme. You must use the dynamic partitioning type for the appended
    /// segments. If you specify a different partitioning type, the task fails with an error.
    public let appendToExisting: Bool?

    /// If true and appendToExisting is false and the granularitySpec contains an interval, then the ingestion task replaces
    /// all existing segments fully contained by the specified interval when the task publishes new segments. If ingestion
    /// fails, Druid doesn't change any existing segments. In the case of misconfiguration where either appendToExisting is
    /// true or interval isn't specified in granularitySpec, Druid doesn't replace any segments even if dropExisting is true.
    ///
    /// WARNING: this feature is still experimental.
    public let dropExisting: Bool?

    public init(inputFormat: InputFormat, inputSource: InputSource? = nil, appendToExisting: Bool? = nil, dropExisting: Bool? = nil) {
        self.inputFormat = inputFormat
        self.inputSource = inputSource
        self.appendToExisting = appendToExisting
        self.dropExisting = dropExisting
    }
}

Sources/DataTransferObjects/Supervisor/SupervisorSpec.swift renamed to Sources/DataTransferObjects/Druid/indexing/common/task/batch/parallel/ParallelIndexIngestionSpec.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/// The container object for the supervisor configuration.
2-
public struct SupervisorSpec: Codable, Hashable, Equatable {
2+
public struct ParallelIndexIngestionSpec: Codable, Hashable, Equatable {
33
public init(ioConfig: IoConfig? = nil, tuningConfig: TuningConfig? = nil, dataSchema: DataSchema? = nil) {
44
self.ioConfig = ioConfig
55
self.tuningConfig = tuningConfig
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
/// The Druid input source is to support reading data directly from existing Druid segments, potentially using a new schema
/// and changing the name, dimensions, metrics, rollup, etc. of the segment. The Druid input source is splittable and can be
/// used by the parallel task. This input source has a fixed input format for reading from Druid segments; no inputFormat
/// field needs to be specified in the ingestion spec when using this input source.
///
/// https://github.com/apache/druid/blob/master/indexing-service/src/main/java/org/apache/druid/indexing/input/DruidInputSource.java
public struct DruidInputSource: Codable, Hashable, Equatable {
    /// A String defining the Druid datasource to fetch rows from
    public let dataSource: String

    /// A String representing an ISO-8601 interval, which defines the time range to fetch the data over.
    public let interval: QueryTimeInterval

    /// Only rows that match the filter, if specified, will be returned.
    public let filter: Filter?

    public init(dataSource: String, interval: QueryTimeInterval, filter: Filter? = nil) {
        self.dataSource = dataSource
        self.interval = interval
        self.filter = filter
    }
}

Sources/DataTransferObjects/Supervisor/ioConfig/KinesisIOConfig.swift renamed to Sources/DataTransferObjects/Druid/indexing/kinesis/KinesisIndexTaskIOConfig.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/// https://druid.apache.org/docs/latest/ingestion/supervisor/#io-configuration
22
/// https://druid.apache.org/docs/latest/ingestion/kinesis-ingestion#io-configuration
3-
public struct KinesisIOConfig: Codable, Hashable, Equatable {
3+
/// https://github.com/apache/druid/blob/master/extensions-core/kinesis-indexing-service/src/main/java/org/apache/druid/indexing/kinesis/KinesisIndexTaskIOConfig.java
4+
public struct KinesisIndexTaskIOConfig: Codable, Hashable, Equatable {
45
public init(
56
stream: String,
67
inputFormat: InputFormat,
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/// The parallel task type index_parallel is a task for multi-threaded batch indexing. Parallel task indexing only relies on Druid
/// resources. It doesn't depend on other external systems like Hadoop.
///
/// The index_parallel task is a supervisor task that orchestrates the whole indexing process. The supervisor task splits the input
/// data and creates worker tasks to process the individual portions of data.
///
/// Druid issues the worker tasks to the Overlord. The Overlord schedules and runs the workers on Middle Managers or Indexers. After a
/// worker task successfully processes the assigned input portion, it reports the resulting segment list to the Supervisor task.
///
/// The Supervisor task periodically checks the status of worker tasks. If a task fails, the Supervisor retries the task until the number
/// of retries reaches the configured limit. If all worker tasks succeed, it publishes the reported segments at once and finalizes
/// ingestion.
///
/// The detailed behavior of the parallel task is different depending on the partitionsSpec. See partitionsSpec for more details.
///
/// NOTE(review): Druid's task JSON also carries a `"type": "index_parallel"` discriminator;
/// presumably the caller adds it when submitting — confirm before using this for task submission.
///
/// https://druid.apache.org/docs/latest/ingestion/native-batch
public struct IndexParallelTaskSpec: Codable, Hashable, Equatable {
    // A public init is required: the synthesized memberwise init is internal,
    // which would make this DTO unconstructible by package consumers. Matches
    // the explicit public inits of the sibling DTOs in this module.
    public init(id: String? = nil, spec: ParallelIndexIngestionSpec) {
        self.id = id
        self.spec = spec
    }

    /// The task ID. If omitted, Druid generates the task ID using the task type, data source name, interval, and date-time stamp.
    public let id: String?

    /// The ingestion spec that defines the data schema, IO config, and tuning config.
    public let spec: ParallelIndexIngestionSpec
}

0 commit comments

Comments
 (0)