Skip to content

Commit fb9e12f

Browse files
committed
[GOBBLIN-ICEBERG] Restore backward compat: CURRENT_DATE produces -00 in legacy hourly mode
1 parent 6904dd0 commit fb9e12f

2 files changed

Lines changed: 81 additions & 29 deletions

File tree

gobblin-data-management/src/main/java/org/apache/gobblin/data/management/copy/iceberg/IcebergSource.java

Lines changed: 50 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,11 @@
9696
* # --- Recommended: configurable partition value format ---
9797
* # iceberg.partition.value.datetime.format is a DateTimeFormatter pattern applied to the output
9898
* # partition value used in the filter expression.
99-
* # When CURRENT_DATE is used, the reference datetime is LocalDateTime.now(), so a pattern
100-
* # with HH will embed the current hour automatically — no separate hour config needed.
10199
* # When set, it supersedes iceberg.hourly.partition.enabled.
100+
* #
101+
* # CURRENT_DATE behaviour:
102+
* # - With this property set → LocalDateTime.now(), so HH embeds the live clock-hour.
103+
 * #   - Without this property (legacy) → LocalDate.now() at midnight, so the hour suffix is always -00 (backward compat).
102104
*
103105
* # Standard hourly partitions (yyyy-MM-dd-HH) — CURRENT_DATE picks up live hour
104106
* iceberg.partition.value.datetime.format=yyyy-MM-dd-HH # → "2025-04-01-14" (current hour)
@@ -152,10 +154,18 @@ public class IcebergSource extends FileBasedSource<String, FileAwareInputStream>
152154
/**
153155
* Optional {@link DateTimeFormatter} pattern controlling how the partition value is rendered.
154156
*
155-
* <p>When {@code iceberg.filter.date=CURRENT_DATE} the reference datetime is
156-
* {@link java.time.LocalDateTime#now()}, so a pattern that includes {@code HH} will embed
157-
* the current clock-hour automatically — no separate hour config is needed.
158-
* For a specific date (e.g. {@code 2025-04-03}), the time defaults to midnight (00:00).
157+
* <p><b>CURRENT_DATE behaviour differs between the two paths:</b>
158+
* <ul>
159+
* <li>When this property <em>is</em> set, {@code CURRENT_DATE} resolves to
160+
* {@link java.time.LocalDateTime#now()}, so a pattern that includes {@code HH} embeds the
161+
* live clock-hour automatically — useful for truly hourly-partitioned tables.</li>
162+
* <li>When this property is <em>absent</em> (legacy path), {@code CURRENT_DATE} resolves to
163+
 *       {@link java.time.LocalDate#now()} at midnight (00:00), preserving the legacy behaviour
164+
* where the hour suffix was always {@code -00}. This is the right choice for tables whose
165+
* partitions are daily but formatted as {@code yyyy-MM-dd-00}.</li>
166+
* </ul>
167+
* For a static date value (e.g. {@code 2025-04-03}), the time always defaults to midnight (00:00)
168+
* regardless of which path is used.
159169
*
160170
* <p>Examples:
161171
* <ul>
@@ -284,28 +294,28 @@ public Extractor<String, FileAwareInputStream> getExtractor(WorkUnitState state)
284294
* (defaults to {@value #DEFAULT_DATE_PARTITION_COLUMN}). The date value is specified separately via
285295
* {@code iceberg.filter.date} in standard format ({@code yyyy-MM-dd}).
286296
*
287-
* <p><b>Partition Value Format:</b> Both the input date ({@code iceberg.filter.date}) and the output
288-
* partition value use the pattern specified by {@code iceberg.partition.value.datetime.format}
289-
* (a standard {@link java.time.format.DateTimeFormatter} pattern). Use {@code CURRENT_DATE} as the
290-
* date value to resolve the reference datetime to {@link java.time.LocalDateTime#now()} automatically,
291-
* embedding the current hour when the pattern includes {@code HH}. Examples:
297+
* <p><b>Partition Value Format:</b> The output partition value format is controlled by
298+
* {@code iceberg.partition.value.datetime.format} (a standard {@link java.time.format.DateTimeFormatter}
299+
* pattern). When absent, the legacy {@code iceberg.hourly.partition.enabled} flag drives the format.
300+
*
301+
* <p><b>{@code CURRENT_DATE} resolution:</b>
292302
* <ul>
293-
* <li>{@code yyyy-MM-dd-HH} with date {@code 2025-04-01-05} → {@code 2025-04-01-05}</li>
294-
* <li>{@code dd-MM-yyyy-HH} with date {@code 01-04-2025-00} → {@code 01-04-2025-00}</li>
295-
* <li>{@code yyyyMMdd} with date {@code 20250401} → {@code 20250401} (compact daily)</li>
303+
* <li>With {@code iceberg.partition.value.datetime.format} set → {@link java.time.LocalDateTime#now()},
304+
* so a pattern including {@code HH} embeds the live clock-hour (e.g. {@code 2025-04-08-14}).</li>
305+
* <li>Without that property (legacy) → {@link java.time.LocalDate#now()} at midnight, so the
306+
 *       hour is always {@code 00} (e.g. {@code 2025-04-08-00}). This preserves the legacy
307+
* behaviour for tables that store daily data in {@code yyyy-MM-dd-00} partitions.</li>
296308
* </ul>
297-
* When {@code iceberg.partition.value.datetime.format} is set it supersedes
298-
* {@code iceberg.hourly.partition.enabled}. When absent, the legacy
299-
* {@code iceberg.hourly.partition.enabled} behaviour is preserved for backward compatibility.
309+
* Static date values always default to midnight regardless of which path is used.
300310
*
301311
* <p><b>Configuration Examples:</b>
302312
* <ul>
303313
* <li>Standard daily: {@code iceberg.partition.value.datetime.format=yyyy-MM-dd, iceberg.filter.date=2025-04-03,
304314
* iceberg.lookback.days=3} → partitions: {@code 2025-04-03, 2025-04-02, 2025-04-01}</li>
305-
* <li>Reversed-date hourly: {@code iceberg.partition.value.datetime.format=dd-MM-yyyy-HH,
306-
* iceberg.filter.date=CURRENT_DATE} → {@code 03-04-2025-14, 02-04-2025-14, 01-04-2025-14}</li>
307-
* <li>Dynamic daily: {@code iceberg.filter.date=CURRENT_DATE, iceberg.lookback.days=1}
308-
* → today's partition only (resolved at runtime)</li>
315+
* <li>Truly-hourly (live hour): {@code iceberg.partition.value.datetime.format=yyyy-MM-dd-HH,
316+
 *       iceberg.filter.date=CURRENT_DATE, iceberg.lookback.days=3} → {@code 2025-04-08-14, 2025-04-07-14, 2025-04-06-14}</li>
317+
* <li>Daily-at-midnight (legacy default): {@code iceberg.filter.date=CURRENT_DATE, iceberg.lookback.days=1}
318+
* → {@code 2025-04-08-00} (hour always 00, backward compat)</li>
309319
* </ul>
310320
*
311321
* @param state source state containing filter configuration
@@ -337,13 +347,27 @@ private List<IcebergTable.FilePathWithPartition> discoverPartitionFilePaths(Sour
337347
DateTimeFormatter partitionFormatter = resolvePartitionFormatter(state);
338348

339349
// Resolve the reference datetime for the filter.
340-
// CURRENT_DATE uses LocalDateTime.now() so a formatter pattern that includes HH will
341-
// embed the current clock-hour automatically. For a specific date (yyyy-MM-dd) the time
342-
// defaults to midnight (00:00).
350+
// For a specific date (yyyy-MM-dd) the time always defaults to midnight (00:00).
351+
// For CURRENT_DATE:
352+
// - Custom format path (iceberg.partition.value.datetime.format set): LocalDateTime.now() so
353+
// a pattern that includes HH will embed the live clock-hour automatically.
354+
// - Legacy path (no custom format): LocalDate.now().atStartOfDay() (midnight) to preserve the
355+
    //     legacy behavior where CURRENT_DATE always produced a -00 suffix. Users who genuinely need
356+
// the live hour should migrate to iceberg.partition.value.datetime.format=yyyy-MM-dd-HH.
343357
LocalDateTime startDateTime;
344358
if (CURRENT_DATE_PLACEHOLDER.equalsIgnoreCase(dateValue)) {
345-
startDateTime = LocalDateTime.now();
346-
log.info("Resolved {} placeholder to current datetime: {}", CURRENT_DATE_PLACEHOLDER, startDateTime);
359+
boolean isCustomFormat = state.contains(ICEBERG_PARTITION_VALUE_DATETIME_FORMAT);
360+
if (isCustomFormat) {
361+
startDateTime = LocalDateTime.now();
362+
log.info("Resolved {} to current datetime with live hour (custom format='{}'): {}",
363+
CURRENT_DATE_PLACEHOLDER, state.getProp(ICEBERG_PARTITION_VALUE_DATETIME_FORMAT), startDateTime);
364+
} else {
365+
// Legacy backward-compat: always midnight so the yyyy-MM-dd-HH pattern keeps the old -00 suffix.
366+
startDateTime = LocalDate.now().atStartOfDay();
367+
log.info("Resolved {} to current date at midnight (legacy mode, -00 preserved): {}. "
368+
+ "Set {} to use the live hour.",
369+
CURRENT_DATE_PLACEHOLDER, startDateTime, ICEBERG_PARTITION_VALUE_DATETIME_FORMAT);
370+
}
347371
} else {
348372
// When iceberg.partition.value.datetime.format is explicitly set, the input date must match
349373
// that pattern (consistent input/output format). Legacy path keeps accepting yyyy-MM-dd for

gobblin-data-management/src/test/java/org/apache/gobblin/data/management/copy/iceberg/IcebergSourceTest.java

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,9 @@ public void testLookbackPeriodLogic() throws Exception {
334334

335335
@Test
336336
public void testCurrentDatePlaceholder() throws Exception {
337-
// CURRENT_DATE resolves to LocalDateTime.now() so the current hour is embedded automatically.
338-
// The default legacy format (hourly.partition.enabled=true) produces yyyy-MM-dd-HH.
337+
// Legacy path (no iceberg.partition.value.datetime.format set):
338+
// CURRENT_DATE resolves to LocalDate.now() at midnight, so the default yyyy-MM-dd-HH
339+
// pattern always produces a -00 suffix — preserving pre-PR backward compat.
339340
properties.setProperty(IcebergSource.ICEBERG_FILTER_ENABLED, "true");
340341
properties.setProperty(IcebergSource.ICEBERG_FILTER_DATE, "CURRENT_DATE");
341342
properties.setProperty(IcebergSource.ICEBERG_LOOKBACK_DAYS, "1");
@@ -351,11 +352,38 @@ public void testCurrentDatePlaceholder() throws Exception {
351352
m.setAccessible(true);
352353
m.invoke(icebergSource, sourceState, mockTable);
353354

355+
String partitionValues = sourceState.getProp(IcebergSource.ICEBERG_PARTITION_VALUES);
356+
Assert.assertNotNull(partitionValues, "Partition values should be set");
357+
String expectedToday = java.time.LocalDate.now().toString() + "-00";
358+
Assert.assertEquals(partitionValues, expectedToday,
359+
"Legacy CURRENT_DATE should produce today's date with -00 suffix (backward compat)");
360+
}
361+
362+
@Test
363+
public void testCurrentDatePlaceholderWithCustomFormat() throws Exception {
364+
// New path (iceberg.partition.value.datetime.format set to yyyy-MM-dd-HH):
365+
// CURRENT_DATE resolves to LocalDateTime.now() so the live clock-hour is embedded.
366+
properties.setProperty(IcebergSource.ICEBERG_FILTER_ENABLED, "true");
367+
properties.setProperty(IcebergSource.ICEBERG_FILTER_DATE, "CURRENT_DATE");
368+
properties.setProperty(IcebergSource.ICEBERG_PARTITION_VALUE_DATETIME_FORMAT, "yyyy-MM-dd-HH");
369+
properties.setProperty(IcebergSource.ICEBERG_LOOKBACK_DAYS, "1");
370+
sourceState = new SourceState(new State(properties));
371+
372+
TableIdentifier tableId = TableIdentifier.of("test_db", "test_table");
373+
when(mockTable.getTableId()).thenReturn(tableId);
374+
when(mockTable.getFilePathsWithPartitionsForFilter(any(Expression.class)))
375+
.thenReturn(new java.util.ArrayList<>());
376+
377+
Method m = IcebergSource.class.getDeclaredMethod("discoverPartitionFilePaths",
378+
SourceState.class, IcebergTable.class);
379+
m.setAccessible(true);
380+
m.invoke(icebergSource, sourceState, mockTable);
381+
354382
// Assert format rather than exact value to avoid clock-dependent flakiness
355383
String partitionValues = sourceState.getProp(IcebergSource.ICEBERG_PARTITION_VALUES);
356384
Assert.assertNotNull(partitionValues, "Partition values should be set");
357385
Assert.assertTrue(partitionValues.matches("\\d{4}-\\d{2}-\\d{2}-\\d{2}"),
358-
"Should resolve to yyyy-MM-dd-HH format, got: " + partitionValues);
386+
"Custom format CURRENT_DATE should produce yyyy-MM-dd-HH with live hour, got: " + partitionValues);
359387
}
360388

361389
@Test

0 commit comments

Comments
 (0)