Skip to content

Commit b67d6ac

Browse files
committed
Merge branch 'upstream-candidate-1.10.0' into upstream-master
2 parents 92851ff + a110785 commit b67d6ac

5 files changed

Lines changed: 48 additions & 26 deletions

File tree

Bundle.ecl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ EXPORT Bundle := MODULE(Std.BundleBase)
66
EXPORT License := 'http://www.apache.org/licenses/LICENSE-2.0';
77
EXPORT Copyright := 'Copyright (C) 2024 HPCC Systems';
88
EXPORT DependsOn := [];
9-
EXPORT Version := '1.9.5';
9+
EXPORT Version := '1.10.0';
1010
END;

Profile.ecl

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ EXPORT Profile(inFile,
234234
LOCAL %AttributeType_t% := STRING36;
235235
#UNIQUENAME(NumericStat_t);
236236
LOCAL %NumericStat_t% := DECIMAL32_4;
237+
#UNIQUENAME(RecCount_t);
238+
LOCAL %RecCount_t% := UNSIGNED6;
237239

238240
// Tests for enabled features
239241
#UNIQUENAME(FeatureEnabledFillRate);
@@ -391,12 +393,12 @@ EXPORT Profile(inFile,
391393
// Define the record layout that will be used by the inner _Inner_Profile() call
392394
LOCAL ModeRec := RECORD
393395
UTF8 value;
394-
UNSIGNED4 rec_count;
396+
%RecCount_t% rec_count;
395397
END;
396398

397399
LOCAL PatternCountRec := RECORD
398400
STRING data_pattern;
399-
UNSIGNED4 rec_count;
401+
%RecCount_t% rec_count;
400402
UTF8 example;
401403
END;
402404

@@ -408,11 +410,11 @@ EXPORT Profile(inFile,
408410
LOCAL OutputLayout := RECORD
409411
STRING sortValue;
410412
STRING attribute;
411-
UNSIGNED4 rec_count;
413+
%RecCount_t% rec_count;
412414
STRING given_attribute_type;
413415
DECIMAL9_6 fill_rate;
414-
UNSIGNED4 fill_count;
415-
UNSIGNED4 cardinality;
416+
%RecCount_t% fill_count;
417+
%RecCount_t% cardinality;
416418
DATASET(ModeRec) cardinality_breakdown {MAXCOUNT(%lowCardinalityThreshold%)};
417419
STRING best_attribute_type;
418420
DATASET(ModeRec) modes {MAXCOUNT(%MAX_MODES%)};
@@ -442,13 +444,13 @@ EXPORT Profile(inFile,
442444
#IF(%FeatureEnabledBestECLTypes%())
443445
STRING best_attribute_type;
444446
#END
445-
UNSIGNED4 rec_count;
447+
%RecCount_t% rec_count;
446448
#IF(%FeatureEnabledFillRate%())
447-
UNSIGNED4 fill_count;
449+
%RecCount_t% fill_count;
448450
DECIMAL9_6 fill_rate;
449451
#END
450452
#IF(%FeatureEnabledCardinality%())
451-
UNSIGNED4 cardinality;
453+
%RecCount_t% cardinality;
452454
#END
453455
#IF(%FeatureEnabledLowCardinalityBreakdown%())
454456
DATASET(ModeRec) cardinality_breakdown;
@@ -623,11 +625,24 @@ EXPORT Profile(inFile,
623625
#UNIQUENAME(_MapAllUni);
624626
LOCAL %_MapAllUni%(UNICODE s) := (STRING)%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
625627

628+
// Pattern mapping a UTF8 datatype; using regex due to the complexity
629+
// of the character set
630+
#UNIQUENAME(_MapUpperCharUTF8);
631+
LOCAL %_MapUpperCharUTF8%(UTF8 s) := REGEXREPLACE(u8'\\p{Lu}', s, u8'A');
632+
#UNIQUENAME(_MapLowerCharUTF8);
633+
LOCAL %_MapLowerCharUTF8%(UTF8 s) := REGEXREPLACE(u8'[\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}]', s, u8'a');
634+
#UNIQUENAME(_MapDigitUTF8);
635+
LOCAL %_MapDigitUTF8%(UTF8 s) := REGEXREPLACE(u8'[1-9]', s, u8'9'); // Leave '0' as-is and replace with '9' later
636+
#UNIQUENAME(_MapAllUTF8);
637+
LOCAL %_MapAllUTF8%(UTF8 s) := (STRING)%_MapDigitUTF8%(%_MapLowerCharUTF8%(%_MapUpperCharUTF8%(s)));
638+
626639
// Trimming strings
627640
#UNIQUENAME(_TrimmedStr);
628641
LOCAL %_TrimmedStr%(STRING s) := TRIM(s, LEFT, RIGHT);
629642
#UNIQUENAME(_TrimmedUni);
630643
LOCAL %_TrimmedUni%(UNICODE s) := TRIM(s, LEFT, RIGHT);
644+
#UNIQUENAME(_TrimmedUTF8);
645+
LOCAL %_TrimmedUTF8%(UTF8 s) := TRIM(s, LEFT, RIGHT);
631646

632647
// Collect a list of the top-level attributes that we can process,
633648
// determine the actual maximum length of a data pattern (if we can
@@ -711,7 +726,7 @@ EXPORT Profile(inFile,
711726
%Attribute_t% attribute;
712727
%AttributeType_t% given_attribute_type;
713728
%StringValue_t% string_value;
714-
UNSIGNED4 value_count;
729+
%RecCount_t% value_count;
715730
%DataPattern_t% data_pattern;
716731
UNSIGNED4 data_length;
717732
BOOLEAN is_filled;
@@ -765,18 +780,24 @@ EXPORT Profile(inFile,
765780
#ELSE
766781
%_TrimmedUni%((%StringValue_t%)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
767782
#END,
768-
UNSIGNED4 value_count := COUNT(GROUP),
783+
%RecCount_t% value_count := COUNT(GROUP),
769784
%DataPattern_t% data_pattern :=
770785
#IF(%_IsSetType%(%'@type'%))
771786
%_MapAllStr%(%_TrimmedStr%(Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', '))[..%foundMaxPatternLen%])
772787
#ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
773788
%_MapAllStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
774-
#ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
789+
#ELSEIF(REGEXFIND('unicode', %'@type'%))
775790
#IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
776791
%_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
777792
#ELSE
778793
%_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
779794
#END
795+
#ELSEIF(REGEXFIND('utf', %'@type'%))
// Keep the value as UTF8 end-to-end: the UTF8 trim/map helpers take UTF8 arguments, so casting to UNICODE here would only force a needless UTF8 -> UNICODE -> UTF8 round trip
796+
#IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
797+
%_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
798+
#ELSE
799+
%_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
800+
#END
780801
#ELSEIF(REGEXFIND('string', %'@type'%))
781802
#IF(%@size% < 0 OR %@size% > %foundMaxPatternLen%)
782803
%_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
@@ -1117,7 +1138,7 @@ EXPORT Profile(inFile,
11171138
%filledDataInfoNumeric%(attribute = %'namePrefix'% + %'@name'%),
11181139
{
11191140
string_value,
1120-
UNSIGNED4 rec_count := SUM(GROUP, value_count)
1141+
%RecCount_t% rec_count := SUM(GROUP, value_count)
11211142
},
11221143
string_value,
11231144
MERGE
@@ -1287,8 +1308,8 @@ EXPORT Profile(inFile,
12871308
{
12881309
attribute,
12891310
data_pattern,
1290-
UTF8 example := string_value[..%foundMaxPatternLen%],
1291-
UNSIGNED4 rec_count := SUM(GROUP, value_count)
1311+
UTF8 example := string_value[..%foundMaxPatternLen%],
1312+
%RecCount_t% rec_count := SUM(GROUP, value_count)
12921313
},
12931314
attribute, data_pattern,
12941315
MERGE
@@ -1333,8 +1354,8 @@ EXPORT Profile(inFile,
13331354
{
13341355
attribute,
13351356
given_attribute_type,
1336-
UNSIGNED4 rec_count := SUM(GROUP, value_count),
1337-
UNSIGNED4 filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
1357+
%RecCount_t% rec_count := SUM(GROUP, value_count),
1358+
%RecCount_t% filled_count := SUM(GROUP, IF(is_filled, value_count, 0))
13381359
},
13391360
attribute, given_attribute_type,
13401361
MERGE
@@ -1430,7 +1451,7 @@ EXPORT Profile(inFile,
14301451
{
14311452
%Attribute_t% attribute,
14321453
BOOLEAN is_numeric,
1433-
UNSIGNED4 cardinality,
1454+
%RecCount_t% cardinality,
14341455
REAL numeric_min,
14351456
REAL numeric_max,
14361457
REAL numeric_mean,

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ level, such as within your "My Files" folder.
104104
|1.9.3|Better identify upper- and lower-case Unicode characters in text patterns; scan Unicode and UTF-8 strings to see if they can be represented with a STRING data type instead|
105105
|1.9.4|README fixes and updates; improve UTF-8 detection and prevent buffer overruns during character scans; use short form of Unicode property names in regex|
106106
|1.9.5|Correct Unicode regex regression introduced in 1.9.4|
107+
|1.10.0|Security-related dependency updates for the visualization report; expand "record count" fields from UNSIGNED4 to UNSIGNED6 -- thanks to Manjunath Venkataswamy for requesting this improvement; add UTF8-specific TRIM and regex calls to avoid casting if possible|
107108
</details>
108109

109110
---

report/package-lock.json

Lines changed: 7 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

report/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
"devDependencies": {
3333
"npm-run-all": "^4.1.5",
3434
"rimraf": "^3.0.2",
35-
"rollup": "^2.41.4",
35+
"rollup": "^3.29.5",
3636
"@rollup/plugin-node-resolve": "^11.2.0",
3737
"terser": "^5.6.0",
3838
"tslib": "^2.1.0",

0 commit comments

Comments
 (0)