@@ -234,6 +234,8 @@ EXPORT Profile(inFile,
234234 LOCAL %AttributeType_t% := STRING36 ;
// Uniquely-named alias for a fixed-point decimal type; DECIMAL32_4 holds
// 32 digits in total, 4 of them after the decimal point.
// NOTE(review): presumably the type of the numeric statistics fields; its uses are outside this excerpt -- confirm.
235235 #UNIQUENAME (NumericStat_t);
236236 LOCAL %NumericStat_t% := DECIMAL32_4 ;
// Uniquely-named alias for the integer type of the counting fields in this
// macro (rec_count, fill_count, cardinality, value_count). UNSIGNED6 is a
// 6-byte unsigned integer; those fields were declared UNSIGNED4 before.
237+ #UNIQUENAME (RecCount_t);
238+ LOCAL %RecCount_t% := UNSIGNED6 ;
237239
238240 // Tests for enabled features
239241 #UNIQUENAME (FeatureEnabledFillRate);
@@ -391,12 +393,12 @@ EXPORT Profile(inFile,
391393 // Define the record layout that will be used by the inner _Inner_Profile() call
// Row layout of the cardinality_breakdown and modes child datasets that
// OutputLayout declares: one UTF8 value paired with a record count.
392394 LOCAL ModeRec := RECORD
393395 UTF8 value;
394- UNSIGNED4 rec_count;
396+ %RecCount_t% rec_count;
395397 END ;
396398
// Row layout pairing a data pattern with a record count and a UTF8 example.
// NOTE(review): the datasets that use this layout are outside this excerpt;
// presumably the example is one source value that produced the pattern -- confirm.
397399 LOCAL PatternCountRec := RECORD
398400 STRING data_pattern;
399- UNSIGNED4 rec_count;
401+ %RecCount_t% rec_count;
400402 UTF8 example;
401403 END ;
402404
@@ -408,11 +410,11 @@ EXPORT Profile(inFile,
408410 LOCAL OutputLayout := RECORD
409411 STRING sortValue;
410412 STRING attribute;
411- UNSIGNED4 rec_count;
413+ %RecCount_t% rec_count;
412414 STRING given_attribute_type;
413415 DECIMAL9_6 fill_rate;
414- UNSIGNED4 fill_count;
415- UNSIGNED4 cardinality;
416+ %RecCount_t% fill_count;
417+ %RecCount_t% cardinality;
416418 DATASET (ModeRec) cardinality_breakdown {MAXCOUNT(%lowCardinalityThreshold%)};
417419 STRING best_attribute_type;
418420 DATASET (ModeRec) modes {MAXCOUNT(%MAX_MODES%)};
@@ -442,13 +444,13 @@ EXPORT Profile(inFile,
442444 #IF (%FeatureEnabledBestECLTypes%())
443445 STRING best_attribute_type;
444446 #END
445- UNSIGNED4 rec_count;
447+ %RecCount_t% rec_count;
446448 #IF (%FeatureEnabledFillRate%())
447- UNSIGNED4 fill_count;
449+ %RecCount_t% fill_count;
448450 DECIMAL9_6 fill_rate;
449451 #END
450452 #IF (%FeatureEnabledCardinality%())
451- UNSIGNED4 cardinality;
453+ %RecCount_t% cardinality;
452454 #END
453455 #IF (%FeatureEnabledLowCardinalityBreakdown%())
454456 DATASET (ModeRec) cardinality_breakdown;
@@ -623,11 +625,24 @@ EXPORT Profile(inFile,
// Full pattern mapping for a UNICODE value: applies %_MapUpperCharUni%, then
// %_MapLowerCharUni%, then %_MapDigitUni% (all defined outside this excerpt)
// and casts the mapped result to STRING.
623625 #UNIQUENAME (_MapAllUni);
624626 LOCAL %_MapAllUni%(UNICODE s) := (STRING )%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
625627
628+ // Pattern mapping for UTF8 values. Regular expressions with Unicode
629+ // general-category classes are used because of the size of the character set.
// Each mapper takes a UTF8 value and returns it with one class of character
// replaced by a single placeholder character.
//
// The escape must be written '\\p{..}' with nothing between the backslash and
// 'p'; with a space in between, the expression stops being a property class
// and no letter is ever matched. The u8 prefix must also touch the opening
// quote for the literal to be read as a UTF8 constant.
630+ #UNIQUENAME (_MapUpperCharUTF8);
// Upper-case letters (category Lu) become 'A'
631+ LOCAL %_MapUpperCharUTF8%(UTF8 s) := REGEXREPLACE (u8'\\p{Lu}', s, u8'A');
632+ #UNIQUENAME (_MapLowerCharUTF8);
// Lower-case, title-case, modifier and other letters (Ll, Lt, Lm, Lo) become 'a'
633+ LOCAL %_MapLowerCharUTF8%(UTF8 s) := REGEXREPLACE (u8'[\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}]', s, u8'a');
634+ #UNIQUENAME (_MapDigitUTF8);
635+ LOCAL %_MapDigitUTF8%(UTF8 s) := REGEXREPLACE (u8'[1-9]', s, u8'9'); // Leave '0' as-is and replace with '9' later
636+ #UNIQUENAME (_MapAllUTF8);
// Full mapping: upper-case pass, then lower-case pass, then digit pass; the
// mapped value is cast to STRING
637+ LOCAL %_MapAllUTF8%(UTF8 s) := (STRING )%_MapDigitUTF8%(%_MapLowerCharUTF8%(%_MapUpperCharUTF8%(s)));
638+
626639 // Trimming helpers: each removes leading and trailing spaces (TRIM with
// LEFT and RIGHT) from its argument; there is one helper per text type
// (STRING, UNICODE, UTF8) so the value keeps its type.
627640 #UNIQUENAME (_TrimmedStr);
628641 LOCAL %_TrimmedStr%(STRING s) := TRIM (s, LEFT , RIGHT );
629642 #UNIQUENAME (_TrimmedUni);
630643 LOCAL %_TrimmedUni%(UNICODE s) := TRIM (s, LEFT , RIGHT );
644+ #UNIQUENAME (_TrimmedUTF8);
645+ LOCAL %_TrimmedUTF8%(UTF8 s) := TRIM (s, LEFT , RIGHT );
631646
632647 // Collect a list of the top-level attributes that we can process,
633648 // determine the actual maximum length of a data pattern (if we can
@@ -711,7 +726,7 @@ EXPORT Profile(inFile,
711726 %Attribute_t% attribute;
712727 %AttributeType_t% given_attribute_type;
713728 %StringValue_t% string_value;
714- UNSIGNED4 value_count;
729+ %RecCount_t% value_count;
715730 %DataPattern_t% data_pattern;
716731 UNSIGNED4 data_length;
717732 BOOLEAN is_filled;
@@ -765,18 +780,24 @@ EXPORT Profile(inFile,
765780 #ELSE
766781 %_TrimmedUni%((%StringValue_t%)_inFile.#EXPAND (%'namePrefix' % + %'@name' %))
767782 #END ,
768- UNSIGNED4 value_count := COUNT (GROUP ),
783+ %RecCount_t% value_count := COUNT (GROUP ),
769784 %DataPattern_t% data_pattern :=
770785 #IF (%_IsSetType%(%'@type' %))
771786 %_MapAllStr%(%_TrimmedStr%(Std.Str .CombineWords((SET OF STRING )_inFile.#EXPAND (%'namePrefix' % + %'@name' %), ', ' ))[..%foundMaxPatternLen%])
772787 #ELSEIF (REGEXFIND ('(integer)|(unsigned)|(decimal)|(real)' , %'@type' %))
773788 %_MapAllStr%((STRING )_inFile.#EXPAND (%'namePrefix' % + %'@name' %))
774- #ELSEIF (REGEXFIND ('( unicode)|(utf) ' , %'@type' %))
789+ #ELSEIF (REGEXFIND ('unicode' , %'@type' %))
775790 #IF (%@size% < 0 OR (%@size% DIV 2 + 1 ) > %foundMaxPatternLen%)
776791 %_MapAllUni%(%_TrimmedUni%((UNICODE )_inFile.#EXPAND (%'namePrefix' % + %'@name' %))[..%foundMaxPatternLen%])
777792 #ELSE
778793 %_MapAllUni%(%_TrimmedUni%((UNICODE )_inFile.#EXPAND (%'namePrefix' % + %'@name' %)))
779794 #END
795+ #ELSEIF (REGEXFIND ('utf' , %'@type'%))
// UTF8 attribute: trim it, map it with the UTF8 helpers and cap the pattern
// at %foundMaxPatternLen% characters. The value is cast to UTF8, the type
// both helpers declare, rather than detouring through UNICODE.
// The size test mirrors the 'string' arm: a UTF8 character takes at least
// one byte, so the declared size is an upper bound on the character count.
// Truncating a value that is already short enough is a no-op, so erring
// towards truncation is safe.
// NOTE(review): assumes %@size% is negative for variable-length fields, as the sibling arms do -- confirm.
796+ #IF (%@size% < 0 OR %@size% > %foundMaxPatternLen%)
797+ %_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND (%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
798+ #ELSE
799+ %_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND (%'namePrefix'% + %'@name'%)))
800+ #END
780801 #ELSEIF (REGEXFIND ('string' , %'@type' %))
781802 #IF (%@size% < 0 OR %@size% > %foundMaxPatternLen%)
782803 %_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND (%'namePrefix' % + %'@name' %))[..%foundMaxPatternLen%])
@@ -1117,7 +1138,7 @@ EXPORT Profile(inFile,
11171138 %filledDataInfoNumeric%(attribute = %'namePrefix' % + %'@name' %),
11181139 {
11191140 string_value,
1120- UNSIGNED4 rec_count := SUM (GROUP , value_count)
1141+ %RecCount_t% rec_count := SUM (GROUP , value_count)
11211142 },
11221143 string_value,
11231144 MERGE
@@ -1287,8 +1308,8 @@ EXPORT Profile(inFile,
12871308 {
12881309 attribute,
12891310 data_pattern,
1290- UTF8 example := string_value[..%foundMaxPatternLen%],
1291- UNSIGNED4 rec_count := SUM (GROUP , value_count)
1311+ UTF8 example := string_value[..%foundMaxPatternLen%],
1312+ %RecCount_t% rec_count := SUM (GROUP , value_count)
12921313 },
12931314 attribute, data_pattern,
12941315 MERGE
@@ -1333,8 +1354,8 @@ EXPORT Profile(inFile,
13331354 {
13341355 attribute,
13351356 given_attribute_type,
1336- UNSIGNED4 rec_count := SUM (GROUP , value_count),
1337- UNSIGNED4 filled_count := SUM (GROUP , IF (is_filled, value_count, 0 ))
1357+ %RecCount_t% rec_count := SUM (GROUP , value_count),
1358+ %RecCount_t% filled_count := SUM (GROUP , IF (is_filled, value_count, 0 ))
13381359 },
13391360 attribute, given_attribute_type,
13401361 MERGE
@@ -1430,7 +1451,7 @@ EXPORT Profile(inFile,
14301451 {
14311452 %Attribute_t% attribute,
14321453 BOOLEAN is_numeric,
1433- UNSIGNED4 cardinality,
1454+ %RecCount_t% cardinality,
14341455 REAL numeric_min,
14351456 REAL numeric_max,
14361457 REAL numeric_mean,
0 commit comments