Skip to content

Commit 7868c9f

Browse files
committed
Add UTF8-specific regex and trim functions to avoid data casting if possible
1 parent ed186a6 commit 7868c9f

2 files changed

Lines changed: 21 additions & 2 deletions

File tree

Profile.ecl

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -625,11 +625,24 @@ EXPORT Profile(inFile,
625625
#UNIQUENAME(_MapAllUni);
626626
LOCAL %_MapAllUni%(UNICODE s) := (STRING)%_MapDigitUni%(%_MapLowerCharUni%(%_MapUpperCharUni%(s)));
627627

628+
// Pattern mapping a UTF8 datatype; using regex due to the complexity
629+
// of the character set
630+
#UNIQUENAME(_MapUpperCharUTF8);
631+
LOCAL %_MapUpperCharUTF8%(UTF8 s) := REGEXREPLACE(u8'\\p{Lu}', s, u8'A');
632+
#UNIQUENAME(_MapLowerCharUTF8);
633+
LOCAL %_MapLowerCharUTF8%(UTF8 s) := REGEXREPLACE(u8'[\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}]', s, u8'a'); // Ll/Lt/Lm/Lo = lowercase, titlecase, modifier and other letters; each becomes 'a'
634+
#UNIQUENAME(_MapDigitUTF8);
635+
LOCAL %_MapDigitUTF8%(UTF8 s) := REGEXREPLACE(u8'[1-9]', s, u8'9'); // Leave '0' as-is and replace with '9' later
636+
#UNIQUENAME(_MapAllUTF8);
637+
LOCAL %_MapAllUTF8%(UTF8 s) := (STRING)%_MapDigitUTF8%(%_MapLowerCharUTF8%(%_MapUpperCharUTF8%(s))); // Innermost call runs first: upper-case mapping, then lower-case mapping, then digit mapping; the UTF8 result is cast to STRING
638+
628639
// Trimming strings
629640
#UNIQUENAME(_TrimmedStr);
630641
LOCAL %_TrimmedStr%(STRING s) := TRIM(s, LEFT, RIGHT);
631642
#UNIQUENAME(_TrimmedUni);
632643
LOCAL %_TrimmedUni%(UNICODE s) := TRIM(s, LEFT, RIGHT);
644+
#UNIQUENAME(_TrimmedUTF8);
645+
LOCAL %_TrimmedUTF8%(UTF8 s) := TRIM(s, LEFT, RIGHT);
633646

634647
// Collect a list of the top-level attributes that we can process,
635648
// determine the actual maximum length of a data pattern (if we can
@@ -773,12 +786,18 @@ EXPORT Profile(inFile,
773786
%_MapAllStr%(%_TrimmedStr%(Std.Str.CombineWords((SET OF STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%), ', '))[..%foundMaxPatternLen%])
774787
#ELSEIF(REGEXFIND('(integer)|(unsigned)|(decimal)|(real)', %'@type'%))
775788
%_MapAllStr%((STRING)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
776-
#ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
789+
#ELSEIF(REGEXFIND('unicode', %'@type'%))
777790
#IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
778791
%_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
779792
#ELSE
780793
%_MapAllUni%(%_TrimmedUni%((UNICODE)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
781794
#END
795+
#ELSEIF(REGEXFIND('utf', %'@type'%)) // UTF-8 attributes: keep the value as UTF8 so the UTF8 trim/regex helpers run without a UNICODE round trip
796+
#IF(%@size% < 0 OR (%@size% DIV 2 + 1) > %foundMaxPatternLen%)
797+
%_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])
798+
#ELSE
799+
%_MapAllUTF8%(%_TrimmedUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%)))
800+
#END
782801
#ELSEIF(REGEXFIND('string', %'@type'%))
783802
#IF(%@size% < 0 OR %@size% > %foundMaxPatternLen%)
784803
%_MapAllStr%(%_TrimmedStr%(_inFile.#EXPAND(%'namePrefix'% + %'@name'%))[..%foundMaxPatternLen%])

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ level, such as within your "My Files" folder.
104104
|1.9.3|Better identify upper- and lower-case Unicode characters in text patterns; scan Unicode and UTF-8 strings to see if they can be represented with a STRING data type instead|
105105
|1.9.4|README fixes and updates; improve UTF-8 detection and prevent buffer overruns during character scans; use short form of Unicode property names in regex|
106106
|1.9.5|Correct Unicode regex regression introduced in 1.9.4|
107-
|1.10.0||
107+
|1.10.0|Expand "record count" fields from UNSIGNED4 to UNSIGNED6 -- thanks to Manjunath Venkataswamy for requesting this improvement; add UTF8-specific TRIM and regex calls to avoid casting if possible|
108108
</details>
109109

110110
---

0 commit comments

Comments
 (0)