Skip to content

Commit bd1df9e

Browse files
committed
Merge branch 'upstream-candidate-1.9.4' into upstream-master
2 parents 2cbc961 + 281b343 commit bd1df9e

3 files changed

Lines changed: 25 additions & 21 deletions

File tree

Bundle.ecl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,5 @@ EXPORT Bundle := MODULE(Std.BundleBase)
66
EXPORT License := 'http://www.apache.org/licenses/LICENSE-2.0';
77
EXPORT Copyright := 'Copyright (C) 2024 HPCC Systems';
88
EXPORT DependsOn := [];
9-
EXPORT Version := '1.9.3';
9+
EXPORT Version := '1.9.4';
1010
END;

Profile.ecl

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,9 @@ EXPORT Profile(inFile,
529529

530530
// Determine if a UTF-8 string really contains multi-byte (non-ASCII) UTF-8 characters
531531
#UNIQUENAME(IsUTF8);
532-
LOCAL BOOLEAN %IsUTF8%(UTF8 str) := EMBED(C++)
532+
LOCAL BOOLEAN %IsUTF8%(DATA str) := EMBED(C++)
533+
#option pure;
534+
533535
if (lenStr == 0)
534536
return false;
535537
@@ -543,37 +545,37 @@ EXPORT Profile(inFile,
543545
// ASCII; continue scan
544546
bytes += 1;
545547
}
546-
else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF))
548+
else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) && (bytes+1 < endPtr) && (0x80 <= bytes[1] && bytes[1] <= 0xBF))
547549
{
548550
// Valid non-overlong 2-byte
549551
return true;
550552
}
551-
else if (bytes[0] == 0xE0 && (0xA0 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
553+
else if (bytes[0] == 0xE0 && (bytes+2 < endPtr) && (0xA0 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
552554
{
553555
// Valid excluding overlongs
554556
return true;
555557
}
556-
else if (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || bytes[0] == 0xEE || bytes[0] == 0xEF) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
558+
else if (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || bytes[0] == 0xEE || bytes[0] == 0xEF) && (bytes+2 < endPtr) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
557559
{
558560
// Valid straight 3-byte
559561
return true;
560562
}
561-
else if (bytes[0] == 0xED && (0x80 <= bytes[1] && bytes[1] <= 0x9F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
563+
else if (bytes[0] == 0xED && (bytes+2 < endPtr) && (0x80 <= bytes[1] && bytes[1] <= 0x9F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF))
562564
{
563565
// Valid excluding surrogates
564566
return true;
565567
}
566-
else if (bytes[0] == 0xF0 && (0x90 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
568+
else if (bytes[0] == 0xF0 && (bytes+3 < endPtr) && (0x90 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
567569
{
568570
// Valid planes 1-3
569571
return true;
570572
}
571-
else if ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
573+
else if ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) && (bytes+3 < endPtr) && (0x80 <= bytes[1] && bytes[1] <= 0xBF) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
572574
{
573575
// Valid planes 4-15
574576
return true;
575577
}
576-
else if (bytes[0] == 0xF4 && (0x80 <= bytes[1] && bytes[1] <= 0x8F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
578+
else if (bytes[0] == 0xF4 && (bytes+3 < endPtr) && (0x80 <= bytes[1] && bytes[1] <= 0x8F) && (0x80 <= bytes[2] && bytes[2] <= 0xBF) && (0x80 <= bytes[3] && bytes[3] <= 0xBF))
577579
{
578580
// Valid plane 16
579581
return true;
@@ -613,9 +615,9 @@ EXPORT Profile(inFile,
613615
// Pattern mapping a UNICODE datatype; using regex due to the complexity
614616
// of the character set
615617
#UNIQUENAME(_MapUpperCharUni);
616-
LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'\\p{Uppercase_Letter}', s, u'A');
618+
LOCAL %_MapUpperCharUni%(UNICODE s) := REGEXREPLACE(u'\\p{Lu}', s, u'A');
617619
#UNIQUENAME(_MapLowerCharUni);
618-
LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[\\p{Lowercase_Letter}][\\p{Titlecase_Letter}][\\p{Modifier_Letter}][\\p{Other_Letter}]]', s, u'a');
620+
LOCAL %_MapLowerCharUni%(UNICODE s) := REGEXREPLACE(u'[[\\p{Ll}][\\p{Lt}][\\p{Lm}][\\p{Lo}]]', s, u'a');
619621
#UNIQUENAME(_MapDigitUni);
620622
LOCAL %_MapDigitUni%(UNICODE s) := REGEXREPLACE(u'[1-9]', s, u'9'); // Leave '0' as-is and replace with '9' later
621623
#UNIQUENAME(_MapAllUni);
@@ -824,7 +826,7 @@ EXPORT Profile(inFile,
824826
#IF(%_IsSetType%(%'@type'%))
825827
FALSE
826828
#ELSEIF(REGEXFIND('(unicode)|(utf)', %'@type'%))
827-
%IsUTF8%((UTF8)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
829+
%IsUTF8%((DATA)_inFile.#EXPAND(%'namePrefix'% + %'@name'%))
828830
#ELSE
829831
FALSE
830832
#END

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,19 @@ research tools to an ECL programmer.
2121
<a name="installation"></a>
2222
### Installation
2323

24-
**Note:** `DataPatterns.Profile()` and `DataPatterns.BestRecordStructure()` are
25-
now included in HPCC version 7.4.0! They have been added to the ECL Standard
26-
Library (within `Std.DataPatterns`) and also integrated with ECL Watch so you can
27-
create a profile from a saved logical file using only a web browser. Note that
28-
the Std library version of Profile() will create a visualization of the results
29-
only when executed from ECL Watch; visualizations will not be generated if
30-
Profile() is called from ECL code. If that is important to you, install this
31-
bundle version instead (they coexist peacefully).
24+
**Note:** `Profile()`, `BestRecordStructure()` and `Benford()` are
25+
now included in the HPCC Systems platform! They have been added to the ECL
26+
Standard Library (within `Std.DataPatterns`) and `Profile()` has also been
27+
integrated within ECL Watch so you can create a profile from a saved logical file
28+
using only a web browser. Note that the Std library version of `Profile()` will
29+
create a visualization of the results only when executed from ECL Watch;
30+
visualizations will not be generated if `Std.DataPatterns.Profile()` is
31+
called from ECL code. If that is important to you, install this bundle
32+
version instead (they coexist peacefully).
3233

3334
This code is installed as an ECL Bundle. Complete instructions for managing ECL
3435
Bundles can be found in [The ECL IDE and HPCC Client
35-
Tools](https://cdn.hpccsystems.com/releases/CE-Candidate-7.12.0/docs/EN_US/TheECLIDEandHPCCClientTools_EN_US-7.12.0-1.pdf)
36+
Tools](https://cdn.hpccsystems.com/releases/CE-Candidate-9.4.2/docs/EN_US/TheECLIDEandHPCCClientTools_EN_US-9.4.2-1.pdf)
3637
documentation.
3738

3839
Use the ecl command line tool to install this bundle:
@@ -101,6 +102,7 @@ level, such as within your "My Files" folder.
101102
|1.9.1|Fix IMPORT in (Profile) Tests module; support UTF-8 strings in Mode values and example text patterns|
102103
|1.9.2|Security updates|
103104
|1.9.3|Better identify upper- and lower-case Unicode characters in text patterns; scan Unicode and UTF-8 strings to see if they can be represented with a STRING data type instead|
105+
|1.9.4|README fixes and updates; improve UTF-8 detection and prevent buffer overruns during character scans; use short form of Unicode property names in regex|
104106
</details>
105107

106108
---

0 commit comments

Comments
 (0)