Skip to content

Commit 2ec4664

Browse files
authored
Do simple re.ASCII matching (#814)
* Rewrite regex instead of using IgnorePatternWhitespace
* Map ASCII flag to RegexOptions.ECMAScript
* Enable test_shlex
* Add test case
1 parent 0921938 commit 2ec4664

4 files changed

Lines changed: 121 additions & 67 deletions

File tree

Src/IronPython.Modules/re.cs

Lines changed: 108 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -46,23 +46,36 @@ public static void PerformModuleReload(PythonContext/*!*/ context, PythonDiction
4646

4747
#region CONSTANTS
4848

49+
[Flags]
50+
internal enum ReFlags : int {
51+
TEMPLATE = 0x01,
52+
IGNORECASE = 0x02,
53+
LOCALE = 0x04,
54+
MULTILINE = 0x08,
55+
DOTALL = 0x10,
56+
UNICODE = 0x20,
57+
VERBOSE = 0x40,
58+
DEBUG = 0x80,
59+
ASCII = 0x100,
60+
}
61+
4962
// short forms
50-
public const int I = 0x02;
51-
public const int L = 0x04;
52-
public const int M = 0x08;
53-
public const int S = 0x10;
54-
public const int U = 0x20;
55-
public const int X = 0x40;
56-
public const int A = 0x100;
63+
public const int I = (int)ReFlags.IGNORECASE;
64+
public const int L = (int)ReFlags.LOCALE;
65+
public const int M = (int)ReFlags.MULTILINE;
66+
public const int S = (int)ReFlags.DOTALL;
67+
public const int U = (int)ReFlags.UNICODE;
68+
public const int X = (int)ReFlags.VERBOSE;
69+
public const int A = (int)ReFlags.ASCII;
5770

5871
// long forms
59-
public const int IGNORECASE = 0x02;
60-
public const int LOCALE = 0x04;
61-
public const int MULTILINE = 0x08;
62-
public const int DOTALL = 0x10;
63-
public const int UNICODE = 0x20;
64-
public const int VERBOSE = 0x40;
65-
public const int ASCII = 0x100;
72+
public const int IGNORECASE = (int)ReFlags.IGNORECASE;
73+
public const int LOCALE = (int)ReFlags.LOCALE;
74+
public const int MULTILINE = (int)ReFlags.MULTILINE;
75+
public const int DOTALL = (int)ReFlags.DOTALL;
76+
public const int UNICODE = (int)ReFlags.UNICODE;
77+
public const int VERBOSE = (int)ReFlags.VERBOSE;
78+
public const int ASCII = (int)ReFlags.ASCII;
6679

6780
#endregion
6881

@@ -150,28 +163,30 @@ public class Pattern : IWeakReferenceable {
150163
private PythonDictionary _groups;
151164
private WeakRefTracker _weakRefTracker;
152165

153-
internal Pattern(CodeContext/*!*/ context, object pattern, int flags = 0, bool compiled = false) {
154-
_pre = PreParseRegex(context, PatternAsString(pattern), (flags & VERBOSE) != 0);
155-
flags |= OptionToFlags(_pre.Options);
166+
internal Pattern(CodeContext/*!*/ context, object pattern, ReFlags flags = 0, bool compiled = false) {
167+
_pre = PreParseRegex(context, PatternAsString(pattern, ref flags), (flags & ReFlags.VERBOSE) != 0);
168+
flags |= _pre.Options;
156169
_re = GenRegex(context, _pre.Pattern, flags, compiled, false);
157170
this.pattern = pattern;
158-
this.flags = flags;
171+
this.flags = (int)flags;
159172

160-
static string PatternAsString(object pattern) {
173+
static string PatternAsString(object pattern, ref ReFlags flags) {
161174
switch (pattern) {
162175
case Bytes bytes:
163176
return bytes.MakeString();
164177
case string s:
178+
flags |= ReFlags.UNICODE;
165179
return s;
166180
case ExtensibleString es:
181+
flags |= ReFlags.UNICODE;
167182
return es.Value;
168183
default:
169184
throw new ArgumentTypeException();
170185
}
171186
}
172187
}
173188

174-
private static Regex GenRegex(CodeContext/*!*/ context, string pattern, int flags, bool compiled, bool fullmatch) {
189+
private static Regex GenRegex(CodeContext/*!*/ context, string pattern, ReFlags flags, bool compiled, bool fullmatch) {
175190
try {
176191
RegexOptions opts = FlagsToOption(flags);
177192
return new Regex(fullmatch ? $"(?:{pattern})\\Z" : pattern, opts | (compiled ? RegexOptions.Compiled : RegexOptions.None));
@@ -210,7 +225,7 @@ private Regex GetRegexFullMatch(CodeContext /*!*/ context) {
210225
if (_re_fullmatch == null) {
211226
lock (_re) {
212227
if (_re_fullmatch == null)
213-
_re_fullmatch = GenRegex(context, _pre.Pattern, flags, _re.Options.HasFlag(RegexOptions.Compiled), true);
228+
_re_fullmatch = GenRegex(context, _pre.Pattern, (ReFlags)flags, _re.Options.HasFlag(RegexOptions.Compiled), true);
214229
}
215230
}
216231

@@ -837,7 +852,7 @@ private static Pattern GetPattern(CodeContext/*!*/ context, object pattern, int
837852
return res;
838853
}
839854
}
840-
res = new Pattern(context, pattern, flags, compiled);
855+
res = new Pattern(context, pattern, (ReFlags)flags, compiled);
841856
_cachedPatterns[key] = res;
842857
return res;
843858
}
@@ -849,45 +864,25 @@ private static IEnumerator MatchIterator(MatchCollection matches, Pattern patter
849864
}
850865
}
851866

852-
private static RegexOptions FlagsToOption(int flags) {
867+
private static RegexOptions FlagsToOption(ReFlags flags) {
853868
RegexOptions opts = RegexOptions.None;
854-
if ((flags & (int)IGNORECASE) != 0) opts |= RegexOptions.IgnoreCase;
855-
if ((flags & (int)MULTILINE) != 0) opts |= RegexOptions.Multiline;
856-
if (((flags & (int)LOCALE)) == 0) opts &= (~RegexOptions.CultureInvariant);
857-
if ((flags & (int)DOTALL) != 0) opts |= RegexOptions.Singleline;
858-
if ((flags & (int)VERBOSE) != 0) opts |= RegexOptions.IgnorePatternWhitespace;
869+
if ((flags & ReFlags.ASCII) != 0) opts |= RegexOptions.ECMAScript;
870+
if ((flags & ReFlags.IGNORECASE) != 0) opts |= RegexOptions.IgnoreCase;
871+
if ((flags & ReFlags.MULTILINE) != 0) opts |= RegexOptions.Multiline;
872+
if ((flags & ReFlags.LOCALE) == 0) opts &= ~RegexOptions.CultureInvariant;
873+
if ((flags & ReFlags.DOTALL) != 0) opts |= RegexOptions.Singleline;
859874

860875
return opts;
861876
}
862877

863-
private static int OptionToFlags(RegexOptions options) {
864-
int flags = 0;
865-
if ((options & RegexOptions.IgnoreCase) != 0) {
866-
flags |= IGNORECASE;
867-
}
868-
if ((options & RegexOptions.Multiline) != 0) {
869-
flags |= MULTILINE;
870-
}
871-
if ((options & RegexOptions.CultureInvariant) == 0) {
872-
flags |= LOCALE;
873-
}
874-
if ((options & RegexOptions.Singleline) != 0) {
875-
flags |= DOTALL;
876-
}
877-
if ((options & RegexOptions.IgnorePatternWhitespace) != 0) {
878-
flags |= VERBOSE;
879-
}
880-
return flags;
881-
}
882-
883878
internal class ParsedRegex {
884879
public ParsedRegex(string pattern) {
885880
this.UserPattern = pattern;
886881
}
887882

888883
public string UserPattern;
889884
public string Pattern;
890-
public RegexOptions Options = RegexOptions.CultureInvariant;
885+
public ReFlags Options;
891886
}
892887

893888
private static readonly char[] _endOfLineChars = new[] { '\r', '\n' };
@@ -900,6 +895,7 @@ public ParsedRegex(string pattern) {
900895
/// </summary>
901896
private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string pattern, bool verbose) {
902897
ParsedRegex res = new ParsedRegex(pattern);
898+
if (verbose) res.Options |= ReFlags.VERBOSE;
903899

904900
//string newPattern;
905901
int cur = 0, nameIndex;
@@ -911,6 +907,58 @@ private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string patter
911907
int groupCount = 0;
912908
var namedGroups = new Dictionary<string, int>();
913909

910+
if (verbose) {
911+
pattern = ApplyVerbose(pattern);
912+
}
913+
914+
static string ApplyVerbose(string pattern) {
915+
var builder = new StringBuilder();
916+
917+
bool isCharList = false;
918+
bool isEscaped = false;
919+
920+
for (int i = 0; i < pattern.Length; i++) {
921+
var c = pattern[i];
922+
if (isEscaped) {
923+
isEscaped = false;
924+
} else {
925+
switch (c) {
926+
case ' ':
927+
case '\t':
928+
case '\n':
929+
case '\r':
930+
case '\f':
931+
case '\v':
932+
if (!isCharList) continue;
933+
break;
934+
case '\\':
935+
isEscaped = true;
936+
break;
937+
case '[':
938+
isCharList = true;
939+
break;
940+
case ']':
941+
isCharList = false;
942+
break;
943+
case '#':
944+
if (!isCharList) {
945+
// skip to end of line
946+
i = pattern.IndexOfAny(_endOfLineChars, i);
947+
if (i < 0) i = pattern.Length;
948+
continue;
949+
}
950+
break;
951+
default:
952+
break;
953+
}
954+
}
955+
956+
builder.Append(c);
957+
}
958+
959+
return builder.ToString();
960+
}
961+
914962
for (; ; ) {
915963
if (verbose && inComment) {
916964
// read to end of line
@@ -1008,30 +1056,34 @@ private static ParsedRegex PreParseRegex(CodeContext/*!*/ context, string patter
10081056
pattern = pattern.Remove(nameIndex, 1);
10091057
}
10101058

1059+
break;
1060+
case 'a':
1061+
res.Options |= ReFlags.ASCII;
1062+
RemoveOption(ref pattern, ref nameIndex);
10111063
break;
10121064
case 'i':
1013-
res.Options |= RegexOptions.IgnoreCase;
1065+
res.Options |= ReFlags.IGNORECASE;
10141066
RemoveOption(ref pattern, ref nameIndex);
10151067
break;
10161068
case 'L':
1017-
res.Options &= ~(RegexOptions.CultureInvariant);
1069+
res.Options |= ReFlags.LOCALE;
10181070
RemoveOption(ref pattern, ref nameIndex);
10191071
break;
10201072
case 'm':
1021-
res.Options |= RegexOptions.Multiline;
1073+
res.Options |= ReFlags.MULTILINE;
10221074
RemoveOption(ref pattern, ref nameIndex);
10231075
break;
10241076
case 's':
1025-
res.Options |= RegexOptions.Singleline;
1077+
res.Options |= ReFlags.DOTALL;
10261078
RemoveOption(ref pattern, ref nameIndex);
10271079
break;
10281080
case 'u':
1029-
// specify unicode; not relevant and not valid under .NET as we're always unicode
1030-
// -- so the option needs to be removed
1081+
res.Options |= ReFlags.UNICODE;
10311082
RemoveOption(ref pattern, ref nameIndex);
10321083
break;
10331084
case 'x':
1034-
res.Options |= RegexOptions.IgnorePatternWhitespace;
1085+
if (!verbose) return PreParseRegex(context, res.UserPattern, true);
1086+
res.Options |= ReFlags.VERBOSE;
10351087
RemoveOption(ref pattern, ref nameIndex);
10361088
break;
10371089
case ':': break; // non-capturing

Src/IronPythonTest/Cases/CPythonCasesManifest.ini

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -848,9 +848,6 @@ IsolationLevel=PROCESS # Also weakref failures; https://github.com/IronLanguages
848848
[CPython.test_shelve]
849849
NotParallelSafe=true
850850

851-
[CPython.test_shlex]
852-
Ignore=true
853-
854851
[CPython.test_shutil]
855852
Ignore=true
856853
Reason=AttributeError: 'module' object has no attribute 'isatty'

Tests/modules/io_related/test_re.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,8 @@ def test_sanity_re_pattern(self):
187187
self.assertEqual(pattern.subn("1", "abcdabcd", 2), ("1d1d",2))
188188

189189
#flags
190-
self.assertEqual(pattern.flags, 0 if is_cli else 32)
191-
self.assertEqual(re.compile("(abc){1}", re.L).flags, re.L | (0 if is_cli else 32))
190+
self.assertEqual(pattern.flags, re.U)
191+
self.assertEqual(re.compile("(abc){1}", re.L).flags, re.L | re.U)
192192

193193
#groupindex
194194
self.assertEqual(pattern.groupindex, {})
@@ -622,9 +622,9 @@ def test_groups(self):
622622
self.assertTrue ( m2.groups('Default') == ('Default',))
623623

624624
def test_locale_flags(self):
625-
self.assertEqual(re.compile(r"^\#[ \t]*(\w[\d\w]*)[ \t](.*)").flags, 0 if is_cli else re.U)
626-
self.assertEqual(re.compile(r"^\#[ \t]*(\w[\d\w]*)[ \t](.*)", re.L).flags, re.L | (0 if is_cli else re.U))
627-
self.assertEqual(re.compile(r"(?L)^\#[ \t]*(\w[\d\w]*)[ \t](.*)").flags, re.L | (0 if is_cli else re.U))
625+
self.assertEqual(re.compile(r"^\#[ \t]*(\w[\d\w]*)[ \t](.*)").flags, re.U)
626+
self.assertEqual(re.compile(r"^\#[ \t]*(\w[\d\w]*)[ \t](.*)", re.L).flags, re.L | re.U)
627+
self.assertEqual(re.compile(r"(?L)^\#[ \t]*(\w[\d\w]*)[ \t](.*)").flags, re.L | re.U)
628628

629629
def test_end(self):
630630
ex = re.compile(r'\s+')
@@ -771,7 +771,7 @@ def test__pickle(self):
771771
pickled_regex = re._pickle(regex)
772772
self.assertEqual(len(pickled_regex), 2)
773773
self.assertEqual(pickled_regex[1],
774-
('^(?P<msg>NMAKE[A-Za-z0-9]*)\'\\"?(?P<file>[\\\\A-Za-z0-9/:_\\.\\+]+)', 0 if is_cli else re.U))
774+
('^(?P<msg>NMAKE[A-Za-z0-9]*)\'\\"?(?P<file>[\\\\A-Za-z0-9/:_\\.\\+]+)', re.U))
775775

776776
def test_conditional(self):
777777
p = re.compile(r'(a)?(b)((?(1)c))')
@@ -801,10 +801,15 @@ def test_issue1370(self):
801801
self.assertEqual(re.compile("\Z").match("\n"), None)
802802
self.assertEqual(re.compile("\Z").match("").group(0), "")
803803

804-
def test_gh21(self):
804+
def test_ipy2_gh21(self):
805805
"""https://github.com/IronLanguages/ironpython2/issues/21"""
806806
self.assertRaisesMessage(re.error, "redefinition of group name 'hoge' as group 2; was group 1", re.compile, r'(?P<hoge>\w+):(?P<hoge>\w+)')
807807
self.assertRaisesMessage(re.error, "redefinition of group name 'hoge' as group 3; was group 2", re.compile, r'(abc)(?P<hoge>\w+):(?P<hoge>\w+)')
808808
self.assertRaisesMessage(re.error, "redefinition of group name 'hoge' as group 4; was group 2", re.compile, r'(abc)(?P<hoge>\w+):(abc)(?P<hoge>\w+)')
809809

810+
def test_ipy3_gh814(self):
811+
"""https://github.com/IronLanguages/ironpython3/issues/814"""
812+
self.assertEqual(re.match(r'\s+', "\xa0", flags=re.UNICODE).group(0), "\xa0")
813+
self.assertIsNone(re.match(r'\s+', "\xa0", flags=re.ASCII))
814+
810815
run_test(__name__)

Tests/test_re_stdlib.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def load_tests(loader, standard_tests, pattern):
7171
suite.addTest(test.test_re.ReTests('test_expand'))
7272
suite.addTest(test.test_re.ReTests('test_finditer'))
7373
suite.addTest(test.test_re.ReTests('test_flags'))
74-
suite.addTest(unittest.expectedFailure(test.test_re.ReTests('test_getattr')))
74+
suite.addTest(test.test_re.ReTests('test_getattr'))
7575
suite.addTest(test.test_re.ReTests('test_getlower'))
7676
suite.addTest(unittest.expectedFailure(test.test_re.ReTests('test_group_name_in_exception')))
7777
suite.addTest(test.test_re.ReTests('test_groupdict'))

0 commit comments

Comments
 (0)