Skip to content

Commit 0ae4b67

Browse files
authored
Merge pull request #1458 from mathics/strings-and-characters-organize
Strings and characters organize
2 parents d32cc49 + b705892 commit 0ae4b67

14 files changed

Lines changed: 2576 additions & 2435 deletions

mathics/builtin/__init__.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def add_builtins(new_builtins):
4040
if isinstance(builtin, SympyObject):
4141
mathics_to_sympy[name] = builtin
4242
for sympy_name in builtin.get_sympy_names():
43-
### print("XXX1", sympy_name)
43+
# print("XXX1", sympy_name)
4444
sympy_to_mathics[sympy_name] = builtin
4545
if isinstance(builtin, Operator):
4646
builtins_precedence[name] = builtin.precedence
@@ -153,7 +153,16 @@ def is_builtin(var):
153153
[] if ENABLE_FILES_MODULE else ["files_io.files", "files_io.importexport"]
154154
)
155155

156-
for subdir in ("colors", "drawing", "files_io", "numbers", "specialfns", "fileformats"):
156+
for subdir in (
157+
"colors",
158+
"distance",
159+
"drawing",
160+
"files_io",
161+
"numbers",
162+
"specialfns",
163+
"string",
164+
"fileformats",
165+
):
157166
import_name = f"{__name__}.{subdir}"
158167

159168
if import_name in disable_file_module_names:

mathics/builtin/colors/color_directives.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""
22
Color Directives
3+
4+
There are many different ways to specify color; we support all of the color formats below and will convert between the different color formats.
35
"""
46

57
from math import atan2, cos, exp, pi, radians, sin, sqrt
@@ -225,24 +227,27 @@ class CMYKColor(_Color):
225227
class ColorDistance(Builtin):
226228
"""
227229
<dl>
228-
<dt>'ColorDistance[$c1$, $c2$]'
229-
<dd>returns a measure of color distance between the colors $c1$ and $c2$.
230-
<dt>'ColorDistance[$list$, $c2$]'
231-
<dd>returns a list of color distances between the colors in $list$ and $c2$.
230+
<dt>'ColorDistance[$c1$, $c2$]'
231+
<dd>returns a measure of color distance between the colors $c1$ and $c2$.
232+
233+
<dt>'ColorDistance[$list$, $c2$]'
234+
<dd>returns a list of color distances between the colors in $list$ and $c2$.
232235
</dl>
233236
234237
The option DistanceFunction specifies the method used to measure the color
235238
distance. Available options are:
236239
237-
CIE76: euclidean distance in the LABColor space
238-
CIE94: euclidean distance in the LCHColor space
239-
CIE2000 or CIEDE2000: CIE94 distance with corrections
240-
CMC: Colour Measurement Committee metric (1984)
241-
DeltaL: difference in the L component of LCHColor
242-
DeltaC: difference in the C component of LCHColor
243-
DeltaH: difference in the H component of LCHColor
240+
<ul>
241+
<li>CIE76: Euclidean distance in the LABColor space
242+
<li>CIE94: Euclidean distance in the LCHColor space
243+
<li>CIE2000 or CIEDE2000: CIE94 distance with corrections
244+
<li>CMC: Color Measurement Committee metric (1984)
245+
<li>DeltaL: difference in the L component of LCHColor
246+
<li>DeltaC: difference in the C component of LCHColor
247+
<li>DeltaH: difference in the H component of LCHColor
248+
</ul>
244249
245-
It is also possible to specify a custom distance
250+
It is also possible to specify a custom distance.
246251
247252
>> ColorDistance[Magenta, Green]
248253
= 2.2507
@@ -374,7 +379,7 @@ def compute(a, b):
374379
),
375380
)
376381

377-
if compute == None:
382+
if compute is None:
378383
evaluation.message("ColorDistance", "invdist", distance_function)
379384
return
380385

mathics/builtin/colors/color_operations.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# -*- coding: utf-8 -*-
2-
"""Color Operations"""
2+
"""Color Operations
3+
4+
Functions for manipulating colors and color images.
5+
"""
36

47
from mathics.version import __version__ # noqa used in loading to check consistency.
58

mathics/builtin/colors/named_colors.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""Named Colors
33
4-
Mathics has definitions for the most common color names which can be
5-
used in a graphics or style specification.
4+
Mathics has definitions for the most common color names which can be used in a graphics or style specification.
65
"""
76

87
from mathics.builtin.base import Builtin
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""
2+
Distance and Similarity Measures
3+
4+
Different measures of distance or similarity for different types of analysis.
5+
"""
6+
7+
from mathics.version import __version__ # noqa used in loading to check consistency.
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
# -*- coding: utf-8 -*-
2+
"""
3+
String Distances and Similarity Measures
4+
"""
5+
6+
import unicodedata
7+
8+
from typing import Callable
9+
10+
from mathics.version import __version__ # noqa used in loading to check consistency.
11+
12+
from mathics.builtin.base import Builtin
13+
14+
from mathics.core.expression import (
15+
Expression,
16+
Integer,
17+
String,
18+
SymbolTrue,
19+
)
20+
21+
22+
# Levenshtein's algorithm is defined by the following construction:
23+
# (adapted from https://de.wikipedia.org/wiki/Levenshtein-Distanz)
24+
#
25+
# given two strings s1, s2, we build a matrix D sized (len(s1) + 1,
26+
# len(s2) + 1) and fill it using the following rules:
27+
#
28+
# (1) D(0, 0) = 0
29+
# (2) D(i, 0) = i, 1 <= i <= len(s1)
30+
# (3) D(0, j) = j, 1 <= j <= len(s2)
31+
# (4) D(i, j) = minimum of
32+
# D(i - 1, j - 1) + 0 if s1(i) = s2(j)
33+
# D(i - 1, j - 1) + 1 (substitution)
34+
# D(i, j - 1) + 1 (insertion)
35+
# D(i - 1, j) + 1 (deletion)
36+
#
37+
# The computed distance will be in D(len(s1), len(s2)).
38+
#
39+
# note: double brackets indicate 1-based indices below, e.g. s1[[1]]
40+
41+
42+
def _one_based(l): # makes an enumerated generator 1-based
43+
return ((i + 1, x) for i, x in l)
44+
45+
46+
def _prev_curr(l): # yields pairs of (x[i - 1], x[i]) for i in 1, 2, ...
47+
prev = None
48+
for curr in l:
49+
yield prev, curr
50+
prev = curr
51+
52+
53+
def _levenshtein_d0(s2): # compute D(0, ...)
54+
return list(range(len(s2) + 1)) # see (1), (3)
55+
56+
57+
def _levenshtein_di(c1, s2, i, d_prev, sameQ, cost): # compute one new row
58+
# given c1 = s1[i], s2, i, d_prev = D(i - 1, ...), compute D(i, ...)
59+
60+
yield i # start with D(i, 0) = i, see (2)
61+
d_curr_prev_j = i # d_curr_prev_j stores D(i, j - 1)
62+
63+
for j, c2 in _one_based(enumerate(s2)): # c2 = s2[[j]]
64+
cond = 0 if sameQ(c1, c2) else cost
65+
66+
d_curr_j = min( # see (4)
67+
d_prev[j - 1] + cond, # D(i - 1, j - 1) + cond; substitution
68+
d_curr_prev_j + 1, # D(i, j - 1) + 1; insertion
69+
d_prev[j] + 1,
70+
) # D(i - 1, j) + 1; deletion
71+
72+
yield d_curr_j
73+
d_curr_prev_j = d_curr_j
74+
75+
76+
def _levenshtein(s1, s2, sameQ: Callable[..., bool]):
    """Return the Levenshtein distance between the sequences s1 and s2.

    Only one matrix row is kept at a time: the row for position i of s1 is
    derived from the row for position i - 1, with every edit costing 1.
    sameQ decides whether two elements count as equal.
    """
    row = _levenshtein_d0(s2)
    for i, c1 in enumerate(s1, start=1):  # c1 is the i-th element, 1-based
        row = [*_levenshtein_di(c1, s2, i, row, sameQ, 1)]
    # the last cell of the last row is D(len(s1), len(s2))
    return row[-1]
81+
82+
83+
def _damerau_levenshtein(s1, s2, sameQ: Callable[..., bool]):
    """Return the edit distance between s1 and s2 with adjacent transpositions.

    Each row starts out as the plain Levenshtein row and is then relaxed by
    one extra rule (1-based indices a[i], b[j]):

        if i > 1 and j > 1 and a[i] == b[j - 1] and a[i - 1] == b[j] then
            D(i, j) = minimum(D(i, j), D(i - 2, j - 2) + transposition_cost)

    Only swaps of two neighbouring elements are covered by this rule (the
    restricted recurrence, often called optimal string alignment). Every
    operation, the transposition included, costs 1.
    """
    unit_cost = 1

    def relaxed_row(two_back, one_back, i, earlier_c1, c1):
        # two_back = D(i - 2, ...), one_back = D(i - 1, ...),
        # earlier_c1 = s1[[i - 1]], c1 = s1[[i]]; yields D(i, ...).
        plain_row = _levenshtein_di(c1, s2, i, one_back, sameQ, unit_cost)
        for j, best in enumerate(plain_row):
            # s2 is indexed 0-based here: s2[j - 2] is s2[[j - 1]] and
            # s2[j - 1] is s2[[j]].
            if (
                i > 1
                and j > 1
                and sameQ(c1, s2[j - 2])
                and sameQ(earlier_c1, s2[j - 1])
            ):
                best = min(best, two_back[j - 2] + unit_cost)
            yield best

    two_back = None  # never read while i == 1
    one_back = _levenshtein_d0(s2)
    earlier_c1 = None
    for i, c1 in enumerate(s1, start=1):
        current = list(relaxed_row(two_back, one_back, i, earlier_c1, c1))
        two_back, one_back = one_back, current
        earlier_c1 = c1

    return one_back[-1]
108+
109+
110+
def _levenshtein_like_or_border_cases(s1, s2, sameQ: Callable[..., bool], compute):
111+
if len(s1) == len(s2) and all(sameQ(c1, c2) for c1, c2 in zip(s1, s2)):
112+
return 0
113+
114+
if len(s1) < len(s2):
115+
s1, s2 = s2, s1
116+
117+
if len(s2) == 0:
118+
return len(s1)
119+
120+
return compute(s1, s2, sameQ)
121+
122+
123+
class _StringDistance(Builtin):
    # Shared evaluation rule for the edit-distance builtins. A subclass
    # provides ``_distance(s1, s2, sameQ)``, whose result is wrapped in
    # ``Integer``.

    options = {"IgnoreCase": "False"}

    def apply(self, a, b, evaluation, options):
        "%(name)s[a_, b_, OptionsPattern[%(name)s]]"
        if isinstance(a, String) and isinstance(b, String):
            py_a = a.get_string_value()
            py_b = b.get_string_value()
            if options["System`IgnoreCase"] == SymbolTrue:
                # Case-fold and NFKD-normalize each character on its own, so
                # both sequences keep exactly one entry per original
                # character. ``str.casefold`` exists on every Python 3, so
                # the former ``str.lower`` fallback for Python 2 is gone.
                def normalize(c):
                    return unicodedata.normalize("NFKD", c.casefold())

                py_a = [normalize(c) for c in py_a]
                py_b = [normalize(c) for c in py_b]
            return Integer(self._distance(py_a, py_b, lambda u, v: u == v))

        if a.get_head_name() == "System`List" and b.get_head_name() == "System`List":
            return Integer(self._distance(a.leaves, b.leaves, lambda u, v: u.sameQ(v)))

        # Neither two strings nor two lists. Returning None leaves the call
        # as it was written: it keeps the head of the builtin that was
        # actually called (this used to be rewritten to EditDistance for
        # every subclass) and it keeps any options that were passed.
        return None
147+
148+
149+
class DamerauLevenshteinDistance(_StringDistance):
    """
    <dl>
    <dt>'DamerauLevenshteinDistance[$a$, $b$]'
    <dd>returns the Damerau-Levenshtein distance of $a$ and $b$, which is defined as the minimum number of
    transpositions, insertions, deletions and substitutions needed to transform one into the other.
    In contrast to EditDistance, DamerauLevenshteinDistance counts transposition of adjacent items (e.g.
    "ab" into "ba") as one operation of change.
    </dl>

    >> DamerauLevenshteinDistance["kitten", "kitchen"]
    = 2

    >> DamerauLevenshteinDistance["abc", "ac"]
    = 1

    >> DamerauLevenshteinDistance["abc", "acb"]
    = 1

    >> DamerauLevenshteinDistance["azbc", "abxyc"]
    = 3

    The IgnoreCase option makes DamerauLevenshteinDistance ignore the case of letters:
    >> DamerauLevenshteinDistance["time", "Thyme"]
    = 3

    >> DamerauLevenshteinDistance["time", "Thyme", IgnoreCase -> True]
    = 2

    DamerauLevenshteinDistance also works on lists:
    >> DamerauLevenshteinDistance[{1, E, 2, Pi}, {1, E, Pi, 2}]
    = 1
    """

    # Hook called by _StringDistance.apply with two sequences and an element
    # comparison function. Trivial inputs are answered by
    # _levenshtein_like_or_border_cases; everything else is computed by
    # _damerau_levenshtein.
    def _distance(self, s1, s2, sameQ: Callable[..., bool]):
        return _levenshtein_like_or_border_cases(s1, s2, sameQ, _damerau_levenshtein)
185+
186+
187+
class EditDistance(_StringDistance):
    """
    <dl>
    <dt>'EditDistance[$a$, $b$]'
    <dd>returns the Levenshtein distance of $a$ and $b$, which is defined as the minimum number of
    insertions, deletions and substitutions on the constituents of $a$ and $b$ needed to transform
    one into the other.
    </dl>

    >> EditDistance["kitten", "kitchen"]
    = 2

    >> EditDistance["abc", "ac"]
    = 1

    >> EditDistance["abc", "acb"]
    = 2

    >> EditDistance["azbc", "abxyc"]
    = 3

    The IgnoreCase option makes EditDistance ignore the case of letters:
    >> EditDistance["time", "Thyme"]
    = 3

    >> EditDistance["time", "Thyme", IgnoreCase -> True]
    = 2

    EditDistance also works on lists:
    >> EditDistance[{1, E, 2, Pi}, {1, E, Pi, 2}]
    = 2
    """

    # Hook called by _StringDistance.apply with two sequences and an element
    # comparison function. Trivial inputs are answered by
    # _levenshtein_like_or_border_cases; everything else is computed by
    # _levenshtein.
    def _distance(self, s1, s2, sameQ: Callable[..., bool]):
        return _levenshtein_like_or_border_cases(s1, s2, sameQ, _levenshtein)
222+
223+
224+
class HammingDistance(Builtin):
    """
    <dl>
    <dt>'HammingDistance[$u$, $v$]'
    <dd>returns the Hamming distance between $u$ and $v$, i.e. the number of different elements.
    $u$ and $v$ may be lists or strings.
    </dl>

    >> HammingDistance[{1, 0, 1, 0}, {1, 0, 0, 1}]
    = 2

    >> HammingDistance["time", "dime"]
    = 1

    >> HammingDistance["TIME", "dime", IgnoreCase -> True]
    = 1
    """

    messages = {"idim": "`1` and `2` must be of same length."}

    options = {"IgnoreCase": "False"}

    @staticmethod
    def _compute(u, v, sameQ, evaluation):
        # Count the positions at which u and v differ according to sameQ.
        # Sequences of unequal length issue the "idim" message and give None.
        if len(u) == len(v):
            mismatches = sum(1 for x, y in zip(u, v) if not sameQ(x, y))
            return Integer(mismatches)

        evaluation.message("HammingDistance", "idim", u, v)
        return None

    def apply_list(self, u, v, evaluation):
        "HammingDistance[u_List, v_List]"

        def same_element(x, y):
            return x.sameQ(y)

        return HammingDistance._compute(u.leaves, v.leaves, same_element, evaluation)

    def apply_string(self, u, v, evaluation, options):
        "HammingDistance[u_String, v_String, OptionsPattern[HammingDistance]]"
        ignore_case = self.get_option(options, "IgnoreCase", evaluation)
        py_u, py_v = u.get_string_value(), v.get_string_value()
        if ignore_case and ignore_case.is_true():
            # compare the lower-cased strings when IgnoreCase is set
            py_u, py_v = py_u.lower(), py_v.lower()

        def same_character(x, y):
            return x == y

        return HammingDistance._compute(py_u, py_v, same_character, evaluation)

mathics/builtin/string/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""
2+
Strings and Characters
3+
4+
"""
5+
6+
from mathics.version import __version__ # noqa used in loading to check consistency.

0 commit comments

Comments
 (0)