Skip to content

Commit 05bee75

Browse files
committed
Add get_emoji_regex function
1 parent e1e43c1 commit 05bee75

3 files changed

Lines changed: 38 additions & 15 deletions

File tree

emoji_data_python/__init__.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,24 +4,25 @@
44

55
from .conversion import unified_to_char
66
from .emoji_char import EmojiChar
7-
from .replacement import replace_colons
7+
from .replacement import replace_colons, get_emoji_regex
88
from .search import all_doublebyte, find_by_shortname, find_by_name
99

1010
# Read json data on module load to be cached
1111
with open(path.join(path.dirname(__file__), 'data/emoji.json'), 'r') as full_data:
1212
# Load and parse emoji data from json into EmojiChar objects
1313
emoji_data = [EmojiChar(data_blob) for data_blob in json.loads(full_data.read())] # type: List[EmojiChar]
1414

15-
# Build a cached dictionary of short names for quicker access, short code keys are normalized with underscores
16-
emoji_short_names = {
17-
emoji.short_name.replace('-', '_'): emoji for emoji in emoji_data
18-
} # type: Dict[str, EmojiChar]
15+
# Build a cached dictionary of short names for quicker access, short code keys are normalized with underscores
16+
emoji_short_names = {
17+
emoji.short_name.replace('-', '_'): emoji for emoji in emoji_data
18+
} # type: Dict[str, EmojiChar]
1919

20-
# Add other short names if they are not already used as a primary short name for another emoji
21-
for emoji in emoji_data:
22-
for short_name in emoji.short_names:
23-
if short_name not in emoji_short_names:
24-
emoji_short_names[short_name] = emoji
20+
# Add other short names if they are not already used as a primary short name for another emoji
21+
for emoji in emoji_data:
22+
for short_name in emoji.short_names:
23+
if short_name not in emoji_short_names:
24+
emoji_short_names[short_name] = emoji
2525

2626

27-
__all__ = ['unified_to_char', 'EmojiChar', 'replace_colons', 'all_doublebyte', 'find_by_shortname', 'find_by_name', 'emoji_data', 'emoji_short_names']
27+
__all__ = ['unified_to_char', 'EmojiChar', 'replace_colons', 'get_emoji_regex',
28+
'all_doublebyte', 'find_by_shortname', 'find_by_name', 'emoji_data', 'emoji_short_names']

emoji_data_python/replacement.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
from re import sub
1+
import re
22

33

44
def replace_colons(text: str, strip: bool=False) -> str:
55
"""Parses a string with colon encoded emoji and renders found emoji.
66
Unknown emoji are left as is unless `strip` is set to `True`
77
88
:param text: String of text to parse and replace
9-
:param strip: Whether to strip unknown codes or to leave them as `:unkown:`
9+
:param strip: Whether to strip unknown codes or to leave them as `:unknown:`
1010
1111
>>> emoji_data_python.replace_colons('Hello world ! :wave::skin-tone-3: :earth_africa: :exclamation:')
1212
'Hello world ! 👋🏼 🌍 ❗'
@@ -27,4 +27,19 @@ def emoji_repl(matchobj) -> str:
2727

2828
return res
2929

30-
return sub(r'\:[a-zA-Z0-9-_+]+\:(\:skin-tone-[2-6]\:)?', emoji_repl, text)
30+
return re.sub(r'\:[a-zA-Z0-9-_+]+\:(\:skin-tone-[2-6]\:)?', emoji_repl, text)
31+
32+
33+
def get_emoji_regex():
34+
"""Returns a regex to match any emoji
35+
36+
>>> emoji_data_python.get_emoji_regex().findall('Hello world ! 👋🏼 🌍 ❗')
37+
['👋', '🏼', '🌍', '❗']
38+
"""
39+
from emoji_data_python import emoji_data
40+
# Sort emojis by length to make sure multi-character emojis are
41+
# matched first
42+
43+
emojis = sorted([emoji.char for emoji in emoji_data], key=len, reverse=True)
44+
pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
45+
return re.compile(pattern)

tests/test_replacement.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unittest
22

3-
from emoji_data_python import replace_colons
3+
from emoji_data_python import replace_colons, get_emoji_regex
44

55

66
class ReplaceColonsTestCase(unittest.TestCase):
@@ -40,3 +40,10 @@ def test_multiline_sentence(self):
4040
How are you :question:""")
4141
)
4242

43+
def test_emoji_regex(self):
44+
regex = get_emoji_regex()
45+
self.assertRegex('😄', regex)
46+
self.assertRegex('👪', regex)
47+
self.assertNotRegex('hello :wave: l → ▶', regex)
48+
res = regex.findall('💩💩 🏼 bla 👋🏼')
49+
self.assertEqual(len(res), 5) # Wave + skin tone counts as two

0 commit comments

Comments
 (0)