Skip to content

Commit 62a9a64

Browse files
feat: remove go-away usage and use in-house profanity filter (#20)
* feat: remove go-away and use in-house profanity filter * chore: add tests for filter trie * feat: add multi character sanitization and log transforming errors
1 parent 24901bf commit 62a9a64

5 files changed

Lines changed: 664 additions & 825 deletions

File tree

internal/pkg/text/filter_trie.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
package text
2+
3+
import "strings"
4+
5+
// negative describes a context in which a stored word must not be reported:
// the word is ignored when it is immediately preceded by Prefix and
// immediately followed by Suffix (either may be empty).
type negative struct {
	Prefix string
	Suffix string
}

// matches reports whether text[start:end] is directly surrounded by the
// negative's Prefix and Suffix. start and end are byte offsets into text.
func (n negative) matches(text string, start int, end int) bool {
	// HasSuffix/HasPrefix return false when the surrounding text is too
	// short, so no explicit bounds checks are needed.
	return strings.HasSuffix(text[:start], n.Prefix) && strings.HasPrefix(text[end:], n.Suffix)
}

// node is a single trie node. Children is keyed by rune, IsEnd marks the end
// of a stored word and Negatives lists the contexts in which that word is
// not to be reported.
type node struct {
	Children  map[rune]*node
	Negatives []negative
	IsEnd     bool
}

// newNode returns an empty node with an initialized Children map.
func newNode() *node {
	return &node{Children: make(map[rune]*node)}
}

// accepts reports whether a word ending at this node, found at
// text[start:end], should be reported: the node must end a stored word and
// none of its negatives may match the surrounding text.
func (n *node) accepts(text string, start int, end int) bool {
	if !n.IsEnd {
		return false
	}
	for _, neg := range n.Negatives {
		if neg.matches(text, start, end) {
			return false
		}
	}
	return true
}

// FilterTrie is a rune-keyed prefix tree of lower-cased words. The zero value
// is an empty trie that is ready to use.
type FilterTrie struct {
	root *node
}

// Put stores word (lower-cased) in the trie. Each entry of negatives is a
// longer string containing word (for example "horseshit" for "shit"); the
// text around the first occurrence of word inside it becomes a context in
// which word is not reported. Negatives that do not contain word are skipped.
func (f *FilterTrie) Put(word string, negatives ...string) {
	word = strings.ToLower(word)

	if f.root == nil {
		f.root = newNode()
	}

	current := f.root
	for _, char := range word {
		next := current.Children[char]
		if next == nil {
			next = newNode()
			current.Children[char] = next
		}
		current = next
	}
	current.IsEnd = true

	for _, neg := range negatives {
		neg = strings.ToLower(neg)
		at := strings.Index(neg, word)
		if at < 0 {
			// A negative that does not contain the word cannot describe a
			// context for it; slicing with -1 would panic, so ignore it.
			continue
		}
		current.Negatives = append(current.Negatives, negative{
			Prefix: neg[:at],
			Suffix: neg[at+len(word):],
		})
	}
}

// Test scans text from left to right and returns a pointer to the first
// stored word found in it, or nil when there is none (including when the trie
// is empty). text is compared as-is; callers are expected to lower-case it.
func (f *FilterTrie) Test(text string) *string {
	if f == nil || f.root == nil {
		return nil
	}

	// Ranging over the string yields the byte offset of every rune start, so
	// matching never begins in the middle of a multi-byte character.
	for i := range text {
		if matched := f.testAt(text, i); matched != nil {
			return matched
		}
	}
	return nil
}

// testAt walks the trie along text starting at byte offset index and returns
// the shortest stored word that starts there and is not excluded by one of
// its negatives, or nil. When a word is excluded, the walk continues so that
// a longer stored word sharing the same prefix can still be found.
func (f *FilterTrie) testAt(text string, index int) *string {
	if f.root == nil || index < 0 || index > len(text) {
		return nil
	}

	current := f.root
	for offset, char := range text[index:] {
		// offset is where this rune starts, i.e. where the previously
		// consumed rune ended, which gives the end of the candidate word
		// without having to compute rune widths. offset > 0 keeps the root
		// itself from ever being treated as a word.
		if offset > 0 && current.accepts(text, index, index+offset) {
			matched := text[index : index+offset]
			return &matched
		}

		current = current.Children[char]
		if current == nil {
			return nil
		}
	}

	// The text ended while still inside the trie: check the last node.
	if current != f.root && current.accepts(text, index, len(text)) {
		matched := text[index:]
		return &matched
	}
	return nil
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package text
2+
3+
import "testing"
4+
5+
func getTestTrie() *FilterTrie {
6+
trie := &FilterTrie{}
7+
trie.Put("shit", "horseshit", "bullshit")
8+
trie.Put("fuck")
9+
return trie
10+
}
11+
12+
func TestPositiveProfanities(t *testing.T) {
13+
trie := getTestTrie()
14+
texts := map[string]string{
15+
"this shit": "shit",
16+
"fuck this": "fuck",
17+
"hello world": "",
18+
}
19+
20+
for text, expected := range texts {
21+
result := trie.Test(text)
22+
if result == nil && len(expected) != 0 {
23+
t.Errorf("Expected '%s' but got nil from '%s'", expected, text)
24+
} else if result != nil && expected != *result {
25+
t.Errorf("Expected '%s' but got '%s' from '%s'", expected, *result, text)
26+
}
27+
}
28+
}
29+
30+
func TestNegativeProfanities(t *testing.T) {
31+
trie := getTestTrie()
32+
texts := map[string]string{
33+
"this horseshit": "",
34+
"fuck this bullshit": "fuck",
35+
"hello world": "",
36+
}
37+
38+
for text, expected := range texts {
39+
result := trie.Test(text)
40+
if result == nil && len(expected) != 0 {
41+
t.Errorf("Expected '%s' but got nil from '%s'", expected, text)
42+
} else if result != nil && expected != *result {
43+
t.Errorf("Expected '%s' but got '%s' from '%s'", expected, *result, text)
44+
}
45+
}
46+
}
47+
48+
// TestLongText runs the trie from getTestTrie over a long passage and fails
// if anything in it is reported as a match.
func TestLongText(t *testing.T) {
49+
trie := getTestTrie()
50+
// an excerpt from romeo and juliet, copyright: public domain
51+
text := `
52+
PRINCE.
53+
Rebellious subjects, enemies to peace,
54+
Profaners of this neighbour-stained steel,—
55+
Will they not hear? What, ho! You men, you beasts,
56+
That quench the fire of your pernicious rage
57+
With purple fountains issuing from your veins,
58+
On pain of torture, from those bloody hands
59+
Throw your mistemper’d weapons to the ground
60+
And hear the sentence of your moved prince.
61+
Three civil brawls, bred of an airy word,
62+
By thee, old Capulet, and Montague,
63+
Have thrice disturb’d the quiet of our streets,
64+
And made Verona’s ancient citizens
65+
Cast by their grave beseeming ornaments,
66+
To wield old partisans, in hands as old,
67+
Canker’d with peace, to part your canker’d hate.
68+
If ever you disturb our streets again,
69+
Your lives shall pay the forfeit of the peace.
70+
For this time all the rest depart away:
71+
You, Capulet, shall go along with me,
72+
And Montague, come you this afternoon,
73+
To know our farther pleasure in this case,
74+
To old Free-town, our common judgement-place.
75+
Once more, on pain of death, all men depart.
76+
77+
[_Exeunt Prince and Attendants; Capulet, Lady Capulet, Tybalt,
78+
Citizens and Servants._]
79+
80+
MONTAGUE.
81+
Who set this ancient quarrel new abroach?
82+
Speak, nephew, were you by when it began?
83+
84+
BENVOLIO.
85+
Here were the servants of your adversary
86+
And yours, close fighting ere I did approach.
87+
I drew to part them, in the instant came
88+
The fiery Tybalt, with his sword prepar’d,
89+
Which, as he breath’d defiance to my ears,
90+
He swung about his head, and cut the winds,
91+
Who nothing hurt withal, hiss’d him in scorn.
92+
While we were interchanging thrusts and blows
93+
Came more and more, and fought on part and part,
94+
Till the Prince came, who parted either part.
95+
96+
LADY MONTAGUE.
97+
O where is Romeo, saw you him today?
98+
Right glad I am he was not at this fray.
99+
100+
BENVOLIO.
101+
Madam, an hour before the worshipp’d sun
102+
Peer’d forth the golden window of the east,
103+
A troubled mind drave me to walk abroad,
104+
Where underneath the grove of sycamore
105+
That westward rooteth from this city side,
106+
So early walking did I see your son.
107+
Towards him I made, but he was ware of me,
108+
And stole into the covert of the wood.
109+
I, measuring his affections by my own,
110+
Which then most sought where most might not be found,
111+
Being one too many by my weary self,
112+
Pursu’d my humour, not pursuing his,
113+
And gladly shunn’d who gladly fled from me.
114+
115+
MONTAGUE.
116+
Many a morning hath he there been seen,
117+
With tears augmenting the fresh morning’s dew,
118+
Adding to clouds more clouds with his deep sighs;
119+
But all so soon as the all-cheering sun
120+
Should in the farthest east begin to draw
121+
The shady curtains from Aurora’s bed,
122+
Away from light steals home my heavy son,
123+
And private in his chamber pens himself,
124+
Shuts up his windows, locks fair daylight out
125+
And makes himself an artificial night.
126+
Black and portentous must this humour prove,
127+
Unless good counsel may the cause remove.
128+
129+
BENVOLIO.
130+
My noble uncle, do you know the cause?
131+
132+
MONTAGUE.
133+
I neither know it nor can learn of him.
134+
135+
BENVOLIO.
136+
Have you importun’d him by any means?
137+
138+
MONTAGUE.
139+
Both by myself and many other friends;
140+
But he, his own affections’ counsellor,
141+
Is to himself—I will not say how true—
142+
But to himself so secret and so close,
143+
So far from sounding and discovery,
144+
As is the bud bit with an envious worm
145+
Ere he can spread his sweet leaves to the air,
146+
Or dedicate his beauty to the sun.
147+
Could we but learn from whence his sorrows grow,
148+
We would as willingly give cure as know.
149+
`
150+
151+
// The test expects no match at all, so any non-nil result is a failure.
result := trie.Test(text)
152+
if result != nil {
153+
t.Errorf("Expected no profanity but got '%s'", *result)
154+
}
155+
}

internal/pkg/text/static.go

Lines changed: 98 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,114 @@ package text
22

33
import (
44
"context"
5-
6-
goaway "github.com/TwiN/go-away"
5+
"go.uber.org/zap"
6+
"golang.org/x/text/runes"
7+
"golang.org/x/text/transform"
8+
"golang.org/x/text/unicode/norm"
9+
"strings"
10+
"unicode"
711
)
812

13+
var (
	// charactersToReplace maps a single look-alike character (digit or
	// symbol) to the lower-case letter that sanitize writes in its place.
	// Entries are grouped by the letter they produce.
	charactersToReplace = map[rune]rune{
		// a
		'4': 'a',
		'@': 'a',
		// b
		'8': 'b',
		// c
		'(': 'c',
		'{': 'c',
		'[': 'c',
		'<': 'c',
		'¢': 'c',
		// e
		'3': 'e',
		'£': 'e',
		'€': 'e',
		// g
		'9': 'g',
		// i
		'1': 'i',
		'!': 'i',
		'|': 'i',
		// o
		'0': 'o',
		// s
		'5': 's',
		'$': 's',
		// t
		'7': 't',
		'+': 't',
		// y
		'¥': 'y',
	}

	// multiCharactersToReplace maps a two-character sequence to a single
	// letter: the outer key is the first character, the inner key the second.
	// sanitize consults it before charactersToReplace, so "()" becomes "o"
	// rather than "c" followed by nothing.
	multiCharactersToReplace = map[rune]map[rune]rune{
		'(': {')': 'o'},
		'[': {']': 'o'},
		'{': {'}': 'o'},
		'<': {'>': 'o'},
	}
)
50+
951
// Compile-time check that *StaticFilter satisfies the Filter interface.
var _ Filter = &StaticFilter{}
1052

1153
// StaticFilter is the Filter implementation that matches text against a
// word list held in memory.
type StaticFilter struct {
12-
detector *goaway.ProfanityDetector
54+
trie FilterTrie
1355
}
1456

1557
// NewStaticFilter returns a StaticFilter populated from the package-level
// profanities table (declared outside this view).
func NewStaticFilter() *StaticFilter {
16-
detector := goaway.NewProfanityDetector().
17-
WithCustomDictionary(profanities, falsePositives, falseNegatives)
58+
trie := FilterTrie{}
59+
// Each key is a word; its value is passed to Put as that word's negatives.
for word, negatives := range profanities {
60+
trie.Put(word, negatives...)
61+
}
1862

19-
return &StaticFilter{detector}
63+
return &StaticFilter{trie: trie}
2064
}
2165

22-
func (s *StaticFilter) Test(ctx context.Context, text string) (result Result) {
23-
result.Engine = "static"
24-
result.MatchedText = s.detector.ExtractProfanity(text)
25-
result.Matched = len(result.MatchedText) != 0
66+
// Test sanitizes text, looks the sanitized form up in the trie and reports
// the outcome in result: Matched says whether a word was found and
// MatchedText holds it. The context argument is not used.
func (c StaticFilter) Test(_ context.Context, text string) (result Result) {
67+
sanitized := sanitize(text)
68+
matched := c.trie.Test(sanitized)
69+
70+
result.Engine = "custom"
71+
result.Matched = matched != nil
72+
// MatchedText keeps its zero value when nothing matched.
if matched != nil {
73+
result.MatchedText = *matched
74+
}
2675
return
2776
}
77+
78+
func sanitize(text string) string {
79+
// If transforming fails it's not the end of the world, we can just use the original text
80+
transformed, _, err := transform.String(
81+
transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC),
82+
text,
83+
)
84+
if err != nil {
85+
zap.S().Errorf("failed to sanitize text: %v", err)
86+
transformed = text
87+
}
88+
89+
builder := strings.Builder{}
90+
for i := 0; i < len(transformed); i++ {
91+
char := rune(transformed[i])
92+
93+
multiReplacement, isPartOfMulti := multiCharactersToReplace[char]
94+
if isPartOfMulti && i+1 < len(transformed) {
95+
nextChar := rune(transformed[i+1])
96+
97+
if replacement, ok := multiReplacement[nextChar]; ok {
98+
builder.WriteRune(replacement)
99+
i++ // Skip the next character since it's part of the multi-character replacement
100+
continue
101+
}
102+
}
103+
104+
if replacement, ok := charactersToReplace[char]; ok {
105+
builder.WriteRune(replacement)
106+
} else if char >= 'A' && char <= 'Z' {
107+
builder.WriteRune(char + ('a' - 'A'))
108+
} else if char >= 'a' && char <= 'z' {
109+
builder.WriteRune(char)
110+
} else if char >= '0' && char <= '9' {
111+
builder.WriteRune(char)
112+
}
113+
}
114+
return builder.String()
115+
}

0 commit comments

Comments
 (0)