Skip to content

Commit be00058

Browse files
committed
Further discountsParser improvements
* the phonenumber package now handles checking whether a line is a phone number
* removed the containsPhonePattern() and isNumericPhone() functions, as they are no longer needed
* the extractEmail() function now uses a regex to extract emails instead of for loops
* discountsParser_test.go updated to remove tests that are no longer needed
1 parent 80247db commit be00058

2 files changed

Lines changed: 16 additions & 116 deletions

File tree

parser/discountsParser.go

Lines changed: 16 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"html"
66
"log"
77
"os"
8+
"regexp"
89
"slices"
910
"strings"
1011

@@ -135,10 +136,13 @@ func parseDiscountItem(s *goquery.Selection, category string) *schema.DiscountPr
135136
}
136137

137138
// Check if it's a phone number
138-
if containsPhonePattern(line) || isNumericPhone(line) {
139-
// assumes that all phone numbers here are US phone numbers
140-
discount.Phone = phonenumber.Parse(line, "US")
139+
// phonenumber.Parse returns "" if not parsable as a phone number
140+
// assumes that all phone numbers here are US phone numbers
141+
parsed := phonenumber.Parse(line, "US")
142+
if parsed != "" {
143+
discount.Phone = parsed
141144
}
145+
142146
}
143147

144148
var addresses = []string{}
@@ -191,17 +195,6 @@ func stripHTMLTags(s string) string {
191195
return s
192196
}
193197

194-
// isNumericPhone checks if a string is mostly numeric (like a phone number)
195-
func isNumericPhone(s string) bool {
196-
digitCount := 0
197-
for _, c := range s {
198-
if c >= '0' && c <= '9' {
199-
digitCount++
200-
}
201-
}
202-
return digitCount >= 7 && len(s) <= 20
203-
}
204-
205198
// isValidDiscount checks if a discount entry has meaningful data
206199
func isValidDiscount(d *schema.DiscountProgram) bool {
207200
// Must have a business name
@@ -223,31 +216,18 @@ func isValidDiscount(d *schema.DiscountProgram) bool {
223216
return hasContent
224217
}
225218

226-
// containsPhonePattern checks if a string contains phone number patterns
227-
func containsPhonePattern(s string) bool {
228-
// Simple check for phone number patterns like XXX-XXX-XXXX or (XXX) XXX-XXXX
229-
return strings.Count(s, "-") >= 2 || (strings.Contains(s, "(") && strings.Contains(s, ")"))
230-
}
231-
232-
// extractEmail extracts email from text
219+
// extractEmail uses regex to extract email addresses from text
233220
func extractEmail(text string) string {
234-
text = strings.TrimSpace(text)
235-
236-
// Find @ symbol and extract email
237-
if idx := strings.Index(text, "@"); idx != -1 {
238-
// Find start and end of email
239-
start := idx
240-
for start > 0 && !strings.ContainsAny(string(text[start-1]), " \t\n\r,;") {
241-
start--
242-
}
243-
end := idx
244-
for end < len(text) && !strings.ContainsAny(string(text[end]), " \t\n\r,;") {
245-
end++
246-
}
247-
return text[start:end]
221+
const emailRegexPattern = `[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,4}`
222+
var emailRegex = regexp.MustCompile(emailRegexPattern)
223+
224+
email := emailRegex.FindString(text)
225+
226+
if email == "" {
227+
return "No email here"
248228
}
249229

250-
return text
230+
return email
251231
}
252232

253233
// trimAfter returns the substring after the first occurrence of sep

parser/discountsParser_test.go

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -254,86 +254,6 @@ func TestCleanText(t *testing.T) {
254254
}
255255
}
256256

257-
// TestContainsPhonePattern tests phone number pattern detection
258-
func TestContainsPhonePattern(t *testing.T) {
259-
t.Parallel()
260-
261-
testCases := map[string]struct {
262-
input string
263-
expected bool
264-
}{
265-
"standard": {
266-
input: "972-214-5510",
267-
expected: true,
268-
},
269-
"parentheses": {
270-
input: "(972) 214-5510",
271-
expected: true,
272-
},
273-
"not_phone": {
274-
input: "Hello World",
275-
expected: false,
276-
},
277-
"single_dash": {
278-
input: "Test-Name",
279-
expected: false,
280-
},
281-
}
282-
283-
for name, tc := range testCases {
284-
t.Run(name, func(t *testing.T) {
285-
t.Parallel()
286-
287-
result := containsPhonePattern(tc.input)
288-
if result != tc.expected {
289-
t.Errorf("containsPhonePattern(%q) = %v, expected %v", tc.input, result, tc.expected)
290-
}
291-
})
292-
}
293-
}
294-
295-
// TestIsNumericPhone tests numeric phone detection
296-
func TestIsNumericPhone(t *testing.T) {
297-
t.Parallel()
298-
299-
testCases := map[string]struct {
300-
input string
301-
expected bool
302-
}{
303-
"numeric_phone": {
304-
input: "9722145510",
305-
expected: true,
306-
},
307-
"with_spaces": {
308-
input: "972 214 5510",
309-
expected: true,
310-
},
311-
"too_short": {
312-
input: "12345",
313-
expected: false,
314-
},
315-
"too_long": {
316-
input: "123456789012345678901",
317-
expected: false,
318-
},
319-
"not_numeric": {
320-
input: "Hello World",
321-
expected: false,
322-
},
323-
}
324-
325-
for name, tc := range testCases {
326-
t.Run(name, func(t *testing.T) {
327-
t.Parallel()
328-
329-
result := isNumericPhone(tc.input)
330-
if result != tc.expected {
331-
t.Errorf("isNumericPhone(%q) = %v, expected %v", tc.input, result, tc.expected)
332-
}
333-
})
334-
}
335-
}
336-
337257
// TestExtractEmail tests email extraction from text
338258
func TestExtractEmail(t *testing.T) {
339259
t.Parallel()

0 commit comments

Comments
 (0)