Skip to content

Commit 58f0849

Browse files
author
Rustem
committed
Add comprehensive unit tests
Added 24 unit tests covering:
- HTML cleaning (script/style removal, tag stripping, entity unescaping)
- Text fingerprinting (consistency, case insensitivity, MD5 format)
- Duplicate detection (fingerprint matching, case insensitivity)
- End-to-end pipeline (file handling, URL filtering, stats reporting)

All tests pass successfully.
1 parent 2ff437a commit 58f0849

2 files changed

Lines changed: 269 additions & 0 deletions

File tree

tests/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Tests package

tests/test_core.py

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Unit tests for textnano core functionality
4+
"""
5+
6+
import unittest
7+
import tempfile
8+
import os
9+
from pathlib import Path
10+
11+
from textnano.core import (
12+
clean_html,
13+
text_fingerprint,
14+
is_duplicate,
15+
download_and_clean
16+
)
17+
18+
19+
class TestCleanHTML(unittest.TestCase):
    """Tests for ``clean_html``: tag removal, entity unescaping, whitespace."""

    def test_remove_script_tags(self):
        """Script elements and their contents are dropped; body text stays."""
        html = '<html><script>alert("test")</script><p>Content</p></html>'
        result = clean_html(html)
        self.assertNotIn('script', result)
        self.assertNotIn('alert', result)
        self.assertIn('Content', result)

    def test_remove_style_tags(self):
        """Style elements and their CSS are dropped; body text stays."""
        html = '<html><style>body { color: red; }</style><p>Content</p></html>'
        result = clean_html(html)
        self.assertNotIn('style', result)
        self.assertNotIn('color', result)
        self.assertIn('Content', result)

    def test_remove_html_tags(self):
        """No angle brackets survive, but the text between tags does."""
        html = '<div><p>Hello <strong>World</strong></p></div>'
        result = clean_html(html)
        self.assertNotIn('<', result)
        self.assertNotIn('>', result)
        self.assertIn('Hello', result)
        self.assertIn('World', result)

    def test_unescape_html_entities(self):
        """Named entities are replaced by the characters they stand for."""
        html = '<p>&amp; &quot;quotes&quot; &copy; 2024</p>'
        result = clean_html(html)
        # assertIn('&', ...) alone would also pass on a still-escaped
        # '&amp;', so first make sure the escaped forms are really gone.
        self.assertNotIn('&amp;', result)
        self.assertNotIn('&quot;', result)
        self.assertNotIn('&copy;', result)
        self.assertIn('&', result)
        self.assertIn('"quotes"', result)
        self.assertIn('©', result)

    def test_normalize_whitespace(self):
        """Runs of spaces inside the text come back as single spaces."""
        # The input must contain repeated spaces: with single spaces the
        # assertion below holds even if nothing is normalized.
        # NOTE(review): assumes clean_html collapses space runs to one
        # space, as the expected value implies -- confirm against core.
        html = '<p>Too    many     spaces</p>'
        result = clean_html(html)
        self.assertEqual('Too many spaces', result.strip())

    def test_empty_html(self):
        """Markup with no text content yields the empty string."""
        html = '<html><body></body></html>'
        result = clean_html(html)
        self.assertEqual('', result)

    def test_nested_script_tags(self):
        """A script nested inside other elements is removed as well."""
        html = '<html><body><div><script>var x = 1;</script>Text</div></body></html>'
        result = clean_html(html)
        self.assertNotIn('script', result)
        self.assertNotIn('var x', result)
        self.assertIn('Text', result)
74+
75+
76+
class TestTextFingerprint(unittest.TestCase):
    """Checks on ``text_fingerprint``: stability, sensitivity and format."""

    def test_same_text_same_fingerprint(self):
        """Fingerprinting one string twice gives equal values."""
        sample = "This is a test sentence with some words"
        first = text_fingerprint(sample)
        second = text_fingerprint(sample)
        self.assertEqual(first, second)

    def test_different_text_different_fingerprint(self):
        """Two distinct sentences get distinct fingerprints."""
        first = text_fingerprint("This is the first text")
        second = text_fingerprint("This is the second text")
        self.assertNotEqual(first, second)

    def test_case_insensitive(self):
        """Inputs differing only in letter case share a fingerprint."""
        mixed_case = text_fingerprint("Hello World Test")
        lower_case = text_fingerprint("hello world test")
        self.assertEqual(mixed_case, lower_case)

    def test_custom_n_words(self):
        """The ``n`` keyword is accepted and a value comes back for each n."""
        words = "one two three four five six seven eight nine ten"
        # Only checks that a result is produced; whether the two values
        # differ depends on how the first n words are used.
        results = [text_fingerprint(words, n=size) for size in (3, 5)]
        for value in results:
            self.assertIsNotNone(value)

    def test_short_text(self):
        """Text with fewer than ``n`` words still yields a string."""
        digest = text_fingerprint("short", n=10)
        self.assertIsNotNone(digest)
        self.assertIsInstance(digest, str)

    def test_fingerprint_format(self):
        """The fingerprint is 32 lowercase hex characters (MD5-shaped)."""
        digest = text_fingerprint("test text")
        self.assertEqual(len(digest), 32)  # MD5 hash length
        hex_alphabet = set('0123456789abcdef')
        self.assertTrue(set(digest) <= hex_alphabet)
125+
126+
127+
class TestIsDuplicate(unittest.TestCase):
    """Checks on ``is_duplicate`` and the ``seen`` set it is handed."""

    def test_first_text_not_duplicate(self):
        """A text checked against an empty set is new and gets recorded."""
        registry = set()
        verdict = is_duplicate("This is a unique text", registry)
        self.assertFalse(verdict)
        self.assertEqual(len(registry), 1)

    def test_same_text_is_duplicate(self):
        """Checking the same text a second time reports a duplicate."""
        registry = set()
        sample = "This is a test text"
        is_duplicate(sample, registry)  # first sighting
        verdict = is_duplicate(sample, registry)  # second sighting
        self.assertTrue(verdict)

    def test_different_texts_not_duplicate(self):
        """Two distinct texts are both new and leave two entries behind."""
        registry = set()
        verdicts = [
            is_duplicate(sample, registry)
            for sample in ("First unique text", "Second unique text")
        ]
        for verdict in verdicts:
            self.assertFalse(verdict)
        self.assertEqual(len(registry), 2)

    def test_case_insensitive_duplicate(self):
        """A text differing only in letter case counts as a duplicate."""
        registry = set()
        is_duplicate("Hello World Test", registry)
        verdict = is_duplicate("hello world test", registry)
        self.assertTrue(verdict)
165+
166+
167+
class TestDownloadAndClean(unittest.TestCase):
    """End-to-end checks on ``download_and_clean`` using a temp directory.

    NOTE(review): most tests pass real URLs to ``download_and_clean``;
    presumably it tries to fetch them, so results may depend on network
    access -- confirm against the implementation.
    """

    def setUp(self):
        """Create a scratch directory and derive the input/output paths."""
        self.temp_dir = tempfile.mkdtemp()
        self.urls_file = os.path.join(self.temp_dir, 'urls.txt')
        self.output_dir = os.path.join(self.temp_dir, 'output')

    def tearDown(self):
        """Remove the scratch directory, ignoring removal errors."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _write_urls(self, *urls):
        """Write ``urls`` to the URL file, one per line (none -> empty file)."""
        with open(self.urls_file, 'w') as handle:
            handle.write(''.join(f'{url}\n' for url in urls))

    def test_empty_url_file(self):
        """An empty URL file produces zero successes and zero failures."""
        self._write_urls()

        stats = download_and_clean(self.urls_file, self.output_dir)

        self.assertEqual(stats['success'], 0)
        self.assertEqual(stats['failed'], 0)

    def test_output_directory_created(self):
        """The output directory exists after a run."""
        self._write_urls('https://example.com')

        download_and_clean(self.urls_file, self.output_dir)

        self.assertTrue(os.path.exists(self.output_dir))

    def test_log_files_created(self):
        """Both ``success.txt`` and ``failed.txt`` exist after a run."""
        self._write_urls('https://example.com')

        download_and_clean(self.urls_file, self.output_dir)

        for log_name in ('success.txt', 'failed.txt'):
            log_path = os.path.join(self.output_dir, log_name)
            self.assertTrue(os.path.exists(log_path))

    def test_max_urls_limit(self):
        """With ``max_urls=3`` at most three URLs are accounted for."""
        self._write_urls(*(f'https://example{i}.com' for i in range(10)))

        stats = download_and_clean(self.urls_file, self.output_dir, max_urls=3)

        # Should only attempt 3 URLs
        attempted = stats['success'] + stats['failed'] + stats['excluded']
        self.assertLessEqual(attempted, 3)

    def test_domain_exclusion(self):
        """A URL on an excluded domain is counted under ``excluded``."""
        self._write_urls('https://twitter.com/test', 'https://example.com')

        options = {
            'exclude_domains': ['twitter.com'],
            'use_default_excludes': False,
        }
        stats = download_and_clean(self.urls_file, self.output_dir, **options)

        self.assertGreater(stats['excluded'], 0)

    def test_extension_exclusion(self):
        """A URL with an excluded extension is counted under ``excluded``."""
        self._write_urls(
            'https://example.com/file.pdf',
            'https://example.com/page',
        )

        options = {
            'exclude_extensions': ['pdf'],
            'use_default_excludes': False,
        }
        stats = download_and_clean(self.urls_file, self.output_dir, **options)

        self.assertGreater(stats['excluded'], 0)

    def test_stats_structure(self):
        """The returned stats contain every expected counter key."""
        self._write_urls('https://example.com')

        stats = download_and_clean(self.urls_file, self.output_dir)

        for key in ('success', 'failed', 'duplicates', 'too_short', 'excluded'):
            self.assertIn(key, stats)
265+
266+
267+
# Run the suite when this file is executed directly rather than imported.
if __name__ == '__main__':
    unittest.main()

0 commit comments

Comments
 (0)