Skip to content

Commit c8d439b

Browse files
author
Rustem
committed
Refactor CLI to use argparse properly and add test step to workflow
- Fix __main__.py to import from cli.py instead of core.py
- Remove unnecessary manual argv validation in merge command (argparse handles this)
- Add pytest test step to GitHub workflow before basic functionality tests
1 parent 58f0849 commit c8d439b

5 files changed

Lines changed: 16 additions & 30 deletions

File tree

.github/workflows/test-publish.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@ jobs:
2525
- name: Install package
2626
run: |
2727
python -m pip install --upgrade pip
28-
pip install -e .
29-
28+
pip install -e .[dev]
29+
30+
- name: Run tests
31+
run: |
32+
python -m pytest tests/ -v
33+
3034
- name: Test basic functionality
3135
run: |
3236
textnano --help || echo "CLI not fully configured yet"

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ textnano merge output_*/ final_dataset/
236236
| Feature | textnano | lazynlp | beautifulsoup + requests |
237237
|---------|----------|---------|--------------------------|
238238
| **Files** | 1 | 5 | Your custom code |
239-
| **LOC** | ~200 | ~800 | 100-200 (yours) |
240239
| **Dependencies** | 0 | 2 | 2+ |
241240
| **Learning time** | 5 min | 30 min | 1 hour |
242241
| **Deduplication** | ✅ | ✅ | ❌ (you implement) |

textnano/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Entry point for running textnano as a module: python -m textnano
33
"""
44

5-
from textnano.core import main
5+
from textnano.cli import main
66

77
if __name__ == '__main__':
88
main()

textnano/cli.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,6 @@ def main():
9595
print(f"Avg/file: {stats['avg_words_per_file']} words")
9696

9797
elif args.command == 'merge':
98-
if len(args.dirs) < 2:
99-
print("Usage: textnano merge <dir1> <dir2> ... <output_dir>")
100-
sys.exit(1)
10198
output = args.dirs[-1]
10299
inputs = args.dirs[:-1]
103100
merge_datasets(*inputs, output_dir=output, is_duplicate_func=is_duplicate)

textnano/core.py

Lines changed: 9 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -38,39 +38,34 @@
3838
# DOWNLOAD
3939
# =============================================================================
4040

41-
def download_text(url: str, timeout: int = 30) -> Optional[str]:
41+
def download_text(url: str, timeout: int = 30) -> str:
4242
"""Download and extract text from a URL.
4343
4444
Returns:
45-
str or None: Cleaned text content, or None if failed
45+
str: Cleaned text content, or empty string if failed
4646
"""
4747
try:
48-
# Download
4948
headers = {'User-Agent': 'Mozilla/5.0'}
5049
req = urllib.request.Request(url, headers=headers)
5150

52-
# Create SSL context that doesn't verify certificates
5351
context = ssl.create_default_context()
5452
context.check_hostname = False
5553
context.verify_mode = ssl.CERT_NONE
5654

5755
with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
5856
content = response.read().decode('utf-8', errors='ignore')
5957

60-
# Basic HTML cleaning
61-
text = clean_html(content)
62-
63-
return text if text.strip() else None
58+
return clean_html(content)
6459

6560
except urllib.error.HTTPError as e:
6661
logging.error(f"HTTP error {e.code} for {url}: {e.reason}")
67-
return None
62+
return ""
6863
except urllib.error.URLError as e:
6964
logging.error(f"URL error for {url}: {e.reason}")
70-
return None
65+
return ""
7166
except Exception as e:
7267
logging.error(f"Unexpected error for {url}: {type(e).__name__}: {e}")
73-
return None
68+
return ""
7469

7570

7671
# =============================================================================
@@ -86,21 +81,12 @@ def clean_html(html_content: str) -> str:
8681
Returns:
8782
str: Clean text
8883
"""
89-
# Unescape HTML entities
90-
text = html.unescape(html_content)
91-
92-
# Remove script and style tags
93-
text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
84+
text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
9485
text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
95-
96-
# Remove HTML tags
9786
text = re.sub(r'<[^>]+>', '', text)
98-
99-
# Normalize whitespace
87+
text = html.unescape(text)
10088
text = re.sub(r'\s+', ' ', text)
10189
text = re.sub(r'\n\s*\n', '\n\n', text)
102-
103-
# Remove leading/trailing whitespace
10490
text = text.strip()
10591

10692
return text
@@ -120,7 +106,7 @@ def text_fingerprint(text: str, n: int = 8) -> str:
120106
Returns:
121107
str: MD5 hash of first N words
122108
"""
123-
words = text.lower().split()[:n]
109+
words = text.lower().split(maxsplit=n)[:n]
124110
fingerprint_text = ' '.join(words)
125111
return hashlib.md5(fingerprint_text.encode()).hexdigest()
126112

0 commit comments

Comments
 (0)