Refactor CLI to use argparse properly and add test step to workflow

Rustem · Rustem · commit c8d439bb335c · 2025-10-29T22:29:02.000-07:00
- Fix __main__.py to import from cli.py instead of core.py
- Remove unnecessary manual argv validation in merge command (argparse handles this)
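- Add pytest test step to GitHub workflow before basic functionality tests
- Install the package with the [dev] extras in the workflow so the test step has its dependencies
- Change download_text to return an empty string instead of None on failure
- Unescape HTML entities in clean_html after stripping tags rather than before
- Limit splitting in text_fingerprint with maxsplit
- Remove the LOC row from the README comparison table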
diff --git a/.github/workflows/test-publish.yml b/.github/workflows/test-publish.yml
@@ -25,8 +25,12 @@ jobs:
     - name: Install package
       run: |
         python -m pip install --upgrade pip
-        pip install -e .
-    
+        pip install -e .[dev]
+
+    - name: Run tests
+      run: |
+        python -m pytest tests/ -v
+
     - name: Test basic functionality
       run: |
         textnano --help || echo "CLI not fully configured yet"
diff --git a/README.md b/README.md
@@ -236,7 +236,6 @@ textnano merge output_*/ final_dataset/
 | Feature | textnano | lazynlp | beautifulsoup + requests |
 |---------|----------|---------|--------------------------|
 | **Files** | 1 | 5 | Your custom code |
-| **LOC** | ~200 | ~800 | 100-200 (yours) |
 | **Dependencies** | 0 | 2 | 2+ |
 | **Learning time** | 5 min | 30 min | 1 hour |
 | **Deduplication** | ✅ | ✅ | ❌ (you implement) |
diff --git a/textnano/__main__.py b/textnano/__main__.py
@@ -2,7 +2,7 @@
 Entry point for running textnano as a module: python -m textnano
 """
 
-from textnano.core import main
+from textnano.cli import main
 
 if __name__ == '__main__':
     main()
diff --git a/textnano/cli.py b/textnano/cli.py
@@ -95,9 +95,6 @@ def main():
         print(f"Avg/file:  {stats['avg_words_per_file']} words")
 
     elif args.command == 'merge':
-        if len(args.dirs) < 2:
-            print("Usage: textnano merge <dir1> <dir2> ... <output_dir>")
-            sys.exit(1)
         output = args.dirs[-1]
         inputs = args.dirs[:-1]
         merge_datasets(*inputs, output_dir=output, is_duplicate_func=is_duplicate)
diff --git a/textnano/core.py b/textnano/core.py
@@ -38,39 +38,34 @@
 # DOWNLOAD
 # =============================================================================
 
-def download_text(url: str, timeout: int = 30) -> Optional[str]:
+def download_text(url: str, timeout: int = 30) -> str:
     """Download and extract text from a URL.
 
     Returns:
-        str or None: Cleaned text content, or None if failed
+        str: Cleaned text content, or empty string if failed
     """
     try:
-        # Download
         headers = {'User-Agent': 'Mozilla/5.0'}
         req = urllib.request.Request(url, headers=headers)
 
-        # Create SSL context that doesn't verify certificates
         context = ssl.create_default_context()
         context.check_hostname = False
         context.verify_mode = ssl.CERT_NONE
 
         with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
             content = response.read().decode('utf-8', errors='ignore')
 
-        # Basic HTML cleaning
-        text = clean_html(content)
-
-        return text if text.strip() else None
+        return clean_html(content)
 
     except urllib.error.HTTPError as e:
         logging.error(f"HTTP error {e.code} for {url}: {e.reason}")
-        return None
+        return ""
     except urllib.error.URLError as e:
         logging.error(f"URL error for {url}: {e.reason}")
-        return None
+        return ""
     except Exception as e:
         logging.error(f"Unexpected error for {url}: {type(e).__name__}: {e}")
-        return None
+        return ""
 
 
 # =============================================================================
@@ -86,21 +81,12 @@ def clean_html(html_content: str) -> str:
     Returns:
         str: Clean text
     """
-    # Unescape HTML entities
-    text = html.unescape(html_content)
-
-    # Remove script and style tags
-    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
+    text = re.sub(r'<script[^>]*>.*?</script>', '', html_content, flags=re.DOTALL | re.IGNORECASE)
     text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.DOTALL | re.IGNORECASE)
-
-    # Remove HTML tags
     text = re.sub(r'<[^>]+>', '', text)
-
-    # Normalize whitespace
+    text = html.unescape(text)
     text = re.sub(r'\s+', ' ', text)
     text = re.sub(r'\n\s*\n', '\n\n', text)
-
-    # Remove leading/trailing whitespace
     text = text.strip()
 
     return text
@@ -120,7 +106,7 @@ def text_fingerprint(text: str, n: int = 8) -> str:
     Returns:
         str: MD5 hash of first N words
     """
-    words = text.lower().split()[:n]
+    words = text.lower().split(maxsplit=n)[:n]
     fingerprint_text = ' '.join(words)
     return hashlib.md5(fingerprint_text.encode()).hexdigest()