From 99b27646956571354652b0d86c3b8135c087425a Mon Sep 17 00:00:00 2001
From: fiandev <alfiansa@proton.me>
Date: Thu, 16 Apr 2026 10:07:47 +0700
Subject: [PATCH] feat: improve wget integration and parsing logic

- Update wget arguments for better mirroring and file filtering.
- Force C locale for reliable subprocess output parsing.
- Organize downloads into domain-based subdirectories.
- Enhance UI feedback, terminal spinner, and result summary.
- Bump version to 1.0.0.patch-006.

Co-authored-by: gemini-cli gemini-3-flash-preview <218195315+gemini-cli@users.noreply.github.com>
---
 pyhttrack.py | 261 +++++++++++++++++++--------------------------------
 1 file changed, 94 insertions(+), 167 deletions(-)

diff --git a/pyhttrack.py b/pyhttrack.py
index 40b476b..c79306c 100644
--- a/pyhttrack.py
+++ b/pyhttrack.py
@@ -9,35 +9,23 @@
 import time
 from datetime import datetime
 from colorama import Fore, Style, init as colorama_init
+from urllib.parse import urlparse
 
 colorama_init()
 
+VERSION = "1.0.0.patch-006"
 
 def get_base_dir():
     if getattr(sys, "frozen", False):
         return os.path.dirname(sys.executable)
     return os.path.dirname(os.path.abspath(__file__))
 
-
 BASE_DIR = get_base_dir()
 
 parser = argparse.ArgumentParser(description="PyHttrack - Website Downloader")
 parser.add_argument("--url", help="Single URL to download (overrides web.json)")
 args = parser.parse_args()
 
-
-def format_size(bytes_num):
-    try:
-        bytes_num = int(bytes_num)
-        for unit in ["B", "KB", "MB", "GB", "TB"]:
-            if bytes_num < 1024:
-                return f"{bytes_num:.2f} {unit}"
-            bytes_num /= 1024
-        return f"{bytes_num:.2f} PB"
-    except:
-        return "-"
-
-
 arch = platform.machine().lower()
 system = platform.system().lower()
 
@@ -56,7 +44,6 @@ def format_size(bytes_num):
     os.path.join(BASE_DIR, "wget", folder_arch, wget_filename) if folder_arch else None
 )
 
-
 def install_wget():
     if system == "linux":
         print("Wget not found. Trying to install wget...")
@@ -71,7 +58,6 @@ def install_wget():
         print("Automatic installation only supported on Linux.")
     return None
 
-
 def print_banner():
     print(f"""
 {Fore.RED}
@@ -81,10 +67,10 @@ def print_banner():
 ===        ===   ===  ===   ===     ===   === ===  ===  === ===      === === 
 ===        ===   ===  ===   ===     ===   ===  === ===  ===  ======= ===  ===
                                                                             
+v{VERSION}
 {Style.RESET_ALL}
 """)
 
-
 if wget_path and os.path.isfile(wget_path):
     if system != "windows":
         os.chmod(wget_path, 0o755)
@@ -102,13 +88,6 @@ def print_banner():
 results = []
 urls = []
 
-web_dir = (
-    os.path.join(os.path.expanduser("~"), "pyhttrack")
-    if system == "linux"
-    else os.getcwd()
-)
-os.makedirs(web_dir, exist_ok=True)
-
 try:
     web_json_path = (
         os.path.join(os.path.expanduser("~"), "pyhttrack", "web.json")
@@ -118,7 +97,7 @@ def print_banner():
     with open(web_json_path, "r") as file:
         urls = json.load(file)
 except FileNotFoundError:
-    print("File 'web.json' not found.")
+    pass
 
 if args.url:
     urls = [args.url]
@@ -128,44 +107,46 @@ def print_banner():
 spinner_active = False
 spinner_stop_event = threading.Event()
 
-
 def spin():
     chars = "|/-\\"
     idx = 0
     while not spinner_stop_event.is_set():
-        sys.stdout.write(
-            f"\r{Fore.CYAN}Downloading... {chars[idx % len(chars)]}{Style.RESET_ALL}"
-        )
+        sys.stdout.write(f"\r{Fore.CYAN}Downloading... {chars[idx % len(chars)]}{Style.RESET_ALL}")
         sys.stdout.flush()
         idx += 1
         time.sleep(0.1)
+    # Erase the spinner text when finished so output lines do not pile up
+    sys.stdout.write("\r" + " " * 30 + "\r")
     sys.stdout.flush()
 
-
 def start_spinner():
     global spinner_active
     spinner_active = True
     spinner_stop_event.clear()
     threading.Thread(target=spin, daemon=True).start()
 
-
 def stop_spinner():
     global spinner_active
     spinner_active = False
     spinner_stop_event.set()
 
-
 if not urls:
-    print("No URLs found in 'web.json'.")
     urls.append(input("Enter URL: "))
 
 print(f"\nTotal URL : {len(urls)}")
 print("==================\n")
 
+env = os.environ.copy()
+env["LC_ALL"] = "C"
+env["LANG"] = "C"
+
 for url in urls:
-    print(f"Downloading: {url}\n")
+    print(f"Target: {url}")
     url_has_result = False
     start_spinner()
+    domain = urlparse(url).netloc
+    web_dir = os.path.join(os.getcwd(), domain)
+    os.makedirs(web_dir, exist_ok=True)
 
     try:
         process = subprocess.Popen(
@@ -173,19 +154,20 @@ def stop_spinner():
                 wget_exec,
                 "-r",
                 "-m",
-                "--no-parent",
-                "--convert-links",
-                "--adjust-extension",
-                "--page-requisites",
-                "--limit-rate=100k",
-                "--random-wait",
-                "--wait=1",
-                "--timeout=15",
+                "-p",
+                "-k",
+                "-E",
+                "-nH",
+                "--restrict-file-names=windows",
+                "--accept",
+                "css,html,html,js,jpg,jpeg,png,gif,svg,ico,webp,bmp,tiff",
                 "--tries=3",
                 "--no-check-certificate",
                 "--retry-connrefused",
                 "-e",
                 "robots=off",
+                "--execute",
+                "robots=off",
                 "--user-agent=Mozilla/5.0",
                 "--directory-prefix",
                 web_dir,
@@ -194,173 +176,118 @@ def stop_spinner():
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
+            env=env
         )
 
         for line in process.stdout:
             now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
             line = line.strip()
 
-            if "Saving" in line and "/" in line:
+            if "Saving" in line or "Menyimpan" in line:
                 try:
-                    path = (
-                        line.split("'")[1]
-                        if "'" in line
-                        else line.split("Saving ")[-1].strip()
-                    )
-                    size = "-"
-                    results.append(
-                        {
-                            "timestamp": now,
-                            "url": url,
-                            "file": path,
-                            "status": "success",
-                            "size": size,
-                        }
-                    )
-                    print(f"{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}")
+                    if "'" in line:
+                        path = line.split("'")[1]
+                    elif "‘" in line and "’" in line:
+                        path = line.split("‘")[1].split("’")[0]
+                    else:
+                        path = line.split(" ")[-1].strip().strip("'").strip('"')
+
+                    if os.path.isabs(path):
+                        path = os.path.relpath(path, web_dir)
+
+                    results.append({"timestamp": now, "url": url, "file": path, "status": "success", "size": "-"})
+                    print(f"\r{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}" + " "*10)
                     url_has_result = True
                 except:
                     continue
 
             elif "HTTP response 304" in line:
-                try:
-                    results.append(
-                        {
-                            "timestamp": now,
-                            "url": url,
-                            "file": "-",
-                            "status": "success",
-                            "size": "-",
-                        }
-                    )
-                    print(
-                        f"{Fore.YELLOW}[{now}] Already up to date:{Style.RESET_ALL} {url}"
-                    )
-                    url_has_result = True
-                except:
-                    continue
+                results.append({"timestamp": now, "url": url, "file": "-", "status": "success", "size": "-"})
+                print(f"\r{Fore.YELLOW}[{now}] Already up to date:{Style.RESET_ALL} {url}" + " "*10)
+                url_has_result = True
 
-            elif "convert" in line.lower() and "http" not in line.lower():
-                try:
-                    results.append(
-                        {
-                            "timestamp": now,
-                            "url": url,
-                            "file": "-",
-                            "status": "success",
-                            "size": "-",
-                        }
-                    )
-                    print(
-                        f"{Fore.GREEN}[{now}] Download Complete:{Style.RESET_ALL} {url}"
-                    )
-                    url_has_result = True
-                except:
-                    continue
+            elif "Downloaded:" in line and "files" in line:
+                url_has_result = True
 
-            elif "not modified" in line and "'" in line:
-                try:
-                    path = (
-                        line.split("'")[1]
-                        if "'" in line
-                        else line.split("Saving ")[-1].strip()
-                    )
-                    size = "-"
-                    results.append(
-                        {
-                            "timestamp": now,
-                            "url": url,
-                            "file": path,
-                            "status": "success",
-                            "size": size,
-                        }
-                    )
-                    print(f"{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}")
-                    url_has_result = True
-                except:
-                    continue
+            elif line.startswith("convert ") or ("convert" in line.lower() and "http" in line.lower()):
+                url_has_result = True
 
-            elif "not modified" in line and "'" in line:
+            elif "not modified" in line and ("'" in line or "‘" in line):
                 try:
-                    path = line.split("'")[1]
-                    results.append(
-                        {
-                            "timestamp": now,
-                            "url": url,
-                            "file": path,
-                            "status": "not modified",
-                            "size": "-",
-                        }
-                    )
-                    print(f"{Fore.YELLOW}[{now}] Skipped:{Style.RESET_ALL} {path}")
+                    if "'" in line:
+                        path = line.split("'")[1]
+                    elif "‘" in line and "’" in line:
+                        path = line.split("‘")[1].split("’")[0]
+                    else:
+                        path = line.split(" ")[-1].strip().strip("'").strip('"')
+
+                    if os.path.isabs(path):
+                        path = os.path.relpath(path, web_dir)
+
+                    results.append({"timestamp": now, "url": url, "file": path, "status": "not modified", "size": "-"})
+                    print(f"\r{Fore.YELLOW}[{now}] Skipped:{Style.RESET_ALL} {path}" + " "*10)
                     url_has_result = True
                 except:
                     continue
 
         process.wait()
 
+        # If wget produced no success/failure log lines at all
         if not url_has_result:
             now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-            print(f"{Fore.RED}Failed to Download:{Style.RESET_ALL} {url}")
-            results.append(
-                {
-                    "timestamp": now,
-                    "url": url,
-                    "file": "-",
-                    "status": "failed",
-                    "size": "-",
-                }
-            )
+            print(f"\r{Fore.RED}Failed to Download:{Style.RESET_ALL} {url}" + " "*20)
+            results.append({"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"})
+        else:
+            # If wget ran successfully but the results list for this URL is still empty (meaning no new files / already cached)
+            if not any(r["url"] == url for r in results):
+                now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                print(f"\r{Fore.YELLOW}[{now}] Checked (Already up to date / No new files):{Style.RESET_ALL} {url}" + " "*10)
+                results.append({"timestamp": now, "url": url, "file": "No new files", "status": "success", "size": "-"})
 
     except Exception as e:
         now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        print(f"{Fore.RED}Error while downloading:{Style.RESET_ALL} {url} | {str(e)}")
-        results.append(
-            {"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"}
-        )
+        print(f"\r{Fore.RED}Error while downloading:{Style.RESET_ALL} {url} | {str(e)}" + " "*20)
+        results.append({"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"})
     finally:
         stop_spinner()
 
 if results:
-    base_dir_for_log = (
-        os.path.join(os.path.expanduser("~"), "pyhttrack")
-        if system == "linux"
-        else BASE_DIR
-    )
+    base_dir_for_log = os.getcwd()
     log_path = os.path.join(base_dir_for_log, "log.txt")
     with open(log_path, "a", encoding="utf-8") as log_file:
         for result in results:
-            log_file.write(
-                f"[{result['timestamp']}] {result['url']} | {result['file']} | {result['status']} | {result['size']}\n"
-            )
+            log_file.write(f"[{result['timestamp']}] {result['url']} | {result['file']} | {result['status']} | {result['size']}\n")
 else:
-    print("No URLs to process.")
+    print("\nNo URLs to process.")
     sys.exit(0)
 
 success = sum(r["status"] == "success" for r in results)
 skipped = sum(r["status"] == "not modified" for r in results)
-failed = sum(
-    r["status"] in ["failed", "404 not found", "403 forbidden"] for r in results
-)
+failed = sum(r["status"] in ["failed", "404 not found", "403 forbidden"] for r in results)
 
 print(f"\n{Fore.GREEN}Success : {success}{Style.RESET_ALL}")
 print(f"{Fore.YELLOW}Skipped : {skipped}{Style.RESET_ALL}")
 print(f"{Fore.RED}Failed  : {failed}{Style.RESET_ALL}")
 
-output_base = (
-    os.path.join(os.path.expanduser("~"), "pyhttrack")
-    if system == "linux"
-    else os.getcwd()
-)
+output_base = os.getcwd()
+
+print(f"\n{Fore.CYAN}Output  :{Style.RESET_ALL}")
+saved_files = []
+for r in results:
+    if r["status"] in ["success", "not modified"] and r["file"] not in ["-", "No new files"]:
+        if r["file"].startswith(web_dir):
+            rel_path = os.path.relpath(r["file"], web_dir)
+            saved_files.append(rel_path)
+        else:
+            saved_files.append(r["file"])
+
+if saved_files:
+    print(f"  ./")
+    for file in saved_files[:8]:
+        print(f"    {file}")
+    if len(saved_files) > 8:
+        print(f"    ... and {len(saved_files) - 8} more files")
+else:
+    print("  (Semua resource sudah ter-download/up-to-date)")
 
-if results:
-    print(f"\n{Fore.CYAN}Output  :{Style.RESET_ALL}")
-    for r in results:
-        if r["status"] == "success":
-            if r["file"] != "-":
-                folder = r["file"].split("/")[0] if "/" in r["file"] else r["file"]
-            else:
-                from urllib.parse import urlparse
-
-                folder = urlparse(r["url"]).netloc
-            print(f"  {output_base}/{folder}")
+print(f"\n  All files saved in: {output_base}/")
\ No newline at end of file