From 99b27646956571354652b0d86c3b8135c087425a Mon Sep 17 00:00:00 2001 From: fiandev Date: Thu, 16 Apr 2026 10:07:47 +0700 Subject: [PATCH] feat: improve wget integration and parsing logic - Update wget arguments for better mirroring and file filtering. - Force C locale for reliable subprocess output parsing. - Organize downloads into domain-based subdirectories. - Enhance UI feedback, terminal spinner, and result summary. - Bump version to 1.0.0.patch-006. Co-authored-by: gemini-cli gemini-3-flash-preview <218195315+gemini-cli@users.noreply.github.com> --- pyhttrack.py | 261 +++++++++++++++++++-------------------------------- 1 file changed, 94 insertions(+), 167 deletions(-) diff --git a/pyhttrack.py b/pyhttrack.py index 40b476b..c79306c 100644 --- a/pyhttrack.py +++ b/pyhttrack.py @@ -9,35 +9,23 @@ import time from datetime import datetime from colorama import Fore, Style, init as colorama_init +from urllib.parse import urlparse colorama_init() +VERSION = "1.0.0.patch-006" def get_base_dir(): if getattr(sys, "frozen", False): return os.path.dirname(sys.executable) return os.path.dirname(os.path.abspath(__file__)) - BASE_DIR = get_base_dir() parser = argparse.ArgumentParser(description="PyHttrack - Website Downloader") parser.add_argument("--url", help="Single URL to download (overrides web.json)") args = parser.parse_args() - -def format_size(bytes_num): - try: - bytes_num = int(bytes_num) - for unit in ["B", "KB", "MB", "GB", "TB"]: - if bytes_num < 1024: - return f"{bytes_num:.2f} {unit}" - bytes_num /= 1024 - return f"{bytes_num:.2f} PB" - except: - return "-" - - arch = platform.machine().lower() system = platform.system().lower() @@ -56,7 +44,6 @@ def format_size(bytes_num): os.path.join(BASE_DIR, "wget", folder_arch, wget_filename) if folder_arch else None ) - def install_wget(): if system == "linux": print("Wget not found. Trying to install wget...") @@ -71,7 +58,6 @@ def install_wget(): print("Automatic installation only supported on Linux.") return None - def print_banner(): print(f""" {Fore.RED} @@ -81,10 +67,10 @@ def print_banner(): === === === === === === === === === === === === === === === === === === === === === === === ======= === === +v{VERSION} {Style.RESET_ALL} """) - if wget_path and os.path.isfile(wget_path): if system != "windows": os.chmod(wget_path, 0o755) @@ -102,13 +88,6 @@ def print_banner(): results = [] urls = [] -web_dir = ( - os.path.join(os.path.expanduser("~"), "pyhttrack") - if system == "linux" - else os.getcwd() -) -os.makedirs(web_dir, exist_ok=True) - try: web_json_path = ( os.path.join(os.path.expanduser("~"), "pyhttrack", "web.json") @@ -118,7 +97,7 @@ def print_banner(): with open(web_json_path, "r") as file: urls = json.load(file) except FileNotFoundError: - print("File 'web.json' not found.") + pass if args.url: urls = [args.url] @@ -128,44 +107,46 @@ def print_banner(): spinner_active = False spinner_stop_event = threading.Event() - def spin(): chars = "|/-\\" idx = 0 while not spinner_stop_event.is_set(): - sys.stdout.write( - f"\r{Fore.CYAN}Downloading... {chars[idx % len(chars)]}{Style.RESET_ALL}" - ) + sys.stdout.write(f"\r{Fore.CYAN}Downloading... {chars[idx % len(chars)]}{Style.RESET_ALL}") sys.stdout.flush() idx += 1 time.sleep(0.1) + # Hapus teks spinner pas selesai biar output gak numpuk + sys.stdout.write("\r" + " " * 30 + "\r") sys.stdout.flush() - def start_spinner(): global spinner_active spinner_active = True spinner_stop_event.clear() threading.Thread(target=spin, daemon=True).start() - def stop_spinner(): global spinner_active spinner_active = False spinner_stop_event.set() - if not urls: - print("No URLs found in 'web.json'.") urls.append(input("Enter URL: ")) print(f"\nTotal URL : {len(urls)}") print("==================\n") +env = os.environ.copy() +env["LC_ALL"] = "C" +env["LANG"] = "C" + for url in urls: - print(f"Downloading: {url}\n") + print(f"Target: {url}") url_has_result = False start_spinner() + domain = urlparse(url).netloc + web_dir = os.path.join(os.getcwd(), domain) + os.makedirs(web_dir, exist_ok=True) try: process = subprocess.Popen( @@ -173,19 +154,20 @@ def stop_spinner(): wget_exec, "-r", "-m", - "--no-parent", - "--convert-links", - "--adjust-extension", - "--page-requisites", - "--limit-rate=100k", - "--random-wait", - "--wait=1", - "--timeout=15", + "-p", + "-k", + "-E", + "-nH", + "--restrict-file-names=windows", + "--accept", + "css,html,html,js,jpg,jpeg,png,gif,svg,ico,webp,bmp,tiff", "--tries=3", "--no-check-certificate", "--retry-connrefused", "-e", "robots=off", + "--execute", + "robots=off", "--user-agent=Mozilla/5.0", "--directory-prefix", web_dir, @@ -194,173 +176,118 @@ def stop_spinner(): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, + env=env ) for line in process.stdout: now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") line = line.strip() - if "Saving" in line and "/" in line: + if "Saving" in line or "Menyimpan" in line: try: - path = ( - line.split("'")[1] - if "'" in line - else line.split("Saving ")[-1].strip() - ) - size = "-" - results.append( - { - "timestamp": now, - "url": url, - "file": path, - "status": "success", - "size": size, - } - ) - print(f"{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}") + if "'" in line: + path = line.split("'")[1] + elif "‘" in line and "’" in line: + path = line.split("‘")[1].split("’")[0] + else: + path = line.split(" ")[-1].strip().strip("'").strip('"') + + if os.path.isabs(path): + path = os.path.relpath(path, web_dir) + + results.append({"timestamp": now, "url": url, "file": path, "status": "success", "size": "-"}) + print(f"\r{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}" + " "*10) url_has_result = True except: continue elif "HTTP response 304" in line: - try: - results.append( - { - "timestamp": now, - "url": url, - "file": "-", - "status": "success", - "size": "-", - } - ) - print( - f"{Fore.YELLOW}[{now}] Already up to date:{Style.RESET_ALL} {url}" - ) - url_has_result = True - except: - continue + results.append({"timestamp": now, "url": url, "file": "-", "status": "success", "size": "-"}) + print(f"\r{Fore.YELLOW}[{now}] Already up to date:{Style.RESET_ALL} {url}" + " "*10) + url_has_result = True - elif "convert" in line.lower() and "http" not in line.lower(): - try: - results.append( - { - "timestamp": now, - "url": url, - "file": "-", - "status": "success", - "size": "-", - } - ) - print( - f"{Fore.GREEN}[{now}] Download Complete:{Style.RESET_ALL} {url}" - ) - url_has_result = True - except: - continue + elif "Downloaded:" in line and "files" in line: + url_has_result = True - elif "not modified" in line and "'" in line: - try: - path = ( - line.split("'")[1] - if "'" in line - else line.split("Saving ")[-1].strip() - ) - size = "-" - results.append( - { - "timestamp": now, - "url": url, - "file": path, - "status": "success", - "size": size, - } - ) - print(f"{Fore.GREEN}[{now}] Downloaded:{Style.RESET_ALL} {path}") - url_has_result = True - except: - continue + elif line.startswith("convert ") or ("convert" in line.lower() and "http" in line.lower()): + url_has_result = True - elif "not modified" in line and "'" in line: + elif "not modified" in line and ("'" in line or "‘" in line): try: - path = line.split("'")[1] - results.append( - { - "timestamp": now, - "url": url, - "file": path, - "status": "not modified", - "size": "-", - } - ) - print(f"{Fore.YELLOW}[{now}] Skipped:{Style.RESET_ALL} {path}") + if "'" in line: + path = line.split("'")[1] + elif "‘" in line and "’" in line: + path = line.split("‘")[1].split("’")[0] + else: + path = line.split(" ")[-1].strip().strip("'").strip('"') + + if os.path.isabs(path): + path = os.path.relpath(path, web_dir) + + results.append({"timestamp": now, "url": url, "file": path, "status": "not modified", "size": "-"}) + print(f"\r{Fore.YELLOW}[{now}] Skipped:{Style.RESET_ALL} {path}" + " "*10) url_has_result = True except: continue process.wait() + # Kalau bener-bener gak ada log sukses/gagal dari Wget if not url_has_result: now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print(f"{Fore.RED}Failed to Download:{Style.RESET_ALL} {url}") - results.append( - { - "timestamp": now, - "url": url, - "file": "-", - "status": "failed", - "size": "-", - } - ) + print(f"\r{Fore.RED}Failed to Download:{Style.RESET_ALL} {url}" + " "*20) + results.append({"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"}) + else: + # Kalau sukses jalan tapi list results untuk URL ini masih kosong (berarti gak ada file baru/udah ke-cache) + if not any(r["url"] == url for r in results): + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + print(f"\r{Fore.YELLOW}[{now}] Checked (Already up to date / No new files):{Style.RESET_ALL} {url}" + " "*10) + results.append({"timestamp": now, "url": url, "file": "No new files", "status": "success", "size": "-"}) except Exception as e: now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print(f"{Fore.RED}Error while downloading:{Style.RESET_ALL} {url} | {str(e)}") - results.append( - {"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"} - ) + print(f"\r{Fore.RED}Error while downloading:{Style.RESET_ALL} {url} | {str(e)}" + " "*20) + results.append({"timestamp": now, "url": url, "file": "-", "status": "failed", "size": "-"}) finally: stop_spinner() if results: - base_dir_for_log = ( - os.path.join(os.path.expanduser("~"), "pyhttrack") - if system == "linux" - else BASE_DIR - ) + base_dir_for_log = os.getcwd() log_path = os.path.join(base_dir_for_log, "log.txt") with open(log_path, "a", encoding="utf-8") as log_file: for result in results: - log_file.write( - f"[{result['timestamp']}] {result['url']} | {result['file']} | {result['status']} | {result['size']}\n" - ) + log_file.write(f"[{result['timestamp']}] {result['url']} | {result['file']} | {result['status']} | {result['size']}\n") else: - print("No URLs to process.") + print("\nNo URLs to process.") sys.exit(0) success = sum(r["status"] == "success" for r in results) skipped = sum(r["status"] == "not modified" for r in results) -failed = sum( - r["status"] in ["failed", "404 not found", "403 forbidden"] for r in results -) +failed = sum(r["status"] in ["failed", "404 not found", "403 forbidden"] for r in results) print(f"\n{Fore.GREEN}Success : {success}{Style.RESET_ALL}") print(f"{Fore.YELLOW}Skipped : {skipped}{Style.RESET_ALL}") print(f"{Fore.RED}Failed : {failed}{Style.RESET_ALL}") -output_base = ( - os.path.join(os.path.expanduser("~"), "pyhttrack") - if system == "linux" - else os.getcwd() -) +output_base = os.getcwd() + +print(f"\n{Fore.CYAN}Output :{Style.RESET_ALL}") +saved_files = [] +for r in results: + if r["status"] in ["success", "not modified"] and r["file"] not in ["-", "No new files"]: + if r["file"].startswith(web_dir): + rel_path = os.path.relpath(r["file"], web_dir) + saved_files.append(rel_path) + else: + saved_files.append(r["file"]) + +if saved_files: + print(f" ./") + for file in saved_files[:8]: + print(f" {file}") + if len(saved_files) > 8: + print(f" ... and {len(saved_files) - 8} more files") +else: + print(" (Semua resource sudah ter-download/up-to-date)") -if results: - print(f"\n{Fore.CYAN}Output :{Style.RESET_ALL}") - for r in results: - if r["status"] == "success": - if r["file"] != "-": - folder = r["file"].split("/")[0] if "/" in r["file"] else r["file"] - else: - from urllib.parse import urlparse - - folder = urlparse(r["url"]).netloc - print(f" {output_base}/{folder}") +print(f"\n All files saved in: {output_base}/") \ No newline at end of file