1111"""
1212
1313import argparse
14- import logging
1514import re
1615import sys
1716from pathlib import Path
1817from prefect import flow , task
1918from cirada_software .delete_duplicate_downloads import dedupe_tiles
2019
2120
22- LOGGER = logging .getLogger (__name__ )
23-
24-
2521def latest_file (directory : Path , pattern : str ) -> Path | None :
2622 """
2723 Return the latest-modified file in 'directory' that matches 'pattern'.
28-
29- Parameters
30- ----------
31- directory : Path
32- Directory to search in.
33- pattern : str
34- Glob pattern, e.g. 'symbolic_links_log_*' or '*log'.
35-
36- Returns
37- -------
38- Path | None
39- Path to the most recently modified matching file, or None if no match.
4024 """
4125 directory = Path (directory )
4226 if not directory .is_dir ():
@@ -57,24 +41,6 @@ def parse_skipped_tiles_with_excess_files(
5741 """
5842 Parse the log file for tiles that were skipped because they had
5943 between min_files and max_files (inclusive) instead of 4.
60-
61- Expected line format:
62- 'Tile 5761 skipped, band: 943MHz found 5 files instead of 4.'
63-
64- Parameters
65- ----------
66- log_file : Path
67- Path to the log file.
68- min_files : int
69- Minimum number of files (inclusive) to consider "excess".
70- max_files : int
71- Maximum number of files (inclusive) to consider.
72-
73- Returns
74- -------
75- dict[int, int]
76- Mapping from tile number to number of files.
77- If the same tile appears multiple times, the last occurrence wins.
7844 """
7945 pattern = re .compile (
8046 r"Tile\s+(\d+)\s+skipped,\s+band:\s+\S+\s+found\s+(\d+)\s+files instead of 4\." ,
@@ -98,6 +64,7 @@ def parse_skipped_tiles_with_excess_files(
9864
9965 return tiles
10066
67+
10168@task (name = "Run deduplication of tiles with excess files" )
10269def run_deduplication (
10370 download_dir : Path ,
@@ -109,57 +76,45 @@ def run_deduplication(
10976 - Find latest log file.
11077 - Parse tiles with excess files.
11178 - Call dedupe_tiles for each tile.
112-
113- Parameters
114- ----------
115- download_dir : Path
116- Root directory where tile downloads are stored.
117- log_dir : Path
118- Directory containing symbolic link logs.
119- log_glob : str
120- Glob pattern to find log files in log_dir.
12179 """
122- LOGGER . info ( "Using download directory: %s" , download_dir )
123- LOGGER . info ( "Using log directory: %s" , log_dir )
80+ print ( f "Using download directory: { download_dir } " )
81+ print ( f "Using log directory: { log_dir } " )
12482
12583 if not download_dir .is_dir ():
126- LOGGER . error (
127- " Download directory does not exist or is not a directory: %s" , download_dir
84+ print (
85+ f"ERROR: Download directory does not exist or is not a directory: { download_dir } "
12886 )
12987 sys .exit (1 )
13088
13189 if not log_dir .is_dir ():
132- LOGGER . error ( " Log directory does not exist or is not a directory: %s" , log_dir )
90+ print ( f"ERROR: Log directory does not exist or is not a directory: { log_dir } " )
13391 sys .exit (1 )
13492
13593 latest_log = latest_file (log_dir , log_glob )
13694 if latest_log is None :
137- LOGGER . error (
138- " No log files found in %s matching pattern '%s'" , log_dir , log_glob
95+ print (
96+ f"ERROR: No log files found in { log_dir } matching pattern '{ log_glob } '"
13997 )
14098 sys .exit (1 )
14199
142- LOGGER . info ( "Latest log file: %s" , latest_log )
100+ print ( f "Latest log file: { latest_log } " )
143101
144102 tiles_with_excess = parse_skipped_tiles_with_excess_files (latest_log )
145103 if not tiles_with_excess :
146- LOGGER . info (
147- "No tiles with more than 4 (and up to 8) files found in %s. Nothing to do." ,
148- latest_log ,
104+ print (
105+ f "No tiles with more than 4 (and up to 8) files found in { latest_log } . "
106+ "Nothing to do."
149107 )
150108 return
151109
152- LOGGER .info (
153- "Found %d tile(s) with excess files: %s" ,
154- len (tiles_with_excess ),
155- ", " .join (str (t ) for t in sorted (tiles_with_excess )),
110+ print (
111+ f"Found { len (tiles_with_excess )} tile(s) with excess files: "
112+ f"{ ', ' .join (str (t ) for t in sorted (tiles_with_excess ))} "
156113 )
157114
158115 for tile , nfiles in sorted (tiles_with_excess .items ()):
159- LOGGER .info (
160- "Running dedupe_tiles for tile %d (found %d files)." ,
161- tile ,
162- nfiles ,
116+ print (
117+ f"Running dedupe_tiles for tile { tile } (found { nfiles } files)."
163118 )
164119 pattern = r"20*-0*/*/*/*_{tile}_*"
165120 dedupe_tiles (
@@ -195,16 +150,12 @@ def build_arg_parser() -> argparse.ArgumentParser:
195150
196151 return parser
197152
153+
198154@flow (name = "fix_duplicate_downloads" , log_prints = True )
199155def main () -> None :
200156 """
201157 Entry point for command line execution.
202158 """
203- logging .basicConfig (
204- level = logging .INFO ,
205- format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s" ,
206- )
207-
208159 parser = build_arg_parser ()
209160 args = parser .parse_args ()
210161
0 commit comments