Skip to content

Commit bef9ee2

Browse files
committed
relink.py: Can now accept specific file(s) to process.
Resolves NCAR/inputdataTools#5.
1 parent 8a0b914 commit bef9ee2

8 files changed

Lines changed: 605 additions & 123 deletions

relink.py

Lines changed: 97 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def always(self, message, *args, **kwargs):
3535
logging.Logger.always = always
3636

3737

38-
def _handle_non_dir_entry(entry, user_uid):
38+
def _handle_non_dir_entry(entry: os.DirEntry, user_uid: int):
3939
"""
4040
Check if a non-directory entry is owned by the user and should be processed.
4141
@@ -61,6 +61,38 @@ def _handle_non_dir_entry(entry, user_uid):
6161
return None
6262

6363

64+
def _handle_non_dir_str(path: str, user_uid: int):
65+
"""
66+
Check if a non-directory string is owned by the user and should be processed. This should only
67+
ever be needed if the user specified a file to process on the command line. Because we don't
68+
expect users to process large numbers of files at once in this way, it's okay if this function
69+
isn't performance-optimized.
70+
71+
Args:
72+
path (str): A filesystem path.
73+
user_uid (int): The UID of the user whose files to find.
74+
75+
Returns:
76+
str or None: The absolute path to the file if it's owned by the user
77+
and is a regular file (not a symlink), otherwise None.
78+
"""
79+
# Is this even owned by the user?
80+
if os.stat(path, follow_symlinks=False).st_uid == user_uid:
81+
82+
is_file = os.path.isfile(path)
83+
is_symlink = os.path.islink(path)
84+
85+
# Log about skipping symlinks
86+
if is_symlink:
87+
logger.debug("Skipping symlink: %s", path)
88+
89+
# Return if it's a file (and not a symlink)
90+
elif is_file:
91+
return path
92+
93+
return None
94+
95+
6496
def handle_non_dir(var, user_uid, inputdata_root):
6597
"""
6698
Check if a non-directory is owned by the user and should be processed. Passes var to a
@@ -79,14 +111,22 @@ def handle_non_dir(var, user_uid, inputdata_root):
79111
TypeError: If var is not a DirEntry-like object.
80112
ValueError: If the file path is not under inputdata_root.
81113
"""
114+
logger.debug("starting handle_non_dir()")
115+
116+
# Handle a variable of type str.
117+
if isinstance(var, str):
118+
logger.debug("isinstance(var, str)")
119+
file_path = _handle_non_dir_str(var, user_uid)
82120

121+
# Handle a variable of type like os.DirEntry.
83122
# Fall back to duck typing: If var has the required DirEntry methods and members, treat it as a
84123
# DirEntry. This is necessary for this conditional to work with the MockDirEntry type used in
85124
# testing. ("If it looks, walks, and quacks like a duck...")
86-
if isinstance(var, os.DirEntry) or all(
125+
elif isinstance(var, os.DirEntry) or all(
87126
hasattr(var, m) for m in ["stat", "is_file", "is_symlink", "path"]
88127
):
89128
file_path = _handle_non_dir_entry(var, user_uid)
129+
90130
else:
91131
raise TypeError(
92132
f"Unsure how to handle non-directory variable of type {type(var)}"
@@ -101,15 +141,15 @@ def handle_non_dir(var, user_uid, inputdata_root):
101141
return file_path
102142

103143

104-
def find_owned_files_scandir(directory, user_uid, inputdata_root=DEFAULT_SOURCE_ROOT):
144+
def find_owned_files_scandir(item, user_uid, inputdata_root=DEFAULT_SOURCE_ROOT):
105145
"""
106146
Efficiently find all files owned by a specific user using os.scandir().
107147
108148
This is more efficient than os.walk() because os.scandir() caches stat
109149
information during directory traversal, reducing system calls.
110150
111151
Args:
112-
directory (str): The root directory to search.
152+
item (str): The root directory to search, or the file to check.
113153
user_uid (int): The UID of the user whose files to find.
114154
inputdata_root (str): The root of the directory tree containing CESM input data.
115155
@@ -120,7 +160,7 @@ def find_owned_files_scandir(directory, user_uid, inputdata_root=DEFAULT_SOURCE_
120160
ValueError: If any file found is not under inputdata_root.
121161
"""
122162
try:
123-
with os.scandir(directory) as entries:
163+
with os.scandir(item) as entries:
124164
for entry in entries:
125165
try:
126166
# Recursively process directories (not following symlinks)
@@ -139,26 +179,30 @@ def find_owned_files_scandir(directory, user_uid, inputdata_root=DEFAULT_SOURCE_
139179
logger.debug("Error accessing %s: %s. Skipping.", entry.path, e)
140180
continue
141181

182+
except NotADirectoryError:
183+
if (file_path := handle_non_dir(item, user_uid, inputdata_root)) is not None:
184+
yield file_path
185+
142186
except (OSError, PermissionError) as e:
143-
logger.debug("Error accessing %s: %s. Skipping.", directory, e)
187+
logger.warning("Error accessing %s: %s. Skipping.", item, e)
144188

145189

146190
def replace_files_with_symlinks(
147-
source_dir, target_dir, username, inputdata_root=DEFAULT_SOURCE_ROOT, dry_run=False
191+
item_to_process, target_dir, username, inputdata_root=DEFAULT_SOURCE_ROOT, dry_run=False
148192
):
149193
"""
150194
Finds files owned by a specific user in a source directory tree,
151195
deletes them, and replaces them with symbolic links to the same
152196
relative path in a target directory tree.
153197
154198
Args:
155-
source_dir (str): The root of the directory tree to search for files.
199+
item_to_process (str): The root directory to search, or the file to process.
156200
target_dir (str): The root of the directory tree containing the new files.
157201
inputdata_root (str): The root of the directory tree containing CESM input data.
158202
username (str): The name of the user whose files will be processed.
159203
dry_run (bool): If True, only show what would be done without making changes.
160204
"""
161-
source_dir = os.path.abspath(source_dir)
205+
item_to_process = os.path.abspath(item_to_process)
162206
target_dir = os.path.abspath(target_dir)
163207

164208
# Get the user ID (UID) for the specified username
@@ -175,38 +219,38 @@ def replace_files_with_symlinks(
175219
"Searching for files owned by '%s' (UID: %s) in '%s'...",
176220
username,
177221
user_uid,
178-
source_dir,
222+
item_to_process,
179223
)
180224

181225
# Use efficient scandir-based search
182-
for file_path in find_owned_files_scandir(source_dir, user_uid, inputdata_root):
183-
replace_one_file_with_symlink(
184-
source_dir, target_dir, file_path, dry_run=dry_run
185-
)
226+
for file_path in find_owned_files_scandir(item_to_process, user_uid, inputdata_root):
227+
replace_one_file_with_symlink(inputdata_root, target_dir, file_path, dry_run=dry_run)
186228

187229

188-
def replace_one_file_with_symlink(source_dir, target_dir, file_path, dry_run=False):
230+
def replace_one_file_with_symlink(
231+
inputdata_root, target_dir, file_path, dry_run=False
232+
):
189233
"""
190234
Given a file, replaces it with a symbolic link to the same relative path in a target directory
191235
tree.
192236
193237
Args:
194-
source_dir (str): The root of the directory tree to search for files.
238+
inputdata_root (str): The root of the directory tree containing CESM input data.
195239
target_dir (str): The root of the directory tree containing the new files.
196240
file_path (str): The path of the file to be replaced.
197241
dry_run (bool): If True, only show what would be done without making changes.
198242
"""
199243
logger.info("Found owned file: %s", file_path)
200244

201245
# Determine the relative path and the new link's destination
202-
relative_path = os.path.relpath(file_path, source_dir)
246+
relative_path = os.path.relpath(file_path, inputdata_root)
203247
link_target = os.path.join(target_dir, relative_path)
204248

205249
# Check if the target file actually exists
206250
if not os.path.exists(link_target):
207251
logger.warning(
208-
"Warning: Corresponding file not found in '%s' for '%s'. Skipping.",
209-
target_dir,
252+
"Warning: Corresponding file '%s' not found for '%s'. Skipping.",
253+
link_target,
210254
file_path,
211255
)
212256
return
@@ -242,9 +286,9 @@ def replace_one_file_with_symlink(source_dir, target_dir, file_path, dry_run=Fal
242286
logger.error("Error creating symlink for %s: %s. Skipping.", link_name, e)
243287

244288

245-
def validate_directory(path):
289+
def validate_paths(path, check_is_dir=False):
246290
"""
247-
Validate that one or more paths exist and are directories.
291+
Validate that one or more paths exist.
248292
249293
Args:
250294
path (str or list): The path to validate, or a list of such paths.
@@ -253,27 +297,43 @@ def validate_directory(path):
253297
str or list: The absolute path(s) if valid.
254298
255299
Raises:
256-
argparse.ArgumentTypeError: If a path doesn't exist or is not a directory.
300+
argparse.ArgumentTypeError: If a path doesn't exist.
257301
"""
258302
if isinstance(path, list):
259303
result = []
260304
for item in path:
261-
result.append(validate_directory(item))
305+
result.append(validate_paths(item, check_is_dir=check_is_dir))
262306
return result
263307

264308
if not os.path.exists(path):
265-
raise argparse.ArgumentTypeError(f"Directory '{path}' does not exist")
266-
if not os.path.isdir(path):
309+
raise argparse.ArgumentTypeError(f"'{path}' does not exist")
310+
if check_is_dir and not os.path.isdir(path):
267311
raise argparse.ArgumentTypeError(f"'{path}' is not a directory")
268312
return os.path.abspath(path)
269313

270314

315+
def validate_directory(path):
    """
    Validate that one or more paths exist and are directories.

    Thin wrapper around validate_paths() with the directory check switched on.

    Args:
        path (str or list): The directory to validate, or a list of such directories.

    Returns:
        str or list: The absolute path(s) if valid.

    Raises:
        argparse.ArgumentTypeError: If a path doesn't exist or is not a directory.
    """
    validated = validate_paths(path, check_is_dir=True)
    return validated
329+
330+
271331
def parse_arguments():
272332
"""
273333
Parse command-line arguments.
274334
275335
Returns:
276-
argparse.Namespace: Parsed arguments containing source_root,
336+
argparse.Namespace: Parsed arguments containing items_to_process,
277337
target_root, and verbosity settings.
278338
"""
279339
parser = argparse.ArgumentParser(
@@ -282,11 +342,12 @@ def parse_arguments():
282342
)
283343
)
284344
parser.add_argument(
285-
"source_root",
345+
"items_to_process",
286346
nargs="*",
287347
default=DEFAULT_SOURCE_ROOT,
348+
type=validate_paths,
288349
help=(
289-
f"One or more directories to search for files (default: {DEFAULT_SOURCE_ROOT})"
350+
f"One or more (directories to search for) files (default: {DEFAULT_SOURCE_ROOT})"
290351
),
291352
)
292353
parser.add_argument(
@@ -356,13 +417,13 @@ def process_args(args):
356417
else:
357418
args.log_level = logging.INFO
358419

359-
# Ensure that source_root is a list
360-
if hasattr(args, "source_root") and not isinstance(args.source_root, list):
361-
args.source_root = [args.source_root]
420+
# Ensure that items_to_process is a list
421+
if hasattr(args, "items_to_process") and not isinstance(args.items_to_process, list):
422+
args.items_to_process = [args.items_to_process]
362423

363-
# Check that every item in source_root is a child of inputdata_root
364-
if hasattr(args, "source_root"): # Sometimes doesn't if we're testing
365-
for item in args.source_root:
424+
# Check that every item in items_to_process is a child of inputdata_root
425+
if hasattr(args, "items_to_process"): # Sometimes doesn't if we're testing
426+
for item in args.items_to_process:
366427
if not Path(item).is_relative_to(args.inputdata_root):
367428
raise argparse.ArgumentTypeError(
368429
f"Item '{item}' not under inputdata root '{args.inputdata_root}'"
@@ -378,6 +439,7 @@ def process_args(args):
378439

379440

380441
def main():
442+
# pylint: disable=missing-function-docstring
381443

382444
args = parse_arguments()
383445

@@ -388,7 +450,7 @@ def main():
388450
start_time = time.time()
389451

390452
# --- Execution ---
391-
for item in args.source_root:
453+
for item in args.items_to_process:
392454
replace_files_with_symlinks(
393455
item,
394456
args.target_root,

0 commit comments

Comments
 (0)