Skip to content

Commit 578c02b

Browse files
committed
Refactor glob expansion with ignore and globset
Moved input glob expansion logic from main.rs to a new path_glob.rs module, now using the ignore and globset crates for faster, .gitignore-aware, parallel file matching. Updated dependencies in Cargo.toml to include rayon, ignore, and globset.
1 parent 2c5f105 commit 578c02b

3 files changed

Lines changed: 121 additions & 50 deletions

File tree

langcodec-cli/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ serde_json = "1.0"
2222
serde_yaml = "0.9"
2323
unic-langid = "0.9.6"
2424
glob = "0.3"
25+
rayon = "1.10"
26+
ignore = "0.4"
27+
globset = "0.4"
2528

2629
[dev-dependencies]
2730
tempfile = "3.8"

langcodec-cli/src/main.rs

Lines changed: 3 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
mod debug;
22
mod formats;
33
mod merge;
4+
mod path_glob;
45
mod transformers;
56
mod validation;
67
mod view;
@@ -15,7 +16,6 @@ use clap::{CommandFactory, Parser, Subcommand};
1516
use clap_complete::{Shell, generate};
1617

1718
use langcodec::{Codec, convert_auto, formats::FormatType};
18-
use std::collections::HashSet;
1919
use std::fs::File;
2020
use std::io::BufWriter;
2121

@@ -202,7 +202,7 @@ fn main() {
202202
} => {
203203
// Expand any glob patterns in inputs (e.g., *.strings, **/*.xml)
204204
println!("Expanding glob patterns in inputs: {:?}", inputs);
205-
let expanded_inputs = match expand_input_globs(&inputs) {
205+
let expanded_inputs = match path_glob::expand_input_globs(&inputs) {
206206
Ok(list) => list,
207207
Err(e) => {
208208
eprintln!("❌ Failed to expand input patterns: {}", e);
@@ -736,51 +736,4 @@ fn read_resources_from_any_input(
736736
))
737737
}
738738

739-
/// Expand possible glob patterns in a list of input strings into concrete file paths
740-
fn expand_input_globs(inputs: &Vec<String>) -> Result<Vec<String>, String> {
741-
use rayon::prelude::*;
742-
743-
// Expand each pattern concurrently, collecting per-pattern results first
744-
let expanded: Vec<String> = inputs
745-
.par_iter()
746-
.map(|pattern| {
747-
// Try treating the input as a glob pattern. If it fails to parse as a glob,
748-
// just treat it as a literal path.
749-
match glob::glob(pattern) {
750-
Ok(paths) => {
751-
let mut out: Vec<String> = Vec::new();
752-
let mut matched = false;
753-
for entry in paths {
754-
match entry {
755-
Ok(path) => {
756-
if path.is_file() {
757-
out.push(path.to_string_lossy().to_string());
758-
matched = true;
759-
}
760-
}
761-
Err(e) => {
762-
return Err(format!("Glob error for '{}': {}", pattern, e));
763-
}
764-
}
765-
}
766-
if matched { Ok(out) } else { Ok(vec![pattern.clone()]) }
767-
}
768-
Err(_) => Ok(vec![pattern.clone()]),
769-
}
770-
})
771-
.collect::<Result<Vec<Vec<String>>, String>>()?
772-
.into_iter()
773-
.flatten()
774-
.collect();
775-
776-
// Deduplicate while preserving the first-seen order
777-
let mut seen: HashSet<String> = HashSet::new();
778-
let mut results: Vec<String> = Vec::with_capacity(expanded.len());
779-
for s in expanded {
780-
if seen.insert(s.clone()) {
781-
results.push(s);
782-
}
783-
}
784-
785-
Ok(results)
786-
}
739+
// Path glob expansion is implemented in path_glob.rs

langcodec-cli/src/path_glob.rs

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
use std::collections::HashSet;
2+
use std::path::{Path, PathBuf};
3+
4+
use globset::{GlobBuilder, GlobSetBuilder};
5+
use ignore::WalkBuilder;
6+
use rayon::prelude::*;
7+
8+
/// Expand possible glob patterns in a list of input strings into concrete file paths.
9+
/// Uses ignore + globset for fast, parallel, .gitignore-aware traversal.
10+
pub fn expand_input_globs(inputs: &Vec<String>) -> Result<Vec<String>, String> {
11+
fn has_glob_meta(s: &str) -> bool {
12+
s.bytes().any(|b| matches!(b, b'*' | b'?' | b'[' | b'{'))
13+
}
14+
15+
// Extract a static directory prefix before the first glob meta-character
16+
fn static_prefix_dir(pattern: &str) -> PathBuf {
17+
let bytes = pattern.as_bytes();
18+
let mut idx = 0usize;
19+
while idx < bytes.len() {
20+
match bytes[idx] {
21+
b'*' | b'?' | b'[' | b'{' => break,
22+
_ => idx += 1,
23+
}
24+
}
25+
let prefix = &pattern[..idx];
26+
let p = Path::new(prefix);
27+
if p.is_dir() {
28+
p.to_path_buf()
29+
} else {
30+
p.parent()
31+
.map(|pp| pp.to_path_buf())
32+
.unwrap_or_else(|| PathBuf::from("."))
33+
}
34+
}
35+
36+
// Build one GlobSet for all patterns (literal_separator to avoid '/' matching)
37+
let mut builder = GlobSetBuilder::new();
38+
for pat in inputs {
39+
let glob = GlobBuilder::new(pat)
40+
.literal_separator(true)
41+
.build()
42+
.map_err(|e| format!("Invalid glob pattern '{}': {}", pat, e))?;
43+
builder.add(glob);
44+
}
45+
let set = builder
46+
.build()
47+
.map_err(|e| format!("Failed to build glob set: {}", e))?;
48+
49+
// Collect unique roots to minimize directory walks
50+
let mut roots: Vec<PathBuf> = Vec::new();
51+
for pat in inputs {
52+
let root = if has_glob_meta(pat) {
53+
static_prefix_dir(pat)
54+
} else {
55+
Path::new(pat)
56+
.parent()
57+
.map(|p| p.to_path_buf())
58+
.unwrap_or_else(|| PathBuf::from("."))
59+
};
60+
if !roots.iter().any(|r| r == &root) {
61+
roots.push(root);
62+
}
63+
}
64+
65+
// Walk roots in parallel and match files against the GlobSet
66+
let collected: Vec<String> = roots
67+
.par_iter()
68+
.map(|root| {
69+
let mut out: Vec<String> = Vec::new();
70+
let walker = WalkBuilder::new(root)
71+
.git_ignore(true)
72+
.git_global(true)
73+
.git_exclude(true)
74+
.hidden(false)
75+
.ignore(true)
76+
.parents(true)
77+
.build();
78+
79+
for dent in walker {
80+
let dent = match dent {
81+
Ok(d) => d,
82+
Err(_e) => continue,
83+
};
84+
let ftype = match dent.file_type() {
85+
Some(t) => t,
86+
None => continue,
87+
};
88+
if !ftype.is_file() {
89+
continue;
90+
}
91+
let s = dent.path().to_string_lossy();
92+
if set.is_match(s.as_ref()) {
93+
out.push(s.to_string());
94+
}
95+
}
96+
out
97+
})
98+
.flatten()
99+
.collect();
100+
101+
// If nothing matched, preserve original inputs to surface errors later
102+
if collected.is_empty() {
103+
return Ok(inputs.clone());
104+
}
105+
106+
// Deduplicate while preserving order
107+
let mut seen: HashSet<String> = HashSet::new();
108+
let mut results: Vec<String> = Vec::with_capacity(collected.len());
109+
for s in collected {
110+
if seen.insert(s.clone()) {
111+
results.push(s);
112+
}
113+
}
114+
Ok(results)
115+
}

0 commit comments

Comments
 (0)