/* eslint-disable @typescript-eslint/no-var-requires */
const fs = require("fs");
const path = require("path");

// ─── tunables ────────────────────────────────────────────────────────────────

// Number of bytes read from the start of each file for the fast path. The
// anchored pattern below only needs the opening `{"item":{"$":{ … }` prefix,
// whose metadata block is capped at 512 characters; files that do not fit
// simply take the full-parse fallback.
const HEAD_BYTES = 4096;
const CONCURRENCY = Number(process.env.LOCALE_CONCURRENCY) || 24;
const DEBUG = process.env.DEBUG_SITECORE_LOCALES === "1";

// Fast path: match the metadata block only when it is the first key of `item`,
// which in turn is the first key of the root object. Anchoring at the start of
// the file means the captured block is `item.$` itself and never a `$` block
// (or a "language" key) nested somewhere else in the document. `[^{}]` keeps
// the capture flat: any nested object inside the block defeats the fast path.
const ITEM_META_RE = /^\s*\{\s*"item"\s*:\s*\{\s*"\$"\s*:\s*\{([^{}]{1,512})\}/;

// The value class excludes backslashes on purpose: a value containing JSON
// escapes (e.g. "en\u002dUS") must be decoded by JSON.parse, so it is left to
// the fallback instead of being returned raw.
const LANG_IN_META_RE = /"language"\s*:\s*"([^"\\]{1,64})"/;

// Directory-name fragments that are never descended into. Matching is by
// substring (same rule as the original synchronous implementation), so a plain
// frozen array is used and no per-entry copy is ever made.
const SKIP_DIR_FRAGMENTS = Object.freeze([
  "__Standard Values",
  "__Prototypes",
  "__Masters",
  "blob",
  "media library",
  "node_modules",
  ".git",
  "__MACOSX",
]);

/**
 * Tell whether a directory name contains one of the skip fragments.
 *
 * @param {string} name - Directory entry name (not a full path).
 * @returns {boolean} True when the directory must not be traversed.
 */
function isSkippedDir(name) {
  return SKIP_DIR_FRAGMENTS.some((fragment) => name.includes(fragment));
}

// ─── phase 1: collect all data.json paths ────────────────────────────────────

/**
 * Recursively collect the full paths of every `data.json` file below `dir`.
 * Unreadable directories are logged and skipped; the walk never rejects.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} [results] - Accumulator shared across the recursion.
 * @returns {Promise<string[]>} The accumulator, filled with matching paths.
 */
async function collectPaths(dir, results = []) {
  if (typeof dir !== "string" || dir.length === 0) {
    console.error("[extractLocales] collectPaths: invalid or empty dir");
    return results;
  }

  let entries;
  try {
    entries = await fs.promises.readdir(dir, { withFileTypes: true });
  } catch (err) {
    console.error(`[extractLocales] cannot read dir ${dir}:`, err?.message ?? err);
    return results;
  }

  const subdirs = [];

  for (const entry of entries) {
    const name = entry?.name;
    if (typeof name !== "string") continue;

    if (entry.isDirectory()) {
      // The skip list only concerns directories, so it is only checked here.
      if (!isSkippedDir(name)) subdirs.push(path.join(dir, name));
    } else if (entry.isFile() && name === "data.json") {
      results.push(path.join(dir, name));
    }
  }

  await Promise.all(subdirs.map((sub) => collectPaths(sub, results)));
  return results;
}

// ─── phase 2: extract language from one file ─────────────────────────────────

/**
 * Read at most HEAD_BYTES from the start of a file and decode them as UTF-8.
 * The file handle is always closed, whether the read succeeds or fails.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} Decoded head of the file.
 */
async function readHead(filePath) {
  const handle = await fs.promises.open(filePath, "r");
  try {
    const buf = Buffer.allocUnsafe(HEAD_BYTES);
    const { bytesRead } = await handle.read(buf, 0, HEAD_BYTES, 0);
    // Only the bytes actually read are decoded; the rest of the unsafe buffer
    // is uninitialised memory.
    return buf.toString("utf8", 0, bytesRead);
  } finally {
    await handle.close();
  }
}

/**
 * Return `item.$.language` for one data.json file.
 *
 * Tries the anchored regex on the head of the file first and falls back to a
 * full read + JSON.parse whenever the fast path does not produce a value.
 * Any I/O or parse error is logged and reported as `null`.
 *
 * @param {string} filePath - Path of the data.json file.
 * @returns {Promise<*|null>} The language value, or null when absent/unreadable.
 */
async function extractLanguage(filePath) {
  if (typeof filePath !== "string" || filePath.length === 0) {
    return null;
  }

  try {
    const head = await readHead(filePath);
    const metaBlock = ITEM_META_RE.exec(head)?.[1];
    const fastLang =
      metaBlock === undefined ? undefined : LANG_IN_META_RE.exec(metaBlock)?.[1];

    if (fastLang !== undefined) {
      if (DEBUG) console.debug(`[fast] ${filePath} → ${fastLang}`);
      return fastLang;
    }

    // Fallback — full parse (identical to the original behaviour).
    if (DEBUG) console.debug(`[fallback] ${filePath}`);
    const raw = await fs.promises.readFile(filePath, "utf8");
    const json = JSON.parse(raw);
    return json?.item?.$?.language ?? null;
  } catch (err) {
    console.error(`[extractLocales] error reading ${filePath}:`, err?.message ?? err);
    return null;
  }
}

// ─── phase 3: bounded-concurrency processing ─────────────────────────────────

/**
 * Extract the language of every path using at most `concurrency` workers.
 *
 * @param {string[]} paths - data.json paths to scan.
 * @param {number} [concurrency] - Worker count; falls back to CONCURRENCY when
 *   not a usable number and is never lower than 1.
 * @returns {Promise<Set<string>>} Unique truthy language values found.
 */
async function processWithConcurrency(paths, concurrency) {
  const locales = new Set();
  if (!Array.isArray(paths)) {
    return locales;
  }

  const limit = Math.max(1, Math.floor(Number(concurrency) || CONCURRENCY));
  const total = paths.length;
  let nextIndex = 0;
  let scanned = 0;

  // Every worker pulls the next unclaimed index. The claim (`nextIndex++`)
  // happens synchronously before the first await, so no path is taken twice.
  async function worker() {
    while (nextIndex < total) {
      const filePath = paths[nextIndex++];
      if (typeof filePath !== "string") continue;

      const lang = await extractLanguage(filePath);
      if (lang) locales.add(lang);

      scanned++;
      if (scanned % 100 === 0) {
        console.info(
          `[extractLocales] progress: ${scanned}/${total} files scanned, ${locales.size} unique locale(s) found`
        );
      }
    }
  }

  await Promise.all(Array.from({ length: Math.min(limit, total) }, () => worker()));

  return locales;
}

// ─── public API ──────────────────────────────────────────────────────────────

/**
 * Walk `dir` and return a Set of all unique locale strings found in data.json files.
 * Asynchronous replacement for the original synchronous version: callers must
 * await the result.
 *
 * @param {string} dir - Root directory of the export to scan.
 * @returns {Promise<Set<string>>} Unique locales; empty when `dir` is invalid.
 */
const extractLocales = async (dir) => {
  if (typeof dir !== "string" || dir.length === 0) {
    console.error("[extractLocales] invalid or empty dir; returning empty locale set");
    return new Set();
  }

  console.info("[extractLocales] starting locale extraction from:", dir);
  console.time("[extractLocales] total extraction time");

  const paths = await collectPaths(dir);
  console.info(`[extractLocales] found ${paths.length} data.json files`);

  const locales = await processWithConcurrency(paths, CONCURRENCY);

  console.timeEnd("[extractLocales] total extraction time");
  console.info(
    `[extractLocales] done — ${paths.length} files scanned, ${locales.size} unique locale(s):`,
    Array.from(locales)
  );

  return locales;
};

module.exports = extractLocales;