// upload-api/migration-sitecore/libs/extractLocales.js
// (captured from a diff view: 171 additions, 68 deletions; hunk -1,78 +1,181)
/* eslint-disable @typescript-eslint/no-var-requires */
const fs = require('fs');
const path = require('path');

const extractLocales = (dir) => {
console.info('🔍 [DEBUG] extractLocales - Starting locale extraction from:', dir);
console.time('🔍 [DEBUG] extractLocales - Total extraction time');

// ✅ Create a new Set for each function call instead of using global
const uniqueLanguages = new Set();
let fileCount = 0;
let processedFiles = 0;

const extractRecursive = (currentDir) => {
try {
const items = fs?.readdirSync?.(currentDir, { withFileTypes: true });

for (const item of items) {
const fullPath = path?.join?.(currentDir, item?.name);

if (item?.isDirectory()) {
// ✅ Skip certain directories that are unlikely to contain locale data
const skipDirs = [
'__Standard Values',
'__Prototypes',
'__Masters',
'blob',
'media library'
];
if (!skipDirs.some((skipDir) => item.name.includes(skipDir))) {
extractRecursive(fullPath);
}
} else if (item?.isFile() && item?.name === 'data.json') {
fileCount++;
try {
const rawData = fs?.readFileSync?.(fullPath, 'utf8');
const jsonData = JSON?.parse?.(rawData);
const language = jsonData?.item?.$?.language;

if (language) {
uniqueLanguages?.add?.(language);
processedFiles++;
console.info(
`🔍 [DEBUG] extractLocales - Found locale: "${language}" in ${fullPath}`
);
}
} catch (error) {
console.error(`🔍 [DEBUG] extractLocales - Error reading ${fullPath}:`, error?.message);
}

// ✅ Progress logging every 100 files
if (fileCount % 100 === 0) {
console.info(
`🔍 [DEBUG] extractLocales - Progress: ${fileCount} files scanned, ${uniqueLanguages.size} unique locales found`
);
}
}
const fs = require("fs");
const path = require("path");

// ─── tunables ────────────────────────────────────────────────────────────────

// Size of the leading window read from each data.json: 128 KiB.
// The fast path only ever looks for item.$.language inside this window.
const HEAD_BYTES = 128 * 1024;

// How many files are processed at once. LOCALE_CONCURRENCY overrides the
// default of 24 whenever it converts to a non-zero, non-NaN number.
const CONCURRENCY = Number(process.env.LOCALE_CONCURRENCY) || 24;

// Per-file debug logging is on only when DEBUG_SITECORE_LOCALES is exactly "1".
const DEBUG = process.env.DEBUG_SITECORE_LOCALES === "1";

// Fast-path patterns.
//
// META_BLOCK_RE captures the body of the first `"$": { ... }` object in the
// head window. The body must be 1–512 characters long and contain no `}`,
// so a block that is longer, or that holds a `}` inside a value, yields either
// no match or a truncated capture.
//
// LANG_IN_META_RE is then applied to that captured body only, which keeps a
// "language" key that lives in nested field content from being picked up.
// The captured language value is 1–64 characters with no double quote.
//
// Callers fall back to a full JSON.parse whenever these do not produce a value.
const META_BLOCK_RE = /"\$"\s*:\s*\{([^}]{1,512})\}/;
const LANG_IN_META_RE = /"language"\s*:\s*"([^"]{1,64})"/;

// Name fragments of directories that are not descended into. Built once at
// module load so the traversal never has to rebuild the list.
const SKIP_DIRS = new Set([
  // Sitecore system / binary-content folders
  ...["__Standard Values", "__Prototypes", "__Masters", "blob", "media library"],
  // Filesystem noise
  ...["node_modules", ".git", "__MACOSX"],
]);

// ─── phase 1: collect all data.json paths ────────────────────────────────────

/**
 * Tell whether a directory should be left out of the traversal.
 *
 * A directory is skipped when its name CONTAINS any SKIP_DIRS entry as a
 * substring (not only on an exact match). Iterates the Set directly so no
 * temporary array is allocated per directory entry.
 *
 * @param {string} name - Directory name (not a full path).
 * @returns {boolean} true when the directory must not be descended into.
 */
function isSkippedDir(name) {
  for (const fragment of SKIP_DIRS) {
    if (name.includes(fragment)) return true;
  }
  return false;
}

/**
 * Recursively collect the path of every file named exactly "data.json" under `dir`.
 *
 * Sibling sub-directories are walked concurrently, so the order of the returned
 * paths is not guaranteed. Unreadable directories are logged and skipped; this
 * function does not reject for them.
 *
 * @param {string} dir - Directory to walk.
 * @param {string[]} [results=[]] - Accumulator shared across the recursion; it is
 *   mutated in place and also returned.
 * @returns {Promise<string[]>} The accumulator, holding every data.json path found.
 */
async function collectPaths(dir, results = []) {
  if (typeof dir !== "string" || dir.length === 0) {
    console.error("[extractLocales] collectPaths: invalid or empty dir");
    return results;
  }

  let entries;
  try {
    entries = await fs.promises.readdir(dir, { withFileTypes: true });
  } catch (err) {
    console.error(`[extractLocales] cannot read dir ${dir}:`, err?.message ?? err);
    return results;
  }

  const subdirs = [];

  for (const entry of entries) {
    const name = entry?.name;
    if (typeof name !== "string") continue;

    if (entry.isDirectory()) {
      // The skip list only ever applies to directories.
      if (!isSkippedDir(name)) subdirs.push(path.join(dir, name));
    } else if (name === "data.json" && entry.isFile()) {
      results.push(path.join(dir, name));
    }
  }

  await Promise.all(subdirs.map((subdir) => collectPaths(subdir, results)));
  return results;
}

// ─── phase 2: extract language from one file ─────────────────────────────────

/**
 * Read the locale recorded in one data.json file (the value at item.$.language).
 *
 * Fast path: read at most HEAD_BYTES from the start of the file, locate the
 * first `"$": { ... }` block with META_BLOCK_RE and look for "language" inside
 * it with LANG_IN_META_RE.
 *
 * Fallback: parse the JSON in full. When the whole file already fit inside the
 * head window the bytes in hand are parsed directly, so the file is not opened
 * and read a second time.
 *
 * Never rejects: any I/O or parse error is logged and reported as null.
 *
 * @param {string} filePath - Path of the data.json file.
 * @returns {Promise<string|null>} The fast-path language string, or whatever
 *   item.$.language holds in the parsed JSON; null when it is absent, the path
 *   is invalid, or the file cannot be read/parsed.
 */
async function extractLanguage(filePath) {
  if (typeof filePath !== "string" || filePath.length === 0) {
    return null;
  }

  try {
    const buf = Buffer.allocUnsafe(HEAD_BYTES);
    let filled = 0;

    const fd = await fs.promises.open(filePath, "r");
    try {
      // One read() may return fewer bytes than requested, so keep reading until
      // the window is full or end-of-file (bytesRead === 0) is reached. After
      // this loop, `filled < HEAD_BYTES` means the entire file is in `buf`.
      while (filled < HEAD_BYTES) {
        const { bytesRead } = await fd.read(buf, filled, HEAD_BYTES - filled, filled);
        if (bytesRead === 0) break;
        filled += bytesRead;
      }
    } finally {
      // Single close site; a close failure is reported by the outer catch.
      await fd.close();
    }

    const head = buf.toString("utf8", 0, filled);
    const metaSlice = META_BLOCK_RE.exec(head)?.[1];
    const fastMatch = metaSlice != null ? LANG_IN_META_RE.exec(metaSlice) : null;

    if (fastMatch?.[1]) {
      if (DEBUG) console.debug(`[fast] ${filePath} → ${fastMatch[1]}`);
      return fastMatch[1];
    }

    // Fallback — full parse. Only hit the disk again when the file is larger
    // than the window that was already read.
    if (DEBUG) console.debug(`[fallback] ${filePath}`);
    const raw = filled < HEAD_BYTES ? head : await fs.promises.readFile(filePath, "utf8");
    const json = JSON.parse(raw);
    return json?.item?.$?.language ?? null;
  } catch (err) {
    console.error(`[extractLocales] error reading ${filePath}:`, err?.message ?? err);
    return null;
  }
}

// ─── phase 3: bounded-concurrency processing ─────────────────────────────────

/**
 * Extract the language of every path with a bounded number of files in flight.
 *
 * A fixed pool of workers pulls paths from a shared cursor; each worker handles
 * one file at a time via extractLanguage, so at most `limit` files are being
 * read at any moment.
 *
 * @param {string[]} paths - data.json paths to inspect. Entries that are not
 *   strings are skipped and are not counted as scanned.
 * @param {number} [concurrency] - Maximum files in flight. Values that convert
 *   to 0 or NaN fall back to CONCURRENCY; the result is clamped to at least 1.
 * @returns {Promise<Set<string>>} Unique truthy languages found; an empty Set
 *   when `paths` is not an array.
 */
async function processWithConcurrency(paths, concurrency) {
  const locales = new Set();
  if (!Array.isArray(paths)) {
    return locales;
  }

  const limit = Math.max(1, Number(concurrency) || CONCURRENCY);
  const total = paths.length;
  let idx = 0; // shared cursor: next path to hand out
  let scanned = 0;

  async function worker() {
    while (idx < paths.length) {
      // Claim the index synchronously, before any await, so no two workers
      // ever take the same path.
      const filePath = paths[idx++];
      if (typeof filePath !== "string") continue;

      const lang = await extractLanguage(filePath);
      if (lang) locales.add(lang);

      scanned++;
      if (scanned % 100 === 0) {
        console.info(
          `[extractLocales] progress: ${scanned}/${total} files scanned, ${locales.size} unique locale(s) found`
        );
      }
    }
  }

  // Never start more workers than there are paths.
  await Promise.all(
    Array.from({ length: Math.min(limit, paths.length) }, () => worker())
  );

  return locales;
}

// ─── public API ──────────────────────────────────────────────────────────────

/**
 * Walk `dir` and return a Set of all unique locale strings found in data.json files.
 *
 * Asynchronous: the result is a Promise, so callers must await it. The walk
 * happens in two steps — collectPaths gathers the data.json paths, then
 * processWithConcurrency reads them with at most CONCURRENCY files in flight.
 *
 * @param {string} dir - Root directory of the export to scan.
 * @returns {Promise<Set<string>>} Unique locales; an empty Set when `dir` is
 *   not a non-empty string.
 */
const extractLocales = async (dir) => {
  if (typeof dir !== "string" || dir.length === 0) {
    console.error("[extractLocales] invalid or empty dir; returning empty locale set");
    return new Set();
  }

  const timerLabel = "[extractLocales] total extraction time";
  console.info("[extractLocales] starting locale extraction from:", dir);
  console.time(timerLabel);

  try {
    const paths = await collectPaths(dir);
    console.info(`[extractLocales] found ${paths.length} data.json files`);

    const locales = await processWithConcurrency(paths, CONCURRENCY);

    console.info(
      `[extractLocales] done — ${paths.length} files scanned, ${locales.size} unique locale(s):`,
      Array.from(locales)
    );

    return locales;
  } finally {
    // End the timer that was started above, even if a step rejects.
    console.timeEnd(timerLabel);
  }
};

// The module's only export is the extractLocales function itself.
module.exports = extractLocales;
Loading