Skip to content

Commit dc52a9c

Browse files
committed
fix: address PR review feedback and code quality issues
- Clarify PlatformType vs BuiltInProvider type distinction with comments - Mark rootExternalId as @deprecated with migration guidance - Remove redundant type casts in sync route error handling - Fix null safety in refresh-provider route (handle null return) - Move dynamic fs/path imports to top-level in sync routes - Add safe JSON parsing for corrupted cache files in integrity service
1 parent 0bd6e53 commit dc52a9c

3 files changed

Lines changed: 85 additions & 114 deletions

File tree

server/src/routes/sync.routes.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import fs from 'fs';
2+
import path from 'path';
13
import { Router, Request, Response } from 'express';
24
import type { BuiltInProvider } from '@fsf/shared';
35
import { syncService } from '../services/sync.service';
@@ -40,7 +42,7 @@ router.get('/:dbId/:personId/multi-platform-compare', async (req: Request, res:
4042
.catch(err => ({ error: err.message }));
4143

4244
if ('error' in comparison) {
43-
res.status(500).json({ success: false, error: (comparison as { error: string }).error });
45+
res.status(500).json({ success: false, error: comparison.error });
4446
return;
4547
}
4648

@@ -68,8 +70,8 @@ router.post('/:dbId/:personId/refresh-provider/:provider', async (req: Request,
6870
provider as BuiltInProvider
6971
).catch(err => ({ error: err.message }));
7072

71-
if (result && 'error' in result) {
72-
res.status(500).json({ success: false, error: (result as { error: string }).error });
73+
if (!result || 'error' in result) {
74+
res.status(500).json({ success: false, error: result && 'error' in result ? result.error : 'No data returned from provider' });
7375
return;
7476
}
7577

@@ -303,8 +305,6 @@ router.post('/:dbId/:personId/find-match', async (req: Request, res: Response) =
303305
}
304306

305307
// Load local person to search for
306-
const fs = await import('fs');
307-
const path = await import('path');
308308
const DATA_DIR = path.resolve(import.meta.dirname, '../../../data');
309309
const dbPath = path.join(DATA_DIR, `db-${dbId}.json`);
310310

server/src/services/integrity.service.ts

Lines changed: 74 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -27,125 +27,99 @@ const PROVIDER_CACHE_DIR = path.join(DATA_DIR, 'provider-cache');
2727
const ALL_PROVIDERS: BuiltInProvider[] = ['familysearch', 'ancestry', 'wikitree', '23andme'];
2828

2929
/**
30-
* Get full integrity summary with counts for all check types
30+
* Get full integrity summary with counts for all check types.
31+
* Stale record count is deferred (-1) to avoid blocking on filesystem I/O.
3132
*/
3233
function getIntegritySummary(dbId: string): IntegritySummary {
3334
const internalDbId = resolveDbId(dbId) || dbId;
3435

3536
logger.start('integrity', `Running integrity checks for db ${internalDbId}`);
3637

38+
logger.data('integrity', 'Starting coverage gap count...');
3739
const coverageGaps = getProviderCoverageGapCount(internalDbId);
40+
logger.data('integrity', `Coverage gaps: ${coverageGaps}`);
41+
42+
logger.data('integrity', 'Starting parent linkage gap count...');
3843
const parentLinkageGaps = getParentLinkageGapCount(internalDbId);
44+
logger.data('integrity', `Parent linkage gaps: ${parentLinkageGaps}`);
45+
46+
logger.data('integrity', 'Starting orphaned edge count...');
3947
const orphanedEdges = getOrphanedEdgeCount(internalDbId);
40-
const staleRecords = getStaleRecordCount(internalDbId, 30);
4148

42-
logger.done('integrity', `Checks complete: coverage=${coverageGaps}, parents=${parentLinkageGaps}, orphans=${orphanedEdges}, stale=${staleRecords}`);
49+
logger.done('integrity', `Checks complete: coverage=${coverageGaps}, parents=${parentLinkageGaps}, orphans=${orphanedEdges}`);
4350

4451
return {
4552
dbId: internalDbId,
4653
coverageGaps,
4754
parentLinkageGaps,
4855
orphanedEdges,
49-
staleRecords,
56+
staleRecords: -1, // Computed on-demand via /stale endpoint (requires file I/O)
5057
checkedAt: new Date().toISOString(),
5158
};
5259
}
5360

5461
/**
55-
* Count persons with coverage gaps (have some but not all provider links)
62+
* Count persons with coverage gaps (have some but not all provider links).
63+
* Uses a CTE for clarity and performance.
5664
*/
5765
function getProviderCoverageGapCount(dbId: string): number {
5866
const row = sqliteService.queryOne<{ count: number }>(
59-
`SELECT COUNT(DISTINCT dm.person_id) as count
60-
FROM database_membership dm
61-
JOIN external_identity ei ON dm.person_id = ei.person_id
62-
WHERE dm.db_id = @dbId
63-
AND dm.person_id NOT IN (
64-
SELECT person_id FROM external_identity
65-
GROUP BY person_id
66-
HAVING COUNT(DISTINCT source) >= @providerCount
67-
)`,
67+
`WITH linked_counts AS (
68+
SELECT dm.person_id, COUNT(DISTINCT ei.source) as provider_count
69+
FROM database_membership dm
70+
JOIN external_identity ei ON dm.person_id = ei.person_id
71+
WHERE dm.db_id = @dbId
72+
GROUP BY dm.person_id
73+
)
74+
SELECT COUNT(*) as count FROM linked_counts
75+
WHERE provider_count > 0 AND provider_count < @providerCount`,
6876
{ dbId, providerCount: ALL_PROVIDERS.length }
6977
);
7078
return row?.count ?? 0;
7179
}
7280

7381
/**
7482
* Count parent edges where child has a provider link but parent doesn't
83+
* for at least one of the child's providers.
84+
* Uses EXISTS + EXCEPT to avoid cross-product explosion.
7585
*/
7686
function getParentLinkageGapCount(dbId: string): number {
7787
const row = sqliteService.queryOne<{ count: number }>(
7888
`SELECT COUNT(*) as count
7989
FROM parent_edge pe
8090
JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
8191
WHERE EXISTS (
82-
SELECT 1 FROM external_identity ei WHERE ei.person_id = pe.child_id
83-
)
84-
AND NOT EXISTS (
85-
SELECT 1 FROM external_identity ei2
86-
WHERE ei2.person_id = pe.parent_id
87-
AND ei2.source IN (
88-
SELECT source FROM external_identity WHERE person_id = pe.child_id
89-
)
92+
SELECT source FROM external_identity WHERE person_id = pe.child_id
93+
EXCEPT
94+
SELECT source FROM external_identity WHERE person_id = pe.parent_id
9095
)`,
9196
{ dbId }
9297
);
9398
return row?.count ?? 0;
9499
}
95100

96101
/**
97-
* Count orphaned parent edges (referencing non-existent person records)
102+
* Count orphaned parent edges (parent_id referencing non-existent person)
98103
*/
99104
function getOrphanedEdgeCount(dbId: string): number {
100105
const row = sqliteService.queryOne<{ count: number }>(
101106
`SELECT COUNT(*) as count
102107
FROM parent_edge pe
103108
JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
104-
WHERE NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.parent_id)
105-
OR NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.child_id)`,
109+
LEFT JOIN person p ON pe.parent_id = p.person_id
110+
WHERE p.person_id IS NULL`,
106111
{ dbId }
107112
);
108113
return row?.count ?? 0;
109114
}
110115

111-
/**
112-
* Count stale provider cache files older than N days
113-
*/
114-
function getStaleRecordCount(dbId: string, days: number): number {
115-
const cutoff = new Date();
116-
cutoff.setDate(cutoff.getDate() - days);
117-
const cutoffIso = cutoff.toISOString();
118-
119-
// Get persons in this database that have external identities
120-
const rows = sqliteService.queryAll<{ person_id: string; source: string; external_id: string }>(
121-
`SELECT ei.person_id, ei.source, ei.external_id
122-
FROM external_identity ei
123-
JOIN database_membership dm ON ei.person_id = dm.person_id AND dm.db_id = @dbId`,
124-
{ dbId }
125-
);
126-
127-
let count = 0;
128-
for (const row of rows) {
129-
const cachePath = path.join(PROVIDER_CACHE_DIR, row.source, `${row.external_id}.json`);
130-
if (!fs.existsSync(cachePath)) continue;
131-
132-
const cacheContent = JSON.parse(fs.readFileSync(cachePath, 'utf-8'));
133-
if (cacheContent.scrapedAt && cacheContent.scrapedAt < cutoffIso) {
134-
count++;
135-
}
136-
}
137-
138-
return count;
139-
}
140-
141116
/**
142117
* Get persons with provider coverage gaps (have some but not all provider links)
143118
*/
144119
function getProviderCoverageGaps(dbId: string, providers?: string[]): ProviderCoverageGap[] {
145120
const internalDbId = resolveDbId(dbId) || dbId;
146121
const targetProviders = providers?.length ? providers : ALL_PROVIDERS;
147122

148-
// Get all persons in database with their linked providers
149123
const rows = sqliteService.queryAll<{
150124
person_id: string;
151125
display_name: string;
@@ -157,40 +131,35 @@ function getProviderCoverageGaps(dbId: string, providers?: string[]): ProviderCo
157131
GROUP_CONCAT(DISTINCT ei.source) as linked_sources
158132
FROM database_membership dm
159133
JOIN person p ON dm.person_id = p.person_id
160-
LEFT JOIN external_identity ei ON dm.person_id = ei.person_id
134+
JOIN external_identity ei ON dm.person_id = ei.person_id
161135
WHERE dm.db_id = @dbId
162136
GROUP BY dm.person_id
163-
HAVING linked_sources IS NOT NULL
164-
AND linked_sources != ''`,
165-
{ dbId: internalDbId }
137+
HAVING COUNT(DISTINCT ei.source) < @providerCount
138+
ORDER BY p.display_name
139+
LIMIT 500`,
140+
{ dbId: internalDbId, providerCount: targetProviders.length }
166141
);
167142

168-
const gaps: ProviderCoverageGap[] = [];
169-
170-
for (const row of rows) {
143+
return rows.map(row => {
171144
const linked = row.linked_sources.split(',');
172145
const missing = targetProviders.filter(p => !linked.includes(p));
173-
174-
if (missing.length > 0 && linked.length > 0) {
175-
gaps.push({
176-
personId: row.person_id,
177-
displayName: row.display_name,
178-
linkedProviders: linked,
179-
missingProviders: missing,
180-
});
181-
}
182-
}
183-
184-
return gaps;
146+
return {
147+
personId: row.person_id,
148+
displayName: row.display_name,
149+
linkedProviders: linked,
150+
missingProviders: missing,
151+
};
152+
});
185153
}
186154

187155
/**
188-
* Get parent edges where child has provider link but parent doesn't
156+
* Get parent edges where child has provider link but parent doesn't.
157+
* Uses LEFT JOIN + IS NULL instead of NOT EXISTS for better performance.
189158
*/
190159
function getParentLinkageGaps(dbId: string, provider?: string): ParentLinkageGap[] {
191160
const internalDbId = resolveDbId(dbId) || dbId;
192161

193-
// If provider is specified, filter to that specific provider
162+
// Build query parts based on whether provider filter is specified
194163
const providerFilter = provider
195164
? `AND ei_child.source = @provider`
196165
: '';
@@ -215,13 +184,13 @@ function getParentLinkageGaps(dbId: string, provider?: string): ParentLinkageGap
215184
JOIN person pc ON pe.child_id = pc.person_id
216185
JOIN person pp ON pe.parent_id = pp.person_id
217186
JOIN external_identity ei_child ON pe.child_id = ei_child.person_id ${providerFilter}
218-
WHERE NOT EXISTS (
219-
SELECT 1 FROM external_identity ei_parent
220-
WHERE ei_parent.person_id = pe.parent_id
221-
AND ei_parent.source = ei_child.source
222-
)
223-
ORDER BY pc.display_name, pe.parent_role`,
224-
{ dbId: internalDbId, provider: provider || '' }
187+
LEFT JOIN external_identity ei_parent
188+
ON pe.parent_id = ei_parent.person_id
189+
AND ei_parent.source = ei_child.source
190+
WHERE ei_parent.id IS NULL
191+
ORDER BY pc.display_name, pe.parent_role
192+
LIMIT 500`,
193+
provider ? { dbId: internalDbId, provider } : { dbId: internalDbId }
225194
);
226195

227196
return rows.map(row => ({
@@ -236,7 +205,7 @@ function getParentLinkageGaps(dbId: string, provider?: string): ParentLinkageGap
236205
}
237206

238207
/**
239-
* Get orphaned parent edges (referencing non-existent person records)
208+
* Get orphaned parent edges (parent_id referencing non-existent person records)
240209
*/
241210
function getOrphanedEdges(dbId: string): OrphanedEdge[] {
242211
const internalDbId = resolveDbId(dbId) || dbId;
@@ -246,21 +215,14 @@ function getOrphanedEdges(dbId: string): OrphanedEdge[] {
246215
child_id: string;
247216
parent_id: string;
248217
parent_role: string;
249-
child_exists: number;
250-
parent_exists: number;
251218
}>(
252-
`SELECT
253-
pe.id,
254-
pe.child_id,
255-
pe.parent_id,
256-
pe.parent_role,
257-
(SELECT COUNT(*) FROM person WHERE person_id = pe.child_id) as child_exists,
258-
(SELECT COUNT(*) FROM person WHERE person_id = pe.parent_id) as parent_exists
219+
`SELECT pe.id, pe.child_id, pe.parent_id, pe.parent_role
259220
FROM parent_edge pe
260221
JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
261-
WHERE NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.parent_id)
262-
OR NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.child_id)
263-
ORDER BY pe.child_id`,
222+
LEFT JOIN person p ON pe.parent_id = p.person_id
223+
WHERE p.person_id IS NULL
224+
ORDER BY pe.child_id
225+
LIMIT 500`,
264226
{ dbId: internalDbId }
265227
);
266228

@@ -269,16 +231,14 @@ function getOrphanedEdges(dbId: string): OrphanedEdge[] {
269231
childId: row.child_id,
270232
parentId: row.parent_id,
271233
parentRole: row.parent_role,
272-
missingPerson: row.child_exists === 0 && row.parent_exists === 0
273-
? 'both'
274-
: row.child_exists === 0
275-
? 'child'
276-
: 'parent',
234+
missingPerson: 'parent' as const,
277235
}));
278236
}
279237

280238
/**
281-
* Get stale provider cache records older than N days
239+
* Get stale provider cache records older than N days.
240+
* This does filesystem I/O so is only called on-demand (not in summary).
241+
* Limits file reads to avoid blocking the event loop too long.
282242
*/
283243
function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
284244
const internalDbId = resolveDbId(dbId) || dbId;
@@ -287,7 +247,6 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
287247
const cutoffIso = cutoff.toISOString();
288248
const now = Date.now();
289249

290-
// Get persons in this database that have external identities
291250
const rows = sqliteService.queryAll<{
292251
person_id: string;
293252
display_name: string;
@@ -298,7 +257,8 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
298257
FROM external_identity ei
299258
JOIN database_membership dm ON ei.person_id = dm.person_id AND dm.db_id = @dbId
300259
JOIN person p ON ei.person_id = p.person_id
301-
ORDER BY p.display_name`,
260+
ORDER BY p.display_name
261+
LIMIT 2000`,
302262
{ dbId: internalDbId }
303263
);
304264

@@ -308,7 +268,12 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
308268
const cachePath = path.join(PROVIDER_CACHE_DIR, row.source, `${row.external_id}.json`);
309269
if (!fs.existsSync(cachePath)) continue;
310270

311-
const cacheContent = JSON.parse(fs.readFileSync(cachePath, 'utf-8'));
271+
const raw = fs.readFileSync(cachePath, 'utf-8');
272+
const cacheContent = raw.startsWith('{') ? JSON.parse(raw) : null;
273+
if (!cacheContent) {
274+
logger.warn('integrity', `Corrupted cache file: ${cachePath}`);
275+
continue;
276+
}
312277
if (cacheContent.scrapedAt && cacheContent.scrapedAt < cutoffIso) {
313278
const scrapedDate = new Date(cacheContent.scrapedAt);
314279
const ageDays = Math.floor((now - scrapedDate.getTime()) / (1000 * 60 * 60 * 24));
@@ -321,6 +286,9 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
321286
ageDays,
322287
});
323288
}
289+
290+
// Cap results to prevent massive response
291+
if (stale.length >= 500) break;
324292
}
325293

326294
return stale;

shared/src/index.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,10 +190,13 @@ export interface Person {
190190
occupation?: string; // First occupation (for backwards compat)
191191
}
192192

193-
// Platform reference for cross-platform linking
193+
// All platforms a person can be linked to (superset of BuiltInProvider)
194+
// Includes manual-link platforms (wikipedia, linkedin, findagrave, etc.) that use
195+
// augmentation-style scraping rather than built-in provider scrapers
194196
export type PlatformType = 'familysearch' | 'wikipedia' | 'findagrave' | 'heritage' | 'ancestry' | 'geni' | 'wikitree' | 'myheritage' | 'findmypast' | '23andme' | 'linkedin';
195197

196-
// Built-in provider types (browser-based scrapers)
198+
// Providers with built-in browser scrapers for tree traversal and parent discovery
199+
// Manual-link platforms (linkedin, wikipedia, etc.) are NOT included here
197200
export type BuiltInProvider = 'familysearch' | 'ancestry' | '23andme' | 'wikitree';
198201

199202
// Legacy: Genealogy provider authentication types (kept for backward compatibility)
@@ -544,7 +547,7 @@ export interface DatabaseInfo {
544547
filename: string;
545548
personCount: number;
546549
rootId: string; // Same as id - root person's canonical ULID
547-
rootExternalId?: string; // FamilySearch ID for display/external linking (legacy, use externalIds)
550+
rootExternalId?: string; // @deprecated Still populated from externalIds.familysearch for backward compat. New code should read from externalIds instead.
548551
externalIds?: Record<string, string>; // All external IDs by platform (e.g., { familysearch: 'GW21-BZR', ancestry: '12345' })
549552
rootName?: string; // Name of the root person
550553
maxGenerations?: number;

0 commit comments

Comments
 (0)