@@ -27,125 +27,99 @@ const PROVIDER_CACHE_DIR = path.join(DATA_DIR, 'provider-cache');
2727const ALL_PROVIDERS : BuiltInProvider [ ] = [ 'familysearch' , 'ancestry' , 'wikitree' , '23andme' ] ;
2828
2929/**
30- * Get full integrity summary with counts for all check types
30+ * Get full integrity summary with counts for all check types.
31+ * Stale record count is deferred (-1) to avoid blocking on filesystem I/O.
3132 */
3233function getIntegritySummary ( dbId : string ) : IntegritySummary {
3334 const internalDbId = resolveDbId ( dbId ) || dbId ;
3435
3536 logger . start ( 'integrity' , `Running integrity checks for db ${ internalDbId } ` ) ;
3637
38+ logger . data ( 'integrity' , 'Starting coverage gap count...' ) ;
3739 const coverageGaps = getProviderCoverageGapCount ( internalDbId ) ;
40+ logger . data ( 'integrity' , `Coverage gaps: ${ coverageGaps } ` ) ;
41+
42+ logger . data ( 'integrity' , 'Starting parent linkage gap count...' ) ;
3843 const parentLinkageGaps = getParentLinkageGapCount ( internalDbId ) ;
44+ logger . data ( 'integrity' , `Parent linkage gaps: ${ parentLinkageGaps } ` ) ;
45+
46+ logger . data ( 'integrity' , 'Starting orphaned edge count...' ) ;
3947 const orphanedEdges = getOrphanedEdgeCount ( internalDbId ) ;
40- const staleRecords = getStaleRecordCount ( internalDbId , 30 ) ;
4148
42- logger . done ( 'integrity' , `Checks complete: coverage=${ coverageGaps } , parents=${ parentLinkageGaps } , orphans=${ orphanedEdges } , stale= ${ staleRecords } ` ) ;
49+ logger . done ( 'integrity' , `Checks complete: coverage=${ coverageGaps } , parents=${ parentLinkageGaps } , orphans=${ orphanedEdges } ` ) ;
4350
4451 return {
4552 dbId : internalDbId ,
4653 coverageGaps,
4754 parentLinkageGaps,
4855 orphanedEdges,
49- staleRecords,
56+ staleRecords : - 1 , // Computed on-demand via /stale endpoint (requires file I/O)
5057 checkedAt : new Date ( ) . toISOString ( ) ,
5158 } ;
5259}
5360
5461/**
55- * Count persons with coverage gaps (have some but not all provider links)
62+ * Count persons with coverage gaps (have some but not all provider links).
63+ * Uses a CTE for clarity and performance.
5664 */
5765function getProviderCoverageGapCount ( dbId : string ) : number {
5866 const row = sqliteService . queryOne < { count : number } > (
59- `SELECT COUNT(DISTINCT dm.person_id) as count
60- FROM database_membership dm
61- JOIN external_identity ei ON dm.person_id = ei.person_id
62- WHERE dm.db_id = @dbId
63- AND dm.person_id NOT IN (
64- SELECT person_id FROM external_identity
65- GROUP BY person_id
66- HAVING COUNT(DISTINCT source) >= @providerCount
67- ) ` ,
67+ `WITH linked_counts AS (
68+ SELECT dm.person_id, COUNT(DISTINCT ei.source) as provider_count
69+ FROM database_membership dm
70+ JOIN external_identity ei ON dm.person_id = ei.person_id
71+ WHERE dm.db_id = @dbId
72+ GROUP BY dm. person_id
73+ )
74+ SELECT COUNT(*) as count FROM linked_counts
75+ WHERE provider_count > 0 AND provider_count < @providerCount ` ,
6876 { dbId, providerCount : ALL_PROVIDERS . length }
6977 ) ;
7078 return row ?. count ?? 0 ;
7179}
7280
7381/**
7482 * Count parent edges where child has a provider link but parent doesn't
83+ * for at least one of the child's providers.
84+ * Uses EXISTS + EXCEPT to avoid cross-product explosion.
7585 */
7686function getParentLinkageGapCount ( dbId : string ) : number {
7787 const row = sqliteService . queryOne < { count : number } > (
7888 `SELECT COUNT(*) as count
7989 FROM parent_edge pe
8090 JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
8191 WHERE EXISTS (
82- SELECT 1 FROM external_identity ei WHERE ei.person_id = pe.child_id
83- )
84- AND NOT EXISTS (
85- SELECT 1 FROM external_identity ei2
86- WHERE ei2.person_id = pe.parent_id
87- AND ei2.source IN (
88- SELECT source FROM external_identity WHERE person_id = pe.child_id
89- )
92+ SELECT source FROM external_identity WHERE person_id = pe.child_id
93+ EXCEPT
94+ SELECT source FROM external_identity WHERE person_id = pe.parent_id
9095 )` ,
9196 { dbId }
9297 ) ;
9398 return row ?. count ?? 0 ;
9499}
95100
96101/**
97- * Count orphaned parent edges (referencing non-existent person records )
102+ * Count orphaned parent edges (parent_id referencing non-existent person)
98103 */
99104function getOrphanedEdgeCount ( dbId : string ) : number {
100105 const row = sqliteService . queryOne < { count : number } > (
101106 `SELECT COUNT(*) as count
102107 FROM parent_edge pe
103108 JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
104- WHERE NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.parent_id)
105- OR NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.child_id) ` ,
109+ LEFT JOIN person p ON pe.parent_id = p.person_id
110+ WHERE p.person_id IS NULL ` ,
106111 { dbId }
107112 ) ;
108113 return row ?. count ?? 0 ;
109114}
110115
111- /**
112- * Count stale provider cache files older than N days
113- */
114- function getStaleRecordCount ( dbId : string , days : number ) : number {
115- const cutoff = new Date ( ) ;
116- cutoff . setDate ( cutoff . getDate ( ) - days ) ;
117- const cutoffIso = cutoff . toISOString ( ) ;
118-
119- // Get persons in this database that have external identities
120- const rows = sqliteService . queryAll < { person_id : string ; source : string ; external_id : string } > (
121- `SELECT ei.person_id, ei.source, ei.external_id
122- FROM external_identity ei
123- JOIN database_membership dm ON ei.person_id = dm.person_id AND dm.db_id = @dbId` ,
124- { dbId }
125- ) ;
126-
127- let count = 0 ;
128- for ( const row of rows ) {
129- const cachePath = path . join ( PROVIDER_CACHE_DIR , row . source , `${ row . external_id } .json` ) ;
130- if ( ! fs . existsSync ( cachePath ) ) continue ;
131-
132- const cacheContent = JSON . parse ( fs . readFileSync ( cachePath , 'utf-8' ) ) ;
133- if ( cacheContent . scrapedAt && cacheContent . scrapedAt < cutoffIso ) {
134- count ++ ;
135- }
136- }
137-
138- return count ;
139- }
140-
141116/**
142117 * Get persons with provider coverage gaps (have some but not all provider links)
143118 */
144119function getProviderCoverageGaps ( dbId : string , providers ?: string [ ] ) : ProviderCoverageGap [ ] {
145120 const internalDbId = resolveDbId ( dbId ) || dbId ;
146121 const targetProviders = providers ?. length ? providers : ALL_PROVIDERS ;
147122
148- // Get all persons in database with their linked providers
149123 const rows = sqliteService . queryAll < {
150124 person_id : string ;
151125 display_name : string ;
@@ -157,40 +131,35 @@ function getProviderCoverageGaps(dbId: string, providers?: string[]): ProviderCo
157131 GROUP_CONCAT(DISTINCT ei.source) as linked_sources
158132 FROM database_membership dm
159133 JOIN person p ON dm.person_id = p.person_id
160- LEFT JOIN external_identity ei ON dm.person_id = ei.person_id
134+ JOIN external_identity ei ON dm.person_id = ei.person_id
161135 WHERE dm.db_id = @dbId
162136 GROUP BY dm.person_id
163- HAVING linked_sources IS NOT NULL
164- AND linked_sources != ''` ,
165- { dbId : internalDbId }
137+ HAVING COUNT(DISTINCT ei.source) < @providerCount
138+ ORDER BY p.display_name
139+ LIMIT 500` ,
140+ { dbId : internalDbId , providerCount : targetProviders . length }
166141 ) ;
167142
168- const gaps : ProviderCoverageGap [ ] = [ ] ;
169-
170- for ( const row of rows ) {
143+ return rows . map ( row => {
171144 const linked = row . linked_sources . split ( ',' ) ;
172145 const missing = targetProviders . filter ( p => ! linked . includes ( p ) ) ;
173-
174- if ( missing . length > 0 && linked . length > 0 ) {
175- gaps . push ( {
176- personId : row . person_id ,
177- displayName : row . display_name ,
178- linkedProviders : linked ,
179- missingProviders : missing ,
180- } ) ;
181- }
182- }
183-
184- return gaps ;
146+ return {
147+ personId : row . person_id ,
148+ displayName : row . display_name ,
149+ linkedProviders : linked ,
150+ missingProviders : missing ,
151+ } ;
152+ } ) ;
185153}
186154
187155/**
188- * Get parent edges where child has provider link but parent doesn't
156+ * Get parent edges where child has provider link but parent doesn't.
157+ * Uses LEFT JOIN + IS NULL instead of NOT EXISTS for better performance.
189158 */
190159function getParentLinkageGaps ( dbId : string , provider ?: string ) : ParentLinkageGap [ ] {
191160 const internalDbId = resolveDbId ( dbId ) || dbId ;
192161
193- // If provider is specified, filter to that specific provider
162+ // Build query parts based on whether provider filter is specified
194163 const providerFilter = provider
195164 ? `AND ei_child.source = @provider`
196165 : '' ;
@@ -215,13 +184,13 @@ function getParentLinkageGaps(dbId: string, provider?: string): ParentLinkageGap
215184 JOIN person pc ON pe.child_id = pc.person_id
216185 JOIN person pp ON pe.parent_id = pp.person_id
217186 JOIN external_identity ei_child ON pe.child_id = ei_child.person_id ${ providerFilter }
218- WHERE NOT EXISTS (
219- SELECT 1 FROM external_identity ei_parent
220- WHERE ei_parent.person_id = pe.parent_id
221- AND ei_parent.source = ei_child.source
222- )
223- ORDER BY pc.display_name, pe.parent_role ` ,
224- { dbId : internalDbId , provider : provider || '' }
187+ LEFT JOIN external_identity ei_parent
188+ ON pe.parent_id = ei_parent.person_id
189+ AND ei_parent.source = ei_child.source
190+ WHERE ei_parent.id IS NULL
191+ ORDER BY pc.display_name, pe.parent_role
192+ LIMIT 500 ` ,
193+ provider ? { dbId : internalDbId , provider } : { dbId : internalDbId }
225194 ) ;
226195
227196 return rows . map ( row => ( {
@@ -236,7 +205,7 @@ function getParentLinkageGaps(dbId: string, provider?: string): ParentLinkageGap
236205}
237206
238207/**
239- * Get orphaned parent edges (referencing non-existent person records)
208+ * Get orphaned parent edges (parent_id referencing non-existent person records)
240209 */
241210function getOrphanedEdges ( dbId : string ) : OrphanedEdge [ ] {
242211 const internalDbId = resolveDbId ( dbId ) || dbId ;
@@ -246,21 +215,14 @@ function getOrphanedEdges(dbId: string): OrphanedEdge[] {
246215 child_id : string ;
247216 parent_id : string ;
248217 parent_role : string ;
249- child_exists : number ;
250- parent_exists : number ;
251218 } > (
252- `SELECT
253- pe.id,
254- pe.child_id,
255- pe.parent_id,
256- pe.parent_role,
257- (SELECT COUNT(*) FROM person WHERE person_id = pe.child_id) as child_exists,
258- (SELECT COUNT(*) FROM person WHERE person_id = pe.parent_id) as parent_exists
219+ `SELECT pe.id, pe.child_id, pe.parent_id, pe.parent_role
259220 FROM parent_edge pe
260221 JOIN database_membership dm ON pe.child_id = dm.person_id AND dm.db_id = @dbId
261- WHERE NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.parent_id)
262- OR NOT EXISTS (SELECT 1 FROM person p WHERE p.person_id = pe.child_id)
263- ORDER BY pe.child_id` ,
222+ LEFT JOIN person p ON pe.parent_id = p.person_id
223+ WHERE p.person_id IS NULL
224+ ORDER BY pe.child_id
225+ LIMIT 500` ,
264226 { dbId : internalDbId }
265227 ) ;
266228
@@ -269,16 +231,14 @@ function getOrphanedEdges(dbId: string): OrphanedEdge[] {
269231 childId : row . child_id ,
270232 parentId : row . parent_id ,
271233 parentRole : row . parent_role ,
272- missingPerson : row . child_exists === 0 && row . parent_exists === 0
273- ? 'both'
274- : row . child_exists === 0
275- ? 'child'
276- : 'parent' ,
234+ missingPerson : 'parent' as const ,
277235 } ) ) ;
278236}
279237
280238/**
281- * Get stale provider cache records older than N days
239+ * Get stale provider cache records older than N days.
240+ * This does filesystem I/O so is only called on-demand (not in summary).
241+ * Limits file reads to avoid blocking the event loop too long.
282242 */
283243function getStaleProviderData ( dbId : string , days = 30 ) : StaleRecord [ ] {
284244 const internalDbId = resolveDbId ( dbId ) || dbId ;
@@ -287,7 +247,6 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
287247 const cutoffIso = cutoff . toISOString ( ) ;
288248 const now = Date . now ( ) ;
289249
290- // Get persons in this database that have external identities
291250 const rows = sqliteService . queryAll < {
292251 person_id : string ;
293252 display_name : string ;
@@ -298,7 +257,8 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
298257 FROM external_identity ei
299258 JOIN database_membership dm ON ei.person_id = dm.person_id AND dm.db_id = @dbId
300259 JOIN person p ON ei.person_id = p.person_id
301- ORDER BY p.display_name` ,
260+ ORDER BY p.display_name
261+ LIMIT 2000` ,
302262 { dbId : internalDbId }
303263 ) ;
304264
@@ -308,7 +268,12 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
308268 const cachePath = path . join ( PROVIDER_CACHE_DIR , row . source , `${ row . external_id } .json` ) ;
309269 if ( ! fs . existsSync ( cachePath ) ) continue ;
310270
311- const cacheContent = JSON . parse ( fs . readFileSync ( cachePath , 'utf-8' ) ) ;
271+ const raw = fs . readFileSync ( cachePath , 'utf-8' ) ;
272+ const cacheContent = raw . startsWith ( '{' ) ? JSON . parse ( raw ) : null ;
273+ if ( ! cacheContent ) {
274+ logger . warn ( 'integrity' , `Corrupted cache file: ${ cachePath } ` ) ;
275+ continue ;
276+ }
312277 if ( cacheContent . scrapedAt && cacheContent . scrapedAt < cutoffIso ) {
313278 const scrapedDate = new Date ( cacheContent . scrapedAt ) ;
314279 const ageDays = Math . floor ( ( now - scrapedDate . getTime ( ) ) / ( 1000 * 60 * 60 * 24 ) ) ;
@@ -321,6 +286,9 @@ function getStaleProviderData(dbId: string, days = 30): StaleRecord[] {
321286 ageDays,
322287 } ) ;
323288 }
289+
290+ // Cap results to prevent massive response
291+ if ( stale . length >= 500 ) break ;
324292 }
325293
326294 return stale ;
0 commit comments