@@ -218,9 +218,17 @@ def _fit_column_metrics(
218218 metrics : dict [str , list [ColumnMetricFactory ]],
219219):
220220 get_table = lazy_load_tables (tables )
221+ table = get_table (name )
221222
222223 if ref .table_has_reference ():
223224 ids = ref .find_foreign_ids (name , get_table )
225+
226+ if len (table .index .symmetric_difference (ids .index )):
227+ old_len = len (table )
228+ table = table .reindex (ids .index )
229+ logger .warn (
230+ f"There are missing ids for rows in { name } , dropping { old_len - len (table )} /{ old_len } rows with missing ids."
231+ )
224232 else :
225233 ids = None
226234
@@ -237,22 +245,22 @@ def _fit_column_metrics(
237245 m = factory .build (** col .args )
238246
239247 if isinstance (m , ColumnMetric ):
240- m .fit (name , col_name , col , get_table ( name ) [col_name ])
248+ m .fit (name , col_name , col , table [col_name ])
241249 elif isinstance (m , RefColumnMetric ):
242- ref_col = _calc_joined_refs (name , get_table , ids , col .ref )
250+ ref_col = _calc_joined_refs (name , get_table , ids , col .ref , table )
243251 m .fit (
244252 name ,
245253 col_name ,
246254 col ,
247- RefColumnData (data = get_table ( name ) [col_name ], ref = ref_col ),
255+ RefColumnData (data = table [col_name ], ref = ref_col ),
248256 )
249257 elif isinstance (m , SeqColumnMetric ):
250- ref_col = _calc_unjoined_refs (name , get_table , col .ref )
258+ ref_col = _calc_unjoined_refs (name , get_table , col .ref , table )
251259 m .fit (
252260 name ,
253261 col_name ,
254262 col ,
255- SeqColumnData (data = get_table ( name ) [col_name ], ref = ref_col , ids = ids ),
263+ SeqColumnData (data = table [col_name ], ref = ref_col , ids = ids ),
256264 )
257265 else :
258266 assert False , f"Unknown column metric type: { type (m )} "
@@ -272,10 +280,25 @@ def _preprocess_metrics(
272280):
273281 get_table_wrk = lazy_load_tables (tables_wrk )
274282 get_table_ref = lazy_load_tables (tables_ref )
283+ table_wrk = get_table_wrk (name )
284+ table_ref = get_table_ref (name )
275285
276286 if ref .table_has_reference ():
277287 ids_wrk = ref .find_foreign_ids (name , get_table_wrk )
278288 ids_ref = ref .find_foreign_ids (name , get_table_ref )
289+
290+ if len (table_wrk .index .symmetric_difference (ids_wrk .index )):
291+ old_len = len (table_wrk )
292+ table_wrk = table_wrk .reindex (ids_wrk .index )
293+ logger .warn (
294+ f"There are missing ids for rows in { name } , dropping { old_len - len (table_wrk )} /{ old_len } rows with missing ids."
295+ )
296+ if len (table_ref .index .symmetric_difference (ids_ref .index )):
297+ old_len = len (table_ref )
298+ table_ref = table_ref .reindex (ids_ref .index )
299+ logger .warn (
300+ f"There are missing ids for rows in { name } , dropping { old_len - len (table_ref )} /{ old_len } rows with missing ids."
301+ )
279302 else :
280303 ids_wrk = None
281304 ids_ref = None
@@ -286,30 +309,38 @@ def _preprocess_metrics(
286309 col = meta [name ][col_name ]
287310 if isinstance (m , ColumnMetric ):
288311 prec = m .preprocess (
289- get_table_wrk ( name ) [col_name ],
290- get_table_ref ( name ) [col_name ],
312+ table_wrk [col_name ],
313+ table_ref [col_name ],
291314 )
292315 elif isinstance (m , RefColumnMetric ):
293316 prec = m .preprocess (
294317 RefColumnData (
295- data = get_table_wrk (name )[col_name ],
296- ref = _calc_joined_refs (name , get_table_wrk , ids_ref , col .ref ),
318+ data = table_wrk [col_name ],
319+ ref = _calc_joined_refs (
320+ name , get_table_wrk , ids_ref , col .ref , table_wrk
321+ ),
297322 ),
298323 RefColumnData (
299- data = get_table_ref (name )[col_name ],
300- ref = _calc_joined_refs (name , get_table_ref , ids_ref , col .ref ),
324+ data = table_ref [col_name ],
325+ ref = _calc_joined_refs (
326+ name , get_table_ref , ids_ref , col .ref , table_ref
327+ ),
301328 ),
302329 )
303330 elif isinstance (m , SeqColumnMetric ):
304331 prec = m .preprocess (
305332 SeqColumnData (
306- data = get_table_wrk (name )[col_name ],
307- ref = _calc_unjoined_refs (name , get_table_wrk , col .ref ),
333+ data = table_wrk [col_name ],
334+ ref = _calc_unjoined_refs (
335+ name , get_table_wrk , col .ref , table_wrk
336+ ),
308337 ids = ids_wrk ,
309338 ),
310339 SeqColumnData (
311- data = get_table_ref (name )[col_name ],
312- ref = _calc_unjoined_refs (name , get_table_ref , col .ref ),
340+ data = table_ref [col_name ],
341+ ref = _calc_unjoined_refs (
342+ name , get_table_ref , col .ref , table_ref
343+ ),
313344 ids = ids_ref ,
314345 ),
315346 )
@@ -334,11 +365,33 @@ def _process_metrics(
334365 get_table_wrk = lazy_load_tables (tables_wrk )
335366 get_table_ref = lazy_load_tables (tables_ref )
336367 get_table_syn = lazy_load_tables (tables_syn )
368+ table_wrk = get_table_wrk (name )
369+ table_ref = get_table_ref (name )
370+ table_syn = get_table_syn (name )
337371
338372 if ref .table_has_reference ():
339373 ids_wrk = ref .find_foreign_ids (name , get_table_wrk )
340374 ids_ref = ref .find_foreign_ids (name , get_table_ref )
341375 ids_syn = ref .find_foreign_ids (name , get_table_syn )
376+
377+ if len (table_wrk .index .symmetric_difference (ids_wrk .index )):
378+ old_len = len (table_wrk )
379+ table_wrk = table_wrk .reindex (ids_wrk .index )
380+ logger .warn (
381+ f"There are missing ids for rows in { name } , dropping { old_len - len (table_wrk )} /{ old_len } rows with missing ids."
382+ )
383+ if len (table_ref .index .symmetric_difference (ids_ref .index )):
384+ old_len = len (table_ref )
385+ table_ref = table_ref .reindex (ids_ref .index )
386+ logger .warn (
387+ f"There are missing ids for rows in { name } , dropping { old_len - len (table_ref )} /{ old_len } rows with missing ids."
388+ )
389+ if len (table_syn .index .symmetric_difference (ids_syn .index )):
390+ old_len = len (table_syn )
391+ table_syn = table_syn .reindex (ids_syn .index )
392+ logger .warn (
393+ f"There are missing ids for rows in { name } , dropping { old_len - len (table_syn )} /{ old_len } rows with missing ids."
394+ )
342395 else :
343396 ids_wrk = None
344397 ids_ref = None
@@ -358,34 +411,46 @@ def _process_metrics(
358411 elif isinstance (m , RefColumnMetric ):
359412 proc = m .process (
360413 RefColumnData (
361- data = get_table_wrk (name )[col_name ],
362- ref = _calc_joined_refs (name , get_table_wrk , ids_wrk , col .ref ),
414+ data = table_wrk [col_name ],
415+ ref = _calc_joined_refs (
416+ name , get_table_wrk , ids_wrk , col .ref , table_wrk
417+ ),
363418 ),
364419 RefColumnData (
365- data = get_table_ref (name )[col_name ],
366- ref = _calc_joined_refs (name , get_table_ref , ids_ref , col .ref ),
420+ data = table_ref [col_name ],
421+ ref = _calc_joined_refs (
422+ name , get_table_ref , ids_ref , col .ref , table_ref
423+ ),
367424 ),
368425 RefColumnData (
369- data = get_table_syn (name )[col_name ],
370- ref = _calc_joined_refs (name , get_table_syn , ids_syn , col .ref ),
426+ data = table_syn [col_name ],
427+ ref = _calc_joined_refs (
428+ name , get_table_syn , ids_syn , col .ref , table_syn
429+ ),
371430 ),
372431 prec ,
373432 )
374433 elif isinstance (m , SeqColumnMetric ):
375434 proc = m .process (
376435 SeqColumnData (
377- data = get_table_wrk (name )[col_name ],
378- ref = _calc_unjoined_refs (name , get_table_wrk , col .ref ),
436+ data = table_wrk [col_name ],
437+ ref = _calc_unjoined_refs (
438+ name , get_table_wrk , col .ref , table_wrk
439+ ),
379440 ids = ids_wrk ,
380441 ),
381442 SeqColumnData (
382- data = get_table_ref (name )[col_name ],
383- ref = _calc_unjoined_refs (name , get_table_ref , col .ref ),
443+ data = table_ref [col_name ],
444+ ref = _calc_unjoined_refs (
445+ name , get_table_ref , col .ref , table_ref
446+ ),
384447 ids = ids_ref ,
385448 ),
386449 SeqColumnData (
387- data = get_table_syn (name )[col_name ],
388- ref = _calc_unjoined_refs (name , get_table_syn , col .ref ),
450+ data = table_syn [col_name ],
451+ ref = _calc_unjoined_refs (
452+ name , get_table_syn , col .ref , table_syn
453+ ),
389454 ids = ids_syn ,
390455 ),
391456 prec ,
0 commit comments