@@ -33,6 +33,7 @@ def __init__(self, lightbeam=None):
3333
3434 # Validates (selected) endpoints
3535 def validate (self ):
36+
3637 # The below should go in __init__(), but rely on lightbeam.config which is not yet available there.
3738 self .fail_fast_threshold = self .lightbeam .config .get ("validate" ,{}).get ("references" ,{}).get ("max_failures" , self .DEFAULT_FAIL_FAST_THRESHOLD )
3839 self .validation_methods = self .lightbeam .config .get ("validate" ,{}).get ("methods" ,self .DEFAULT_VALIDATION_METHODS )
@@ -65,6 +66,13 @@ def validate(self):
6566 # to comparatively small datasets (sections, schools, students).
6667 self .build_local_reference_cache (endpoint )
6768 asyncio .run (self .validate_endpoint (endpoint ))
69+
70+ # write structured output (if needed)
71+ self .lightbeam .write_structured_output ()
72+
73+ if self .lightbeam .metadata ["total_records_processed" ]> 0 and self .lightbeam .metadata ["total_records_processed" ] == self .lightbeam .metadata ["total_records_failed" ]:
74+ self .logger .info ("all payloads failed" )
75+ exit (1 ) # signal to downstream tasks (in Airflow) all payloads failed
6876
6977 def build_local_reference_cache (self , endpoint ):
7078 swagger = self .lightbeam .api .resources_swagger
@@ -173,14 +181,15 @@ def get_swagger_definition_for_endpoint(self, endpoint):
173181
174182 # Validates a single endpoint based on the Swagger docs
175183 async def validate_endpoint (self , endpoint ):
184+ self .lightbeam .metadata ["resources" ].update ({endpoint : {}})
176185 definition = self .get_swagger_definition_for_endpoint (endpoint )
177186 data_files = self .lightbeam .get_data_files_for_endpoint (endpoint )
178187 tasks = []
179188 total_counter = 0
189+ self .lightbeam .num_errors = 0
180190 for file_name in data_files :
181191 self .logger .info (f"validating { file_name } against { definition } schema..." )
182192 with open (file_name ) as file :
183- self .lightbeam .num_errors = 0
184193 for line_counter , line in enumerate (file ):
185194 total_counter += 1
186195 data = line .strip ()
@@ -200,13 +209,20 @@ async def validate_endpoint(self, endpoint):
200209 break
201210
202211 if len (tasks )> 0 : await self .lightbeam .do_tasks (tasks , total_counter , log_status_counts = False )
212+
213+ # update metadata counts for this endpoint
214+ self .lightbeam .metadata ["resources" ][endpoint ].update ({
215+ "records_processed" : total_counter ,
216+ "records_skipped" : self .lightbeam .num_skipped ,
217+ "records_failed" : self .lightbeam .num_errors
218+ })
203219
204220 if self .lightbeam .num_errors == 0 : self .logger .info (f"... all lines validate ok!" )
205221 else :
206222 num_others = self .lightbeam .num_errors - self .MAX_VALIDATION_ERRORS_TO_DISPLAY
207223 if self .lightbeam .num_errors > self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
208- self .logger .critical (f"... and { num_others } others!" )
209- self .logger .critical (f"... VALIDATION ERRORS on { self .lightbeam .num_errors } of { line_counter } lines in { file_name } ; see details above." )
224+ self .logger .warning (f"... and { num_others } others!" )
225+ self .logger .warning (f"... VALIDATION ERRORS on { self .lightbeam .num_errors } of { line_counter } lines in { file_name } ; see details above." )
210226
211227
212228 async def do_validate_payload (self , endpoint , file_name , data , line_counter ):
@@ -233,41 +249,33 @@ async def do_validate_payload(self, endpoint, file_name, data, line_counter):
233249 try :
234250 payload = json .loads (data )
235251 except Exception as e :
236- if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
237- self .logger .warning (f"... VALIDATION ERROR (line { line_counter } ): invalid JSON" + str (e ).replace (" line 1" ,"" ))
238- self .lightbeam .num_errors += 1
252+ self .log_validation_error (endpoint , file_name , line_counter , "json" , f"invalid JSON { str (e ).replace (' line 1' ,'' )} " )
239253 return
240254
241255 # check payload obeys Swagger schema
242256 if "schema" in self .validation_methods :
243257 try :
244258 validator .validate (payload )
245259 except Exception as e :
246- if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
247- e_path = [str (x ) for x in list (e .path )]
248- context = ""
249- if len (e_path )> 0 : context = " in " + " -> " .join (e_path )
250- self .logger .warning (f"... VALIDATION ERROR (line { line_counter } ): " + str (e .message ) + context )
251- self .lightbeam .num_errors += 1
260+ e_path = [str (x ) for x in list (e .path )]
261+ context = ""
262+ if len (e_path )> 0 : context = " in " + " -> " .join (e_path )
263+ self .log_validation_error (endpoint , file_name , line_counter , "schema" , f"{ str (e .message )} { context } " )
252264 return
253265
254266 # check descriptor values are valid
255267 if "descriptors" in self .validation_methods :
256268 error_message = self .has_invalid_descriptor_values (payload , path = "" )
257269 if error_message != "" :
258- if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
259- self .logger .warning (f"... VALIDATION ERROR (line { line_counter } ): " + error_message )
260- self .lightbeam .num_errors += 1
270+ self .log_validation_error (endpoint , file_name , line_counter , "descriptors" , error_message )
261271 return
262272
263273 # check natural keys are unique
264274 if "uniqueness" in self .validation_methods :
265275 params = json .dumps (util .interpolate_params (params_structure , payload ))
266276 params_hash = hashlog .get_hash (params )
267277 if params_hash in distinct_params :
268- if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
269- self .logger .warning (f"... VALIDATION ERROR (line { line_counter } ): duplicate value(s) for natural key(s): { params } " )
270- self .lightbeam .num_errors += 1
278+ self .log_validation_error (endpoint , file_name , line_counter , "uniqueness" , f"duplicate value(s) for natural key(s): { params } " )
271279 return
272280 else : distinct_params .append (params_hash )
273281
@@ -276,11 +284,33 @@ async def do_validate_payload(self, endpoint, file_name, data, line_counter):
276284 self .lightbeam .api .do_oauth ()
277285 error_message = self .has_invalid_references (payload , path = "" )
278286 if error_message != "" :
279- if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
280- self .logger .warning (f"... VALIDATION ERROR (line { line_counter } ): " + error_message )
281- self .lightbeam .num_errors += 1
287+ self .log_validation_error (endpoint , file_name , line_counter , "references" , error_message )
282288
283289
290+ def log_validation_error (self , endpoint , file_name , line_number , method , message ):
291+ if self .lightbeam .num_errors < self .MAX_VALIDATION_ERRORS_TO_DISPLAY :
292+ self .logger .warning (f"... VALIDATION ERROR (line { line_number } ): { message } " )
293+ self .lightbeam .num_errors += 1
294+
295+ # update run metadata...
296+ failures = self .lightbeam .metadata ["resources" ][endpoint ].get ("failures" , [])
297+ do_append = True
298+ for index , item in enumerate (failures ):
299+ if item ["method" ]== method and item ["message" ]== message and item ["file" ]== file_name :
300+ failures [index ]["line_numbers" ].append (line_number )
301+ failures [index ]["count" ] += 1
302+ do_append = False
303+ if do_append :
304+ failure = {
305+ 'method' : method ,
306+ 'message' : message ,
307+ 'file' : file_name ,
308+ 'line_numbers' : [line_number ],
309+ 'count' : 1
310+ }
311+ failures .append (failure )
312+ self .lightbeam .metadata ["resources" ][endpoint ]["failures" ] = failures
313+
284314 def load_local_descriptors (self ):
285315 local_descriptors = []
286316 all_endpoints = self .lightbeam .api .get_sorted_endpoints ()
0 commit comments