Skip to content

Commit d4a61dd

Browse files
author
Tom Reitz
committed
implementing structured results file output for validate
1 parent 5ad6915 commit d4a61dd

3 files changed

Lines changed: 108 additions & 72 deletions

File tree

lightbeam/lightbeam.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
import json
34
import yaml
45
import logging
@@ -80,6 +81,7 @@ def __init__(self, config_file, logger=None, selector="*", exclude="", keep_keys
8081
self.api = EdFiAPI(self)
8182
self.token_version = 0
8283
self.results_file = results_file
84+
self.start_timestamp = datetime.now()
8385

8486
# load params and/or env vars for config YAML interpolation
8587
self.params = json.loads(params) if params else {}
@@ -115,6 +117,51 @@ def __init__(self, config_file, logger=None, selector="*", exclude="", keep_keys
115117
if self.track_state and not os.path.isdir(self.config["state_dir"]):
116118
self.logger.debug("creating state dir {0}".format(self.config["state_dir"]))
117119
os.mkdir(self.config["state_dir"])
120+
121+
# Initialize a dictionary for tracking run metadata (for structured output)
122+
self.metadata = {
123+
"started_at": self.start_timestamp.isoformat(timespec='microseconds'),
124+
"working_dir": os.getcwd(),
125+
"config_file": self.config_file,
126+
"data_dir": self.config["data_dir"],
127+
"api_url": self.config["edfi_api"]["base_url"],
128+
"namespace": self.config["namespace"],
129+
"resources": {}
130+
}
131+
132+
# Callback for re.sub: collapse a match by deleting every whitespace
# character (spaces, tabs, newlines) inside it. Used when compacting the
# "line_numbers" arrays in the structured-output JSON.
def replace_linebreaks(self, m):
    matched_text = m.group(0)
    return re.sub(r"\s+", '', matched_text)
135+
def write_structured_output(self):
    """Finalize run metadata (completion time, runtime, record totals) and,
    if a results_file was configured, write the metadata to disk as JSON.

    Totals are summed across all endpoints in self.metadata["resources"];
    endpoints that never recorded counts (e.g. failed before processing)
    contribute 0 instead of raising KeyError.
    """
    # finalize run-level metadata
    self.end_timestamp = datetime.now()
    resources = self.metadata["resources"]
    self.metadata.update({
        "completed_at": self.end_timestamp.isoformat(timespec='microseconds'),
        "runtime_sec": (self.end_timestamp - self.start_timestamp).total_seconds(),
        # .get(..., 0) guards against endpoints that errored out before
        # their counts were recorded
        "total_records_processed": sum(item.get('records_processed', 0) for item in resources.values()),
        "total_records_skipped": sum(item.get('records_skipped', 0) for item in resources.values()),
        "total_records_failed": sum(item.get('records_failed', 0) for item in resources.values())
    })
    # sort failing line numbers so output is deterministic and readable
    for resource in resources.values():
        for failure in resource.get("failures", []):
            failure["line_numbers"].sort()

    ### Create structured output results_file if necessary
    if self.results_file:
        # create directory if not exists; os.path.dirname() returns "" for
        # a bare filename in the current directory, and os.makedirs("")
        # would raise FileNotFoundError, so only create when non-empty
        results_dir = os.path.dirname(self.results_file)
        if results_dir:
            os.makedirs(results_dir, exist_ok=True)

        with open(self.results_file, 'w') as fp:
            content = json.dumps(self.metadata, indent=4)
            # failures.line_numbers are split each on their own line; here we remove those line breaks
            content = re.sub(r'"line_numbers": \[(\d|,|\s|\n)*\]', self.replace_linebreaks, content)
            fp.write(content)
        self.logger.info(f"results written to {self.results_file}")
118165

119166
def load_config_file(self) -> dict:
120167
_env_backup = os.environ.copy()

lightbeam/send.py

Lines changed: 10 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import time
44
import json
55
import asyncio
6-
import datetime
76

87
from lightbeam import util
98
from lightbeam import hashlog
@@ -16,22 +15,10 @@ def __init__(self, lightbeam=None):
1615
self.lightbeam.reset_counters()
1716
self.logger = self.lightbeam.logger
1817
self.hashlog_data = {}
19-
self.start_timestamp = datetime.datetime.now()
2018

2119
# Sends all (selected) endpoints
2220
def send(self):
2321

24-
# Initialize a dictionary for tracking run metadata (for structured output)
25-
self.metadata = {
26-
"started_at": self.start_timestamp.isoformat(timespec='microseconds'),
27-
"working_dir": os.getcwd(),
28-
"config_file": self.lightbeam.config_file,
29-
"data_dir": self.lightbeam.config["data_dir"],
30-
"api_url": self.lightbeam.config["edfi_api"]["base_url"],
31-
"namespace": self.lightbeam.config["namespace"],
32-
"resources": {}
33-
}
34-
3522
# get token with which to send requests
3623
self.lightbeam.api.do_oauth()
3724

@@ -47,43 +34,15 @@ def send(self):
4734
self.logger.info("finished processing endpoint {0}!".format(endpoint))
4835
self.logger.info(" (final status counts: {0}) ".format(self.lightbeam.status_counts))
4936
self.lightbeam.log_status_reasons()
37+
38+
# write structured output (if needed)
39+
self.lightbeam.write_structured_output()
5040

51-
### Create structured output results_file if necessary
52-
self.end_timestamp = datetime.datetime.now()
53-
self.metadata.update({
54-
"completed_at": self.end_timestamp.isoformat(timespec='microseconds'),
55-
"runtime_sec": (self.end_timestamp - self.start_timestamp).total_seconds(),
56-
"total_records_processed": sum(item['records_processed'] for item in self.metadata["resources"].values()),
57-
"total_records_skipped": sum(item['records_skipped'] for item in self.metadata["resources"].values()),
58-
"total_records_failed": sum(item['records_failed'] for item in self.metadata["resources"].values())
59-
})
60-
# sort failing line numbers
61-
for resource in self.metadata["resources"].keys():
62-
if "failures" in self.metadata["resources"][resource].keys():
63-
for idx, _ in enumerate(self.metadata["resources"][resource]["failures"]):
64-
self.metadata["resources"][resource]["failures"][idx]["line_numbers"].sort()
65-
66-
# helper function used below
67-
def repl(m):
68-
return re.sub(r"\s+", '', m.group(0))
69-
70-
### Create structured output results_file if necessary
71-
if self.lightbeam.results_file:
72-
73-
# create directory if not exists
74-
os.makedirs(os.path.dirname(self.lightbeam.results_file), exist_ok=True)
75-
76-
with open(self.lightbeam.results_file, 'w') as fp:
77-
content = json.dumps(self.metadata, indent=4)
78-
# failures.line_numbers are split each on their own line; here we remove those line breaks
79-
content = re.sub(r'"line_numbers": \[(\d|,|\s|\n)*\]', repl, content)
80-
fp.write(content)
81-
82-
if self.metadata["total_records_processed"] == self.metadata["total_records_skipped"]:
41+
if self.lightbeam.metadata["total_records_processed"] == self.lightbeam.metadata["total_records_skipped"]:
8342
self.logger.info("all payloads skipped")
8443
exit(99) # signal to downstream tasks (in Airflow) all payloads skipped
8544

86-
if self.metadata["total_records_processed"] == self.metadata["total_records_failed"]:
45+
if self.lightbeam.metadata["total_records_processed"] == self.lightbeam.metadata["total_records_failed"]:
8746
self.logger.info("all payloads failed")
8847
exit(1) # signal to downstream tasks (in Airflow) all payloads failed
8948

@@ -100,7 +59,7 @@ async def do_send(self, endpoint):
10059
hashlog_file = os.path.join(self.lightbeam.config["state_dir"], f"{endpoint}.dat")
10160
self.hashlog_data = hashlog.load(hashlog_file)
10261

103-
self.metadata["resources"].update({endpoint: {}})
62+
self.lightbeam.metadata["resources"].update({endpoint: {}})
10463
self.lightbeam.reset_counters()
10564

10665
# process each file
@@ -169,8 +128,8 @@ async def do_send(self, endpoint):
169128
if status>=200 and status<300:
170129
successes.append({"status_code": status, "count": self.lightbeam.status_counts[status]})
171130
if len(successes)>0:
172-
self.metadata["resources"][endpoint].update({"successes": successes})
173-
self.metadata["resources"][endpoint].update({
131+
self.lightbeam.metadata["resources"][endpoint].update({"successes": successes})
132+
self.lightbeam.metadata["resources"][endpoint].update({
174133
"records_processed": total_counter,
175134
"records_skipped": self.lightbeam.num_skipped,
176135
"records_failed": self.lightbeam.num_errors
@@ -199,7 +158,7 @@ async def do_post(self, endpoint, file_name, data, line, data_hash):
199158
message = str(response.status) + ": " + util.linearize(json.loads(body).get("message"))
200159

201160
# update run metadata...
202-
failures = self.metadata["resources"][endpoint].get("failures", [])
161+
failures = self.lightbeam.metadata["resources"][endpoint].get("failures", [])
203162
do_append = True
204163
for index, item in enumerate(failures):
205164
if item["status_code"]==response.status and item["message"]==message and item["file"]==file_name:
@@ -215,7 +174,7 @@ async def do_post(self, endpoint, file_name, data, line, data_hash):
215174
'count': 1
216175
}
217176
failures.append(failure)
218-
self.metadata["resources"][endpoint]["failures"] = failures
177+
self.lightbeam.metadata["resources"][endpoint]["failures"] = failures
219178

220179
# update output and counters
221180
self.lightbeam.increment_status_reason(message)

lightbeam/validate.py

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(self, lightbeam=None):
3333

3434
# Validates (selected) endpoints
3535
def validate(self):
36+
3637
# The below should go in __init__(), but rely on lightbeam.config which is not yet available there.
3738
self.fail_fast_threshold = self.lightbeam.config.get("validate",{}).get("references",{}).get("max_failures", self.DEFAULT_FAIL_FAST_THRESHOLD)
3839
self.validation_methods = self.lightbeam.config.get("validate",{}).get("methods",self.DEFAULT_VALIDATION_METHODS)
@@ -65,6 +66,13 @@ def validate(self):
6566
# to comparatively small datasets (sections, schools, students).
6667
self.build_local_reference_cache(endpoint)
6768
asyncio.run(self.validate_endpoint(endpoint))
69+
70+
# write structured output (if needed)
71+
self.lightbeam.write_structured_output()
72+
73+
if self.lightbeam.metadata["total_records_processed"] == self.lightbeam.metadata["total_records_failed"]:
74+
self.logger.info("all payloads failed")
75+
exit(1) # signal to downstream tasks (in Airflow) all payloads failed
6876

6977
def build_local_reference_cache(self, endpoint):
7078
swagger = self.lightbeam.api.resources_swagger
@@ -173,14 +181,15 @@ def get_swagger_definition_for_endpoint(self, endpoint):
173181

174182
# Validates a single endpoint based on the Swagger docs
175183
async def validate_endpoint(self, endpoint):
184+
self.lightbeam.metadata["resources"].update({endpoint: {}})
176185
definition = self.get_swagger_definition_for_endpoint(endpoint)
177186
data_files = self.lightbeam.get_data_files_for_endpoint(endpoint)
178187
tasks = []
179188
total_counter = 0
189+
self.lightbeam.num_errors = 0
180190
for file_name in data_files:
181191
self.logger.info(f"validating {file_name} against {definition} schema...")
182192
with open(file_name) as file:
183-
self.lightbeam.num_errors = 0
184193
for line_counter, line in enumerate(file):
185194
total_counter += 1
186195
data = line.strip()
@@ -200,13 +209,20 @@ async def validate_endpoint(self, endpoint):
200209
break
201210

202211
if len(tasks)>0: await self.lightbeam.do_tasks(tasks, total_counter, log_status_counts=False)
212+
213+
# update metadata counts for this endpoint
214+
self.lightbeam.metadata["resources"][endpoint].update({
215+
"records_processed": total_counter,
216+
"records_skipped": self.lightbeam.num_skipped,
217+
"records_failed": self.lightbeam.num_errors
218+
})
203219

204220
if self.lightbeam.num_errors==0: self.logger.info(f"... all lines validate ok!")
205221
else:
206222
num_others = self.lightbeam.num_errors - self.MAX_VALIDATION_ERRORS_TO_DISPLAY
207223
if self.lightbeam.num_errors > self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
208-
self.logger.critical(f"... and {num_others} others!")
209-
self.logger.critical(f"... VALIDATION ERRORS on {self.lightbeam.num_errors} of {line_counter} lines in {file_name}; see details above.")
224+
self.logger.warning(f"... and {num_others} others!")
225+
self.logger.warning(f"... VALIDATION ERRORS on {self.lightbeam.num_errors} of {line_counter} lines in {file_name}; see details above.")
210226

211227

212228
async def do_validate_payload(self, endpoint, file_name, data, line_counter):
@@ -233,41 +249,33 @@ async def do_validate_payload(self, endpoint, file_name, data, line_counter):
233249
try:
234250
payload = json.loads(data)
235251
except Exception as e:
236-
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
237-
self.logger.warning(f"... VALIDATION ERROR (line {line_counter}): invalid JSON" + str(e).replace(" line 1",""))
238-
self.lightbeam.num_errors += 1
252+
self.log_validation_error(endpoint, file_name, line_counter, "json", f"invalid JSON {str(e).replace(' line 1','')}")
239253
return
240254

241255
# check payload obeys Swagger schema
242256
if "schema" in self.validation_methods:
243257
try:
244258
validator.validate(payload)
245259
except Exception as e:
246-
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
247-
e_path = [str(x) for x in list(e.path)]
248-
context = ""
249-
if len(e_path)>0: context = " in " + " -> ".join(e_path)
250-
self.logger.warning(f"... VALIDATION ERROR (line {line_counter}): " + str(e.message) + context)
251-
self.lightbeam.num_errors += 1
260+
e_path = [str(x) for x in list(e.path)]
261+
context = ""
262+
if len(e_path)>0: context = " in " + " -> ".join(e_path)
263+
self.log_validation_error(endpoint, file_name, line_counter, "schema", f"{str(e.message)} {context}")
252264
return
253265

254266
# check descriptor values are valid
255267
if "descriptors" in self.validation_methods:
256268
error_message = self.has_invalid_descriptor_values(payload, path="")
257269
if error_message != "":
258-
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
259-
self.logger.warning(f"... VALIDATION ERROR (line {line_counter}): " + error_message)
260-
self.lightbeam.num_errors += 1
270+
self.log_validation_error(endpoint, file_name, line_counter, "descriptors", error_message)
261271
return
262272

263273
# check natural keys are unique
264274
if "uniqueness" in self.validation_methods:
265275
params = json.dumps(util.interpolate_params(params_structure, payload))
266276
params_hash = hashlog.get_hash(params)
267277
if params_hash in distinct_params:
268-
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
269-
self.logger.warning(f"... VALIDATION ERROR (line {line_counter}): duplicate value(s) for natural key(s): {params}")
270-
self.lightbeam.num_errors += 1
278+
self.log_validation_error(endpoint, file_name, line_counter, "uniqueness", f"duplicate value(s) for natural key(s): {params}")
271279
return
272280
else: distinct_params.append(params_hash)
273281

@@ -276,11 +284,33 @@ async def do_validate_payload(self, endpoint, file_name, data, line_counter):
276284
self.lightbeam.api.do_oauth()
277285
error_message = self.has_invalid_references(payload, path="")
278286
if error_message != "":
279-
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
280-
self.logger.warning(f"... VALIDATION ERROR (line {line_counter}): " + error_message)
281-
self.lightbeam.num_errors += 1
287+
self.log_validation_error(endpoint, file_name, line_counter, "references", error_message)
282288

283289

290+
def log_validation_error(self, endpoint, file_name, line_number, method, message):
291+
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
292+
self.logger.warning(f"... VALIDATION ERROR (line {line_number}): {message}")
293+
self.lightbeam.num_errors += 1
294+
295+
# update run metadata...
296+
failures = self.lightbeam.metadata["resources"][endpoint].get("failures", [])
297+
do_append = True
298+
for index, item in enumerate(failures):
299+
if item["method"]==method and item["message"]==message and item["file"]==file_name:
300+
failures[index]["line_numbers"].append(line_number)
301+
failures[index]["count"] += 1
302+
do_append = False
303+
if do_append:
304+
failure = {
305+
'method': method,
306+
'message': message,
307+
'file': file_name,
308+
'line_numbers': [line_number],
309+
'count': 1
310+
}
311+
failures.append(failure)
312+
self.lightbeam.metadata["resources"][endpoint]["failures"] = failures
313+
284314
def load_local_descriptors(self):
285315
local_descriptors = []
286316
all_endpoints = self.lightbeam.api.get_sorted_endpoints()

0 commit comments

Comments
 (0)