# Commit 7c8c395
Merge pull request #30 from edanalytics/feature/reference_validation

validate references

2 parents: 608e809 + b97e203

6 files changed: 417 additions & 82 deletions
## README.md

29 additions & 6 deletions

````diff
@@ -135,13 +135,36 @@ Like [selectors](#selectors), `keep-keys` and `drop-keys` are comma-separated li
 ```bash
 lightbeam validate -c path/to/config.yaml
 ```
-You may `validate` your JSONL before transmitting it. This checks that the payloads
-1. are valid JSON
-1. conform to the structure described in the Swagger documents for [resources](https://api.ed-fi.org/v5.3/api/metadata/data/v3/resourcess/swagger.json) and [descriptors](https://api.ed-fi.org/v5.3/api/metadata/data/v3/descriptors/swagger.json) fetched from your API
-1. contain valid descriptor values (fetched from your API and/or from descriptor values in your JSONL files)
-1. contain unique values for any natural key
+You may `validate` your JSONL before transmitting it. Configuration for `validate` goes in its own section of `lightbeam.yaml`:
+```yaml
+validate:
+  methods:
+    - schema # checks that payloads conform to the Swagger definitions from the API
+    - descriptors # checks that descriptor values are either locally-defined or exist in the remote API
+    - uniqueness # checks that local payloads are unique by the required property values
+    - references # checks that references resolve, either locally or in the remote API
+  # or
+  # methods: "*"
+```
+Default `validate.methods` are `["schema", "descriptors", "uniqueness"]` (not `references`; see below). In addition to the above methods, `lightbeam validate` will also (first) check that each payload is valid JSON.
+
+The `references` method can be slow, as a separate `GET` request may be made to your API for each reference. (This is why the method is disabled by default.) `lightbeam` tries to improve efficiency by:
+* batching requests and sending several concurrently (based on `connection.pool_size` in `lightbeam.yaml`)
+* caching responses and checking the cache before making another (potentially identical) request
+
+Even with these optimizations, checking `references` can easily take minutes for even relatively small amounts of data. Therefore `lightbeam.yaml` also accepts a further configuration option:
+```yaml
+validate:
+  references:
+    max_failures: 10 # stop testing after X failed payloads ("fail fast")
+```
+This is optional; if absent, references in every payload are checked, no matter how many fail.
+
+**Note:** Reference validation efficiency may be improved by first `lightbeam fetch`ing certain resources to have a local copy. `lightbeam validate` checks local JSONL files to resolve references before trying the remote API, and `fetch` retrieves many records per `GET`, so total runtime can be faster in this scenario. The downsides include
+* more data movement
+* `fetch`ed data becoming stale over time
+* needing to track which data is your own vs. was `fetch`ed (all the data must coexist in the `config.data_dir` to be discoverable by `lightbeam validate`)
 
-This command will not find invalid reference errors, but is helpful for finding payloads that are invalid JSON, are missing required fields, or have other structural issues.
 
 ## `send`
 ```bash
````
## lightbeam/api.py

2 additions & 2 deletions

```diff
@@ -53,10 +53,10 @@ def prepare(self):
         self.config["open_api_metadata_url"] = api_base["urls"]["openApiMetadata"]
 
         # load all endpoints in dependency-order
-        all_endpoints = self.get_sorted_endpoints()
+        self.lightbeam.all_endpoints = self.get_sorted_endpoints()
 
         # filter down to only selected endpoints
-        self.lightbeam.endpoints = self.apply_filters(all_endpoints)
+        self.lightbeam.endpoints = self.apply_filters(self.lightbeam.all_endpoints)
 
 
     def apply_filters(self, endpoints=[]):
```

## lightbeam/delete.py

2 additions & 1 deletion

```diff
@@ -86,7 +86,8 @@ async def do_deletes(self, endpoint):
             data = line.strip()
             # fill out the required fields from the data payload
             # (so we can search for matching records in the API)
-            params = util.interpolate_params(params_structure, data)
+            payload = json.loads(data)
+            params = util.interpolate_params(params_structure, payload)
 
             # check if we've posted this data before
             data_hash = hashlog.get_hash(data)
```

## lightbeam/lightbeam.py

25 additions & 4 deletions

```diff
@@ -174,12 +174,33 @@ def get_data_files_for_endpoint(self, endpoint):
         return file_list
 
     # Prunes the list of endpoints down to those for which .jsonl files exist in the config.data_dir
-    def get_endpoints_with_data(self, endpoints):
+    def get_endpoints_with_data(self):
         self.logger.debug("discovering data...")
         endpoints_with_data = []
-        for endpoint in endpoints:
-            if self.get_data_files_for_endpoint(endpoint):
-                endpoints_with_data.append(endpoint)
+        data_dir_list = os.listdir(self.config["data_dir"])
+        for data_dir_item in data_dir_list:
+            data_dir_item_path = os.path.join(self.config["data_dir"], data_dir_item)
+            if os.path.isfile(data_dir_item_path):
+                filename = os.path.basename(data_dir_item)
+                extension = filename.rsplit(".", 1)[-1]
+                filename_without_extension = filename.rsplit(".", 1)[0]
+                if extension in self.DATA_FILE_EXTENSIONS and filename_without_extension in self.all_endpoints:
+                    endpoints_with_data.append(filename_without_extension)
+            elif os.path.isdir(data_dir_item_path):
+                if data_dir_item in self.all_endpoints:
+                    has_data_file = False
+                    sub_dir_list = os.listdir(data_dir_item_path)
+                    for sub_dir_item in sub_dir_list:
+                        sub_dir_item_path = os.path.join(data_dir_item_path, sub_dir_item)
+                        if os.path.isfile(sub_dir_item_path):
+                            filename = os.path.basename(sub_dir_item)
+                            extension = filename.rsplit(".", 1)[-1]
+                            if extension in self.DATA_FILE_EXTENSIONS:
+                                has_data_file = True
+                                break
+                    if has_data_file:
+                        endpoints_with_data.append(data_dir_item)
         return endpoints_with_data
 
     # Returns a generator which produces json lines for a given endpoint based on relevant files in config.data_dir
```
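The new discovery logic (accept top-level `endpoint.jsonl` files, or `endpoint/` directories containing at least one data file) can be exercised standalone. This sketch assumes `DATA_FILE_EXTENSIONS` is `["jsonl"]` and uses a free function rather than lightbeam's actual class:

```python
import os
import tempfile

DATA_FILE_EXTENSIONS = ["jsonl"]  # assumption for this sketch

def endpoints_with_data(data_dir, all_endpoints):
    # Mirrors the diff above: match "students.jsonl" files, or
    # "students/" directories that contain at least one data file.
    found = []
    for item in os.listdir(data_dir):
        path = os.path.join(data_dir, item)
        if os.path.isfile(path):
            name, _, ext = item.rpartition(".")
            if ext in DATA_FILE_EXTENSIONS and name in all_endpoints:
                found.append(name)
        elif os.path.isdir(path) and item in all_endpoints:
            if any(f.rsplit(".", 1)[-1] in DATA_FILE_EXTENSIONS
                   for f in os.listdir(path)
                   if os.path.isfile(os.path.join(path, f))):
                found.append(item)
    return sorted(found)

# Usage: a flat file for "students", a directory of parts for "schools".
with tempfile.TemporaryDirectory() as d:
    open(os.path.join(d, "students.jsonl"), "w").close()
    os.mkdir(os.path.join(d, "schools"))
    open(os.path.join(d, "schools", "part1.jsonl"), "w").close()
    os.mkdir(os.path.join(d, "notes"))  # not an endpoint; ignored
    result = endpoints_with_data(d, ["students", "schools", "staffs"])
```

Endpoints with no data on disk (here, `staffs`) are pruned, which is why the method no longer needs an `endpoints` argument: it scans `config.data_dir` against `all_endpoints` directly.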

## lightbeam/util.py

5 additions & 1 deletion

```diff
@@ -28,12 +28,16 @@ def singularize_endpoint(endpoint):
     if endpoint[-3:]=="ies": return endpoint[0:-3] + "y"
     elif endpoint=="people": return "person"
     else: return endpoint[0:-1]
+def pluralize_endpoint(endpoint):
+    if endpoint[-1:]=="y": return endpoint[0:-1] + "ies"
+    elif endpoint=="person": return "people"
+    else: return endpoint+"s"
 
 # Takes a params structure and interpolates values from a (string) JSON payload
 def interpolate_params(params_structure, payload):
     params = {}
     for k,v in params_structure.items():
-        value = json.loads(payload)
+        value = payload.copy()
         for key in v.split('.'):
             value = value[key]
         params[k] = value
```
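The `delete.py` and `util.py` changes work together: the payload is parsed once with `json.loads`, and `interpolate_params` then walks the resulting dict instead of re-parsing the JSON string for every parameter. A minimal sketch of the fixed behavior (the trailing `return` is assumed, since the diff shows only part of the function, and the example keys are illustrative):

```python
import json

def interpolate_params(params_structure, payload):
    # payload is now a parsed dict; each value in params_structure is a
    # dotted path into it (previously the JSON string was re-parsed here
    # once per parameter).
    params = {}
    for k, v in params_structure.items():
        value = payload.copy()
        for key in v.split("."):
            value = value[key]
        params[k] = value
    return params

# Usage: resolve query params from a payload line, as do_deletes() does.
params_structure = {
    "studentUniqueId": "studentReference.studentUniqueId",
    "schoolYear": "schoolYearTypeReference.schoolYear",
}
data = '{"studentReference": {"studentUniqueId": "ab123"}, "schoolYearTypeReference": {"schoolYear": 2024}}'
payload = json.loads(data)  # parse once, as the delete.py fix does
params = interpolate_params(params_structure, payload)
```

Each dotted path is resolved by descending through nested dicts, so one parsed payload can feed any number of parameters.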
