Skip to content

Commit d7e2fab

Browse files
committed
Merge branch 'main' of github.com:edanalytics/lightbeam into fix/command_list
2 parents 1f455af + 0d8caf9 commit d7e2fab

7 files changed

Lines changed: 116 additions & 97 deletions

File tree

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
### v0.1.2
2+
<details>
3+
<summary>Released 2024-04-19</summary>
4+
5+
* feature: [Add ability for fetch `--keep-keys` and `--drop-keys` flags to allow wildcard matching](https://github.com/edanalytics/lightbeam/pull/23)
6+
* feature: [Update structured logging to be flatter, per recent team discussion](https://github.com/edanalytics/lightbeam/pull/24)
7+
* bugfix: [Support for `definitions` being renamed to `components.schemas` in Ed-Fi 7.1 Swagger](https://github.com/edanalytics/lightbeam/pull/25)
8+
</details>
9+
110
### v0.1.1
211
<details>
312
<summary>Released 2024-02-16</summary>

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ Optionally specify `--keep-keys id` or `-k id` to keep only specific keys from e
120120

121121
Optionally specify `--drop-keys id,_etag,_lastModified` or `-d id` to remove specific keys from every payload. This can be useful if you want to `fetch` data from one Ed-Fi API and then turn around and `send` it to another.
122122

123+
Like [selectors](#selectors), `keep-keys` and `drop-keys` are comma-separated lists of values, each of which may begin or end with an asterisk (`*`) for wildcard matching. Example: `-d _*` would remove properties beginning with an underscore (`_`) character from any `fetch`ed payloads.
124+
123125
## `validate`
124126
```bash
125127
lightbeam validate -c path/to/config.yaml

lightbeam/api.py

Lines changed: 24 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -53,67 +53,24 @@ def prepare(self):
5353

5454

5555
def apply_filters(self, endpoints=[]):
56-
selected_endpoints = self.parse_endpoint_string(self.lightbeam.selector, endpoints=endpoints, all_on_empty=True)
57-
58-
# make sure all selectors resolve to an endpoint
59-
unknown_endpoints = list(set(selected_endpoints).difference(endpoints))
60-
if unknown_endpoints:
61-
self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
62-
63-
excluded_endpoints = self.parse_endpoint_string(self.lightbeam.exclude, endpoints=selected_endpoints)
56+
# apply filters
57+
my_endpoints = util.apply_selections(endpoints, self.lightbeam.selector, self.lightbeam.exclude)
6458

6559
# make sure we have some endpoints to process
66-
my_endpoints = list(set(selected_endpoints).difference(excluded_endpoints))
6760
if not my_endpoints:
6861
self.logger.critical("selector filtering left no endpoints to process; check your selector for typos?")
6962

63+
# make sure all selectors resolve to an endpoint
64+
unknown_endpoints = set(my_endpoints).difference(endpoints)
65+
if unknown_endpoints:
66+
self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
67+
7068
# all the list(set()) stuff above can mess up the ordering of the endpoints (which must be in dependency-order)... this puts them back in dependency-order
7169
final_endpoints = [x for x in endpoints if x in my_endpoints]
7270

7371
return final_endpoints
7472

7573

76-
@staticmethod
77-
def parse_endpoint_string(full_endpoint_string: str, endpoints=[], all_on_empty=False):
78-
"""
79-
Possible endpoint strings:
80-
- "students"
81-
- "students,schools"
82-
- "student*"
83-
- "student*,schools"
84-
- "*Associations"
85-
- "*Associations,schools"
86-
"""
87-
# If no string is provided, return all or no endpoints, depending on use-case.
88-
if not full_endpoint_string:
89-
if all_on_empty:
90-
return endpoints
91-
else:
92-
return []
93-
94-
# Asterisk wildcards to all endpoints.
95-
if full_endpoint_string == "*":
96-
return endpoints
97-
98-
# Otherwise, a comma-separated list of endpoints is expected.
99-
return_endpoints = set()
100-
101-
for endpoint_string in full_endpoint_string.split(","):
102-
103-
if endpoint_string.startswith("*"): # left wildcard: "*Associations"
104-
return_endpoints.update(
105-
filter(lambda endpoint: endpoint.endswith(endpoint_string.lstrip("*")), endpoints)
106-
)
107-
elif endpoint_string.endswith("*"): # right wildcard: "student*"
108-
return_endpoints.update(
109-
filter(lambda endpoint: endpoint.startswith(endpoint_string.rstrip("*")), endpoints)
110-
)
111-
else: # no wildcard: "students"
112-
return_endpoints.add(endpoint_string)
113-
114-
return list(return_endpoints)
115-
116-
11774
# Returns a client object with exponential retry and other parameters per configs
11875
def get_retry_client(self):
11976
return RetryClient(
@@ -344,12 +301,25 @@ def get_params_for_endpoint(self, endpoint):
344301

345302
def get_required_params_from_swagger(self, swagger, definition, prefix=""):
346303
params = {}
347-
for requiredProperty in swagger["definitions"][definition]["required"]:
348-
if "$ref" in swagger["definitions"][definition]["properties"][requiredProperty].keys():
349-
sub_definition = swagger["definitions"][definition]["properties"][requiredProperty]["$ref"].replace("#/definitions/", "")
304+
use_definitions = False
305+
if "definitions" in swagger.keys():
306+
schema = swagger["definitions"][definition]
307+
use_definitions = True
308+
elif "components" in swagger.keys() and "schemas" in swagger["components"].keys():
309+
schema = swagger["components"]["schemas"][definition]
310+
else:
311+
self.logger.critical(f"Swagger contains neither `definitions` nor `components.schemas` - check that the Swagger is valid.")
312+
313+
for requiredProperty in schema["required"]:
314+
if "$ref" in schema["properties"][requiredProperty].keys():
315+
sub_definition = schema["properties"][requiredProperty]["$ref"]
316+
if use_definitions:
317+
sub_definition = sub_definition.replace("#/definitions/", "")
318+
else:
319+
sub_definition = sub_definition.replace("#/components/schemas/", "")
350320
sub_params = self.get_required_params_from_swagger(swagger, sub_definition, prefix=requiredProperty+".")
351321
for k,v in sub_params.items():
352322
params[k] = v
353-
elif swagger["definitions"][definition]["properties"][requiredProperty]["type"]!="array":
323+
elif schema["properties"][requiredProperty]["type"]!="array":
354324
params[requiredProperty] = prefix + requiredProperty
355325
return params

lightbeam/fetch.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,14 @@ async def get_endpoint_records(self, endpoint, limit, offset, file_handle=None):
8686
if type(values) != list:
8787
self.logger.warn(f"Unable to load records for {endpoint}... API JSON response was not a list of records.")
8888
else:
89+
payload_keys = list(values[0].keys())
90+
final_keys = util.apply_selections(payload_keys, self.lightbeam.keep_keys, self.lightbeam.drop_keys)
91+
do_key_filtering = len(payload_keys) != len(final_keys)
8992
for v in values:
90-
if self.lightbeam.keep_keys!="":
91-
row = {}
92-
for key in self.lightbeam.keep_keys.split(','):
93-
row.update({key: v[key]})
93+
if do_key_filtering: row = {k: v[k] for k in final_keys}
9494
else: row = v
95-
# delete_keys (id, _etag, _lastModifiedDate)
96-
for key in self.lightbeam.drop_keys.split(','):
97-
if key in row.keys():
98-
del row[key]
9995
if file_handle: file_handle.write(json.dumps(row)+"\n")
100-
else: self.lightbeam.results.append(v)
96+
else: self.lightbeam.results.append(row)
10197
self.lightbeam.increment_status_counts(status)
10298
break
10399
else:

lightbeam/send.py

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
import os
23
import time
34
import json
@@ -56,20 +57,16 @@ def send(self):
5657
"total_records_skipped": sum(item['records_skipped'] for item in self.metadata["resources"].values()),
5758
"total_records_failed": sum(item['records_failed'] for item in self.metadata["resources"].values())
5859
})
59-
# total up counts by message and status
60-
for resource, resource_metadata in self.metadata["resources"].items():
61-
if "failed_statuses" in resource_metadata.keys():
62-
for status, status_metadata in resource_metadata["failed_statuses"].items():
63-
total_num_errs = 0
64-
for message, message_metadata in status_metadata.items():
65-
for file, file_metadata in message_metadata["files"].items():
66-
num_errs = len(file_metadata["line_numbers"])
67-
file_metadata.update({
68-
"count": num_errs,
69-
"line_numbers": ",".join(str(x) for x in file_metadata["line_numbers"])
70-
})
71-
total_num_errs += num_errs
72-
status_metadata.update({"count": total_num_errs})
60+
# sort failing line numbers
61+
for resource in self.metadata["resources"].keys():
62+
if "failures" in self.metadata["resources"][resource].keys():
63+
for idx, _ in enumerate(self.metadata["resources"][resource]["failures"]):
64+
self.metadata["resources"][resource]["failures"][idx]["line_numbers"].sort()
65+
66+
67+
# helper function used below
68+
def repl(m):
69+
return re.sub(r"\s+", '', m.group(0))
7370

7471
### Create structured output results_file if necessary
7572
if self.lightbeam.results_file:
@@ -78,7 +75,10 @@ def send(self):
7875
os.makedirs(os.path.dirname(self.lightbeam.results_file), exist_ok=True)
7976

8077
with open(self.lightbeam.results_file, 'w') as fp:
81-
fp.write(json.dumps(self.metadata, indent=4))
78+
content = json.dumps(self.metadata, indent=4)
79+
# json.dumps prints each failures.line_numbers entry on its own line; here we collapse each list onto a single line
80+
content = re.sub(r'"line_numbers": \[(\d|,|\s|\n)*\]', repl, content)
81+
fp.write(content)
8282

8383
if self.metadata["total_records_processed"] == self.metadata["total_records_skipped"]:
8484
self.logger.info("all payloads skipped")
@@ -112,7 +112,7 @@ async def do_send(self, endpoint):
112112
for file_name in data_files:
113113
with open(file_name) as file:
114114
# process each line
115-
for line in file:
115+
for line_counter, line in enumerate(file):
116116
total_counter += 1
117117
data = line.strip()
118118
# compute hash of current row
@@ -123,15 +123,15 @@ async def do_send(self, endpoint):
123123
if self.lightbeam.meets_process_criteria(self.hashlog_data[hash]):
124124
# yes, we need to (re)post it; append to task queue
125125
tasks.append(asyncio.create_task(
126-
self.do_post(endpoint, file_name, data, total_counter, hash)))
126+
self.do_post(endpoint, file_name, data, line_counter, hash)))
127127
else:
128128
# no, do not (re)post
129129
self.lightbeam.num_skipped += 1
130130
continue
131131
else:
132132
# new, never-before-seen payload! append it to task queue
133133
tasks.append(asyncio.create_task(
134-
self.do_post(endpoint, file_name, data, total_counter, hash)))
134+
self.do_post(endpoint, file_name, data, line_counter, hash)))
135135

136136
if total_counter%self.lightbeam.MAX_TASK_QUEUE_SIZE==0:
137137
await self.lightbeam.do_tasks(tasks, total_counter)
@@ -176,19 +176,23 @@ async def do_post(self, endpoint, file_name, data, line, hash):
176176
message = str(response.status) + ": " + util.linearize(json.loads(body).get("message"))
177177

178178
# update run metadata...
179-
failed_statuses_dict = self.metadata["resources"][endpoint].get("failed_statuses", {})
180-
if response.status not in failed_statuses_dict.keys():
181-
failed_statuses_dict.update({response.status: {}})
182-
if message not in failed_statuses_dict[response.status].keys():
183-
failed_statuses_dict[response.status].update({message: {}})
184-
if "files" not in failed_statuses_dict[response.status][message].keys():
185-
failed_statuses_dict[response.status][message].update({"files": {}})
186-
if file_name not in failed_statuses_dict[response.status][message]["files"].keys():
187-
failed_statuses_dict[response.status][message]["files"].update({file_name: {}})
188-
if "line_numbers" not in failed_statuses_dict[response.status][message]["files"][file_name].keys():
189-
failed_statuses_dict[response.status][message]["files"][file_name].update({"line_numbers": []})
190-
failed_statuses_dict[response.status][message]["files"][file_name]["line_numbers"].append(line)
191-
self.metadata["resources"][endpoint]["failed_statuses"] = failed_statuses_dict
179+
failures = self.metadata["resources"][endpoint].get("failures", [])
180+
do_append = True
181+
for index, item in enumerate(failures):
182+
if item["status_code"]==response.status and item["message"]==message and item["file"]==file_name:
183+
failures[index]["line_numbers"].append(line)
184+
failures[index]["count"] += 1
185+
do_append = False
186+
if do_append:
187+
failure = {
188+
'status_code': response.status,
189+
'message': message,
190+
'file': file_name,
191+
'line_numbers': [line],
192+
'count': 1
193+
}
194+
failures.append(failure)
195+
self.metadata["resources"][endpoint]["failures"] = failures
192196

193197
# update output and counters
194198
self.lightbeam.increment_status_reason(message)

lightbeam/util.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
import json
3+
import itertools
34

45
# Strips newlines from a string
56
# Replace single-quotes with backticks
@@ -40,4 +41,36 @@ def interpolate_params(params_structure, payload):
4041
def url_join(*args):
4142
return '/'.join(
4243
map(lambda x: str(x).rstrip('/'), filter(lambda x: x is not None, args))
43-
)
44+
)
45+
46+
# Returns the subset of `keys` that match the `keep` and `drop` criteria, importantly
47+
# respecting wildcards! (so keep=["*Association", "student*"] matches anything beginning
48+
# with "student" or ending with "Association")
49+
# This function is used for both the endpoint selection in apply_filters() of api.py and
50+
# the keep-keys and drop-keys filtering in fetch.py
51+
def apply_selections(keys, keep, drop):
52+
# `keep` and `drop` _should_ be arrays, but in case they're strings, we split them
53+
if isinstance(keep, str): keep = keep.split(",")
54+
if isinstance(drop, str): drop = drop.split(",")
55+
# this will be the filtered set of keys
56+
final_keys = []
57+
# populate `final_keys` with `keys` that match `keep`
58+
if keep and keep != ["*"]:
59+
for payload_key, keep_key in list(itertools.product(keys, keep)):
60+
if (keys_match(payload_key, keep_key)):
61+
final_keys.append(payload_key)
62+
else: final_keys = keys
63+
# remove from `final_keys` keys that match `drop`
64+
if drop and drop != [""]:
65+
for payload_key, drop_key in list(itertools.product(keys, drop)):
66+
if (keys_match(payload_key, drop_key)):
67+
if payload_key in final_keys: final_keys.remove(payload_key)
68+
return final_keys
69+
70+
# Compares a key like "stateAbbreviationDescriptors" with a (potentially wildcard) expression
71+
# like "*Descriptors" for match.
72+
def keys_match(key, wildcard_key):
73+
if key==wildcard_key: return True
74+
if wildcard_key.startswith("*") and key.endswith(wildcard_key.lstrip("*")): return True
75+
if wildcard_key.endswith("*") and key.startswith(wildcard_key.rstrip("*")): return True
76+
return False

lightbeam/validate.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,13 @@ def validate(self):
3131
# Validates a single endpoint based on the Swagger docs
3232
def validate_endpoint(self, swagger, endpoint, local_descriptors=[]):
3333
definition = util.camel_case(self.lightbeam.config["namespace"]) + "_" + util.singularize_endpoint(endpoint)
34-
resource_schema = swagger["definitions"][definition]
35-
34+
if "definitions" in swagger.keys():
35+
resource_schema = swagger["definitions"][definition]
36+
elif "components" in swagger.keys() and "schemas" in swagger["components"].keys():
37+
resource_schema = swagger["components"]["schemas"][definition]
38+
else:
39+
self.logger.critical(f"Swagger contains neither `definitions` nor `components.schemas` - check that the Swagger is valid.")
40+
3641
resolver = RefResolver("test", swagger, swagger)
3742
validator = Draft4Validator(resource_schema, resolver=resolver)
3843
params_structure = self.lightbeam.api.get_params_for_endpoint(endpoint)

0 commit comments

Comments
 (0)