Merge pull request #23 from edanalytics/feature/fetch_wildcard_keys

tomreitz · web-flow · commit 19e911431614 · 2024-04-19T16:04:15.000-05:00
Add ability for fetch --keep-keys and --drop-keys flags to do wildcard matching
diff --git a/README.md b/README.md
@@ -120,6 +120,8 @@ Optionally specify `--keep-keys id` or `-k id` to keep only specific keys from e
 
 Optionally specify `--drop-keys id,_etag,_lastModified` or `-d id` to remove specific keys from every payload. This can be useful if you want to `fetch` data from one Ed-Fi API and then turn around and `send` it to another.
 
+Like [selectors](#selectors), `keep-keys` and `drop-keys` are comma-separated lists of values, each of which may begin or end with an asterisk (`*`) for wildcard matching. Example: `-d _*` would remove properties beginning with an underscore (`_`) character from any `fetch`ed payloads.
+
 ## `validate`
 ```bash
 lightbeam validate -c path/to/config.yaml
diff --git a/lightbeam/api.py b/lightbeam/api.py
@@ -53,67 +53,24 @@ def prepare(self):
 
 
     def apply_filters(self, endpoints=[]):
-        selected_endpoints = self.parse_endpoint_string(self.lightbeam.selector, endpoints=endpoints, all_on_empty=True)
-
-        # make sure all selectors resolve to an endpoint
-        unknown_endpoints = list(set(selected_endpoints).difference(endpoints))
-        if unknown_endpoints:
-            self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
-
-        excluded_endpoints = self.parse_endpoint_string(self.lightbeam.exclude, endpoints=selected_endpoints)
+        # apply filters
+        my_endpoints = util.apply_selections(endpoints, self.lightbeam.selector, self.lightbeam.exclude)
         
         # make sure we have some endpoints to process
-        my_endpoints = list(set(selected_endpoints).difference(excluded_endpoints))
         if not my_endpoints:
             self.logger.critical("selector filtering left no endpoints to process; check your selector for typos?")
 
+        # make sure all selectors resolve to an endpoint
+        unknown_endpoints = set(my_endpoints).difference(endpoints)
+        if unknown_endpoints:
+            self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
+
         # all the list(set()) stuff above can mess up the ordering of the endpoints (which must be in dependency-order)... this puts them back in dependency-order
         final_endpoints = [x for x in endpoints if x in my_endpoints]
         
         return final_endpoints
 
 
-    @staticmethod
-    def parse_endpoint_string(full_endpoint_string: str, endpoints=[], all_on_empty=False):
-        """
-        Possible endpoint strings:
-        - "students"
-        - "students,schools"
-        - "student*"
-        - "student*,schools"
-        - "*Associations"
-        - "*Associations,schools"
-        """
-        # If no string is provided, return all or no endpoints, depending on use-case.
-        if not full_endpoint_string:
-            if all_on_empty:
-                return endpoints
-            else:
-                return []
-        
-        # Asterisk wildcards to all endpoints.
-        if full_endpoint_string == "*":
-            return endpoints
-        
-        # Otherwise, a comma-separated list of endpoints is expected.
-        return_endpoints = set()
-
-        for endpoint_string in full_endpoint_string.split(","):
-
-            if endpoint_string.startswith("*"):  # left wildcard: "*Associations"
-                return_endpoints.update(
-                    filter(lambda endpoint: endpoint.endswith(endpoint_string.lstrip("*")), endpoints)
-                )
-            elif endpoint_string.endswith("*"):  # right wildcard: "student*"
-                return_endpoints.update(
-                    filter(lambda endpoint: endpoint.startswith(endpoint_string.rstrip("*")), endpoints)
-                )
-            else:  # no wildcard: "students"
-                return_endpoints.add(endpoint_string)
-        
-        return list(return_endpoints)
-
-
     # Returns a client object with exponential retry and other parameters per configs
     def get_retry_client(self):
         return RetryClient(
diff --git a/lightbeam/fetch.py b/lightbeam/fetch.py
@@ -86,18 +86,14 @@ async def get_endpoint_records(self, endpoint, limit, offset, file_handle=None):
                             if type(values) != list:
                                 self.logger.warn(f"Unable to load records for {endpoint}... API JSON response was not a list of records.")
                             else:
+                                payload_keys = list(values[0].keys())
+                                final_keys = util.apply_selections(payload_keys, self.lightbeam.keep_keys, self.lightbeam.drop_keys)
+                                do_key_filtering = len(payload_keys) != len(final_keys)
                                 for v in values:
-                                    if self.lightbeam.keep_keys!="":
-                                        row = {}
-                                        for key in self.lightbeam.keep_keys.split(','):
-                                            row.update({key: v[key]})
+                                    if do_key_filtering: row = {k: v[k] for k in final_keys}
                                     else: row = v
-                                    # delete_keys (id, _etag, _lastModifiedDate)
-                                    for key in self.lightbeam.drop_keys.split(','):
-                                        if key in row.keys():
-                                            del row[key]
                                     if file_handle: file_handle.write(json.dumps(row)+"\n")
-                                    else: self.lightbeam.results.append(v)
+                                    else: self.lightbeam.results.append(row)
                                     self.lightbeam.increment_status_counts(status)
                                 break
                         else:
diff --git a/lightbeam/util.py b/lightbeam/util.py
@@ -1,5 +1,6 @@
 import re
 import json
+import itertools
 
 # Strips newlines from a string
 # Replace single-quotes with backticks
@@ -40,4 +41,36 @@ def interpolate_params(params_structure, payload):
 def url_join(*args):
     return '/'.join(
         map(lambda x: str(x).rstrip('/'), filter(lambda x: x is not None, args))
-    )
+    )
+
+# Returns the subset of `keys` that match the `keep` and `drop` criteria, importantly
+# respecting wildcards! (so keep="*Association,student*" matches anything beginning
+# with "student" or ending with "Association")
+# This function is used for both the endpoint selection in apply_filters() of api.py and
+# the keep-keys and drop-keys filtering in fetch.py
+def apply_selections(keys, keep, drop):
+    # `keep` and `drop` _should_ be arrays, but in case they're strings, we split them
+    if isinstance(keep, str): keep = keep.split(",")
+    if isinstance(drop, str): drop = drop.split(",")
+    # this will be the filtered set of keys
+    final_keys = []
+    # populate `final_keys` with `keys` that match `keep`
+    if keep and keep != ["*"]:
+        for payload_key, keep_key in list(itertools.product(keys, keep)):
+            if (keys_match(payload_key, keep_key)):
+                final_keys.append(payload_key)
+    else: final_keys = keys
+    # remove from `final_keys` keys that match `drop`
+    if drop and drop != [""]:
+        for payload_key, drop_key in list(itertools.product(keys, drop)):
+            if (keys_match(payload_key, drop_key)):
+                if payload_key in final_keys: final_keys.remove(payload_key)
+    return final_keys
+
+# Compares a key like "stateAbbreviationDescriptors" with a (potentially wildcard) expression
+# like "*Descriptors" and reports whether the key matches it.
+def keys_match(key, wildcard_key):
+    if key==wildcard_key: return True
+    if wildcard_key.startswith("*") and key.endswith(wildcard_key.lstrip("*")): return True
+    if wildcard_key.endswith("*") and key.startswith(wildcard_key.rstrip("*")): return True
+    return False