Skip to content

Commit 19e9114

Browse files
authored
Merge pull request #23 from edanalytics/feature/fetch_wildcard_keys
Add ability for fetch --keep-keys and --drop-keys flags to do wildcard matching
2 parents 221b7b6 + b217829 commit 19e9114

4 files changed

Lines changed: 48 additions & 60 deletions

File tree

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ Optionally specify `--keep-keys id` or `-k id` to keep only specific keys from e
120120

121121
Optionally specify `--drop-keys id,_etag,_lastModified` or `-d id` to remove specific keys from every payload. This can be useful if you want to `fetch` data from one Ed-Fi API and then turn around and `send` it to another.
122122

123+
Like [selectors](#selectors), `keep-keys` and `drop-keys` are comma-separated lists of values, each of which may begin or end with an asterisk (`*`) for wildcard matching. Example: `-d _*` would remove properties beginning with an underscore (`_`) character from any `fetch`ed payloads.
124+
123125
## `validate`
124126
```bash
125127
lightbeam validate -c path/to/config.yaml

lightbeam/api.py

Lines changed: 7 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -53,67 +53,24 @@ def prepare(self):
5353

5454

5555
def apply_filters(self, endpoints=[]):
56-
selected_endpoints = self.parse_endpoint_string(self.lightbeam.selector, endpoints=endpoints, all_on_empty=True)
57-
58-
# make sure all selectors resolve to an endpoint
59-
unknown_endpoints = list(set(selected_endpoints).difference(endpoints))
60-
if unknown_endpoints:
61-
self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
62-
63-
excluded_endpoints = self.parse_endpoint_string(self.lightbeam.exclude, endpoints=selected_endpoints)
56+
# apply filters
57+
my_endpoints = util.apply_selections(endpoints, self.lightbeam.selector, self.lightbeam.exclude)
6458

6559
# make sure we have some endpoints to process
66-
my_endpoints = list(set(selected_endpoints).difference(excluded_endpoints))
6760
if not my_endpoints:
6861
self.logger.critical("selector filtering left no endpoints to process; check your selector for typos?")
6962

63+
# make sure all selectors resolve to an endpoint
64+
unknown_endpoints = set(my_endpoints).difference(endpoints)
65+
if unknown_endpoints:
66+
self.logger.critical("no match for selector(s) [{0}] to any endpoint in your API; check for typos?".format(", ".join(unknown_endpoints)))
67+
7068
# re-derive the final list from `endpoints` so the result is guaranteed to be in dependency-order (and free of duplicates), however the selection above was computed
7169
final_endpoints = [x for x in endpoints if x in my_endpoints]
7270

7371
return final_endpoints
7472

7573

76-
@staticmethod
77-
def parse_endpoint_string(full_endpoint_string: str, endpoints=[], all_on_empty=False):
78-
"""
79-
Possible endpoint strings:
80-
- "students"
81-
- "students,schools"
82-
- "student*"
83-
- "student*,schools"
84-
- "*Associations"
85-
- "*Associations,schools"
86-
"""
87-
# If no string is provided, return all or no endpoints, depending on use-case.
88-
if not full_endpoint_string:
89-
if all_on_empty:
90-
return endpoints
91-
else:
92-
return []
93-
94-
# Asterisk wildcards to all endpoints.
95-
if full_endpoint_string == "*":
96-
return endpoints
97-
98-
# Otherwise, a comma-separated list of endpoints is expected.
99-
return_endpoints = set()
100-
101-
for endpoint_string in full_endpoint_string.split(","):
102-
103-
if endpoint_string.startswith("*"): # left wildcard: "*Associations"
104-
return_endpoints.update(
105-
filter(lambda endpoint: endpoint.endswith(endpoint_string.lstrip("*")), endpoints)
106-
)
107-
elif endpoint_string.endswith("*"): # right wildcard: "student*"
108-
return_endpoints.update(
109-
filter(lambda endpoint: endpoint.startswith(endpoint_string.rstrip("*")), endpoints)
110-
)
111-
else: # no wildcard: "students"
112-
return_endpoints.add(endpoint_string)
113-
114-
return list(return_endpoints)
115-
116-
11774
# Returns a client object with exponential retry and other parameters per configs
11875
def get_retry_client(self):
11976
return RetryClient(

lightbeam/fetch.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,14 @@ async def get_endpoint_records(self, endpoint, limit, offset, file_handle=None):
8686
if type(values) != list:
8787
self.logger.warn(f"Unable to load records for {endpoint}... API JSON response was not a list of records.")
8888
else:
89+
payload_keys = list(values[0].keys())
90+
final_keys = util.apply_selections(payload_keys, self.lightbeam.keep_keys, self.lightbeam.drop_keys)
91+
do_key_filtering = len(payload_keys) != len(final_keys)
8992
for v in values:
90-
if self.lightbeam.keep_keys!="":
91-
row = {}
92-
for key in self.lightbeam.keep_keys.split(','):
93-
row.update({key: v[key]})
93+
if do_key_filtering: row = {k: v[k] for k in final_keys}
9494
else: row = v
95-
# delete_keys (id, _etag, _lastModifiedDate)
96-
for key in self.lightbeam.drop_keys.split(','):
97-
if key in row.keys():
98-
del row[key]
9995
if file_handle: file_handle.write(json.dumps(row)+"\n")
100-
else: self.lightbeam.results.append(v)
96+
else: self.lightbeam.results.append(row)
10197
self.lightbeam.increment_status_counts(status)
10298
break
10399
else:

lightbeam/util.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import re
22
import json
3+
import itertools
34

45
# Strips newlines from a string
56
# Replace single-quotes with backticks
@@ -40,4 +41,36 @@ def interpolate_params(params_structure, payload):
4041
def url_join(*args):
4142
return '/'.join(
4243
map(lambda x: str(x).rstrip('/'), filter(lambda x: x is not None, args))
43-
)
44+
)
45+
46+
# Returns the subset of `keys` that match the `keep` and `drop` criteria, importantly
47+
# respecting wildcards! (so keep="*Association,student*" matches anything beginning
48+
# with "student" or ending with "Association")
49+
# This function is used for both the endpoint selection in apply_filters() of api.py and
50+
# the keep-keys and drop-keys filtering in fetch.py
51+
def apply_selections(keys, keep, drop):
    """Return the subset of `keys` selected by `keep` and not removed by `drop`.

    `keep` and `drop` may each be a comma-separated string or a list of
    patterns; every pattern is compared to each key with `keys_match()`, so
    asterisk wildcards are honored. An empty/None `keep` (or one containing
    only empty patterns, e.g. the result of `"".split(",")`) keeps every key.
    An empty/None `drop` drops nothing.

    The result is always a new list: `keys` itself is never modified, the
    original order of `keys` is preserved, and a key that matches several
    `keep` patterns still appears only as often as it appears in `keys`.
    """
    # `keep` and `drop` _should_ be lists, but in case they're strings, we split them
    if isinstance(keep, str):
        keep = keep.split(",")
    if isinstance(drop, str):
        drop = drop.split(",")

    # Discard empty patterns ("".split(",") yields [""], and a trailing comma
    # yields a trailing ""), so that "no patterns given" is handled the same
    # way for both `keep` and `drop`.
    keep_patterns = [pattern for pattern in (keep or []) if pattern]
    drop_patterns = [pattern for pattern in (drop or []) if pattern]

    final_keys = []
    for key in keys:
        # with no keep patterns, every key is kept (a lone "*" also matches every key)
        is_kept = not keep_patterns or any(
            keys_match(key, pattern) for pattern in keep_patterns
        )
        is_dropped = any(keys_match(key, pattern) for pattern in drop_patterns)
        if is_kept and not is_dropped:
            final_keys.append(key)
    return final_keys
69+
70+
# Compares a key like "stateAbbreviationDescriptors" with a (potentially wildcard) expression
71+
# like "*Descriptors" for match.
72+
def keys_match(key, wildcard_key):
    """Return True if `key` matches the (potentially wildcard) `wildcard_key`.

    Supported expressions:
    - "students"       exact match
    - "*Descriptors"   leading asterisk: `key` must end with "Descriptors"
    - "student*"       trailing asterisk: `key` must start with "student"
    - "*School*"       asterisk at both ends: `key` must contain "School"
    - "*"              matches every key

    An asterisk anywhere other than the start or end of `wildcard_key` is not
    treated as a wildcard.
    """
    if key == wildcard_key:
        return True

    has_leading = wildcard_key.startswith("*")
    has_trailing = wildcard_key.endswith("*")
    if not (has_leading or has_trailing):
        return False

    # the literal text that remains once the surrounding asterisks are removed
    fragment = wildcard_key.strip("*")
    if has_leading and has_trailing:
        # also covers a bare "*": the fragment is "" and "" is in every string
        return fragment in key
    if has_leading:
        return key.endswith(fragment)
    return key.startswith(fragment)

0 commit comments

Comments
 (0)