Skip to content

Commit ed3eee1

Browse files
authored
Merge pull request #67 from edanalytics/feature/validate_fixes_and_uniqueness_in_array_elements
`validate` uniqueness fixes and recurse into array elements, other improvements
2 parents 6a7f805 + ebc84f4 commit ed3eee1

2 files changed

Lines changed: 61 additions & 30 deletions

File tree

lightbeam/util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ def get_swagger_ref_for_endpoint(namespace, swagger, endpoint):
9090
def resolve_swagger_ref(swagger, ref):
9191
if "definitions" in swagger.keys():
9292
definition = ref.replace("#/definitions/", "")
93-
return swagger["definitions"][definition]
93+
return swagger["definitions"].get(definition, None)
9494
elif "components" in swagger.keys() and "schemas" in swagger["components"].keys():
9595
definition = ref.replace("#/components/schemas/", "")
96-
return swagger["components"]["schemas"][definition]
96+
return swagger["components"]["schemas"].get(definition, None)

lightbeam/validate.py

Lines changed: 59 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,24 @@ async def validate_endpoint(self, endpoint):
197197
"records_skipped": 0,
198198
"records_failed": 0
199199
})
200+
# structures to support testing uniqueness across payloads:
201+
definition = self.get_swagger_definition_for_endpoint(endpoint)
202+
if "Descriptor" in endpoint:
203+
swagger = self.lightbeam.api.descriptors_swagger
204+
else:
205+
swagger = self.lightbeam.api.resources_swagger
206+
207+
if "definitions" in swagger.keys():
208+
resource_schema = swagger["definitions"][definition]
209+
elif "components" in swagger.keys() and "schemas" in swagger["components"].keys():
210+
resource_schema = swagger["components"]["schemas"][definition]
211+
else:
212+
self.logger.critical(f"Swagger contains neither `definitions` nor `components.schemas` - check that the Swagger is valid.")
213+
self.uniqueness_hashes = { endpoint: [] }
214+
self.identity_params_structures = {}
215+
self.schema_resolver = RefResolver("test", swagger, swagger)
216+
self.schema_validator = Draft4Validator(resource_schema, resolver=self.schema_resolver)
217+
200218
for file_name in data_files:
201219
self.logger.info(f"validating {file_name} against {definition} schema...")
202220
with open(file_name) as file:
@@ -237,29 +255,18 @@ async def validate_endpoint(self, endpoint):
237255
num_others = self.lightbeam.num_errors - self.MAX_VALIDATION_ERRORS_TO_DISPLAY
238256
if self.lightbeam.num_errors > self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
239257
self.logger.warn(f"... and {num_others} others!")
240-
self.logger.warn(f"... VALIDATION ERRORS on {self.lightbeam.num_errors} of {line_number} lines in {file_name}; see details above.")
258+
self.logger.warn(f"... VALIDATION ERRORS on {self.lightbeam.num_errors} of {line_counter} lines in {file_name}; see details above.")
259+
260+
# free up some memory
261+
self.uniqueness_hashes = {}
262+
self.identity_params_structures = {}
263+
self.schema_resolver = None
264+
self.schema_validator = None
241265

242266

243267
async def do_validate_payload(self, endpoint, file_name, data, line_number):
244268
if self.fail_fast_threshold is not None and self.lightbeam.num_errors >= self.fail_fast_threshold: return
245-
definition = self.get_swagger_definition_for_endpoint(endpoint)
246-
if "Descriptor" in endpoint:
247-
swagger = self.lightbeam.api.descriptors_swagger
248-
else:
249-
swagger = self.lightbeam.api.resources_swagger
250-
251-
if "definitions" in swagger.keys():
252-
resource_schema = swagger["definitions"][definition]
253-
elif "components" in swagger.keys() and "schemas" in swagger["components"].keys():
254-
resource_schema = swagger["components"]["schemas"][definition]
255-
else:
256-
self.logger.critical(f"Swagger contains neither `definitions` nor `components.schemas` - check that the Swagger is valid.")
257269

258-
resolver = RefResolver("test", swagger, swagger)
259-
validator = Draft4Validator(resource_schema, resolver=resolver)
260-
identity_params_structure = self.lightbeam.api.get_params_for_endpoint(endpoint, type='identity')
261-
distinct_params = []
262-
263270
# check payload is valid JSON
264271
try:
265272
payload = json.loads(data)
@@ -270,7 +277,7 @@ async def do_validate_payload(self, endpoint, file_name, data, line_number):
270277
# check payload obeys Swagger schema
271278
if "schema" in self.validation_methods:
272279
try:
273-
validator.validate(payload)
280+
self.schema_validator.validate(payload)
274281
except Exception as e:
275282
e_path = [str(x) for x in list(e.path)]
276283
context = ""
@@ -286,14 +293,13 @@ async def do_validate_payload(self, endpoint, file_name, data, line_number):
286293
return
287294

288295
# check natural keys are unique
296+
if not self.identity_params_structures.get(endpoint, False):
297+
self.identity_params_structures[endpoint] = self.lightbeam.api.get_params_for_endpoint(endpoint, type='identity')
289298
if "uniqueness" in self.validation_methods:
290-
params = json.dumps(util.interpolate_params(identity_params_structure, payload))
291-
params_hash = hashlog.get_hash(params)
292-
if params_hash in distinct_params:
293-
self.log_validation_error(endpoint, file_name, line_number, "uniqueness", "duplicate value(s) for natural key(s): {params}")
294-
return
295-
else: distinct_params.append(params_hash)
296-
299+
error_message = self.violates_uniqueness(endpoint, payload, path="")
300+
if error_message != "":
301+
self.log_validation_error(endpoint, file_name, line_counter, "uniqueness", error_message)
302+
297303
# check references values are valid
298304
if "references" in self.validation_methods and "Descriptor" not in endpoint: # Descriptors have no references
299305
self.lightbeam.api.do_oauth()
@@ -304,7 +310,7 @@ async def do_validate_payload(self, endpoint, file_name, data, line_number):
304310

305311
def log_validation_error(self, endpoint, file_name, line_number, method, message):
306312
if self.lightbeam.num_errors < self.MAX_VALIDATION_ERRORS_TO_DISPLAY:
307-
self.logger.warning(f"... VALIDATION ERROR (line {line_number}): {message}")
313+
self.logger.warning(f"... VALIDATION ERROR ({method} at line {line_number}): {message}")
308314
self.lightbeam.num_errors += 1
309315

310316
# update run metadata...
@@ -326,6 +332,31 @@ def log_validation_error(self, endpoint, file_name, line_number, method, message
326332
failures.append(failure)
327333
self.lightbeam.metadata["resources"][endpoint]["failures"] = failures
328334

335+
def violates_uniqueness(self, endpoint, payload, path=""):
336+
params = json.dumps(util.interpolate_params(self.identity_params_structures[endpoint], payload))
337+
params_hash = hashlog.get_hash(params)
338+
if params_hash in self.uniqueness_hashes[endpoint]:
339+
return f"duplicate value(s) for identity key(s): " + ("(at "+path+"): " if path!="" else ": ") + f"{params}"
340+
else:
341+
self.uniqueness_hashes[endpoint].append(params_hash)
342+
# (recursively) check uniqueness of items in arrays
343+
swagger = self.lightbeam.api.resources_swagger
344+
endpoint_def = util.get_swagger_ref_for_endpoint(self.lightbeam.config.get('namespace', ''), swagger, endpoint)
345+
for k in payload.keys():
346+
if isinstance(payload[k], list):
347+
subarray_definition = util.resolve_swagger_ref(swagger, endpoint_def)
348+
if subarray_definition:
349+
subarray_ref = subarray_definition['properties'][k].get('items',{}).get('$ref','')
350+
if not self.identity_params_structures.get(subarray_ref, False):
351+
self.identity_params_structures[subarray_ref] = self.lightbeam.api.get_identity_params_from_swagger(swagger, subarray_ref)
352+
if subarray_ref not in self.uniqueness_hashes.keys():
353+
self.uniqueness_hashes[subarray_ref] = []
354+
for i in range(0, len(payload[k])):
355+
value = self.violates_uniqueness(subarray_ref, payload[k][i], path+("." if path!="" else "") + f"{k}[{i}]")
356+
if value!="": return value
357+
return ""
358+
359+
329360
def load_local_descriptors(self):
330361
local_descriptors = []
331362
all_endpoints = self.lightbeam.api.get_sorted_endpoints()
@@ -347,7 +378,7 @@ def has_invalid_descriptor_values(self, payload, path=""):
347378
if value!="": return value
348379
elif isinstance(payload[k], list):
349380
for i in range(0, len(payload[k])):
350-
value = self.has_invalid_descriptor_values(payload[k][i], path+("." if path!="" else "")+k+"["+str(i)+"]")
381+
value = self.has_invalid_descriptor_values(payload[k][i], path+("." if path!="" else "") + f"{k}[{i}]")
351382
if value!="": return value
352383
elif isinstance(payload[k], str) and k.endswith("Descriptor"):
353384
if "#" not in payload[k]:

0 commit comments

Comments
 (0)