
Commit a00effe

chore: reorganize dedupe code (#14641)

* Add match-only deduplication helpers and close-old queryset extraction. Expose match_batch_* and match_batch_of_findings for read-only matching. Support unsaved findings in location/endpoint comparison and _is_candidate_older. Refactor default_importer close_old_findings to use get_close_old_findings_queryset. Restore batch deduplication debug logging. (A usage sketch follows the dojo/finding/deduplication.py diff below.)

* Batch-refresh close_old_findings status fields to avoid N refresh_from_db queries. Replace per-finding refresh_from_db(false_p, risk_accepted, out_of_scope) with one values() query for all PKs and assign onto instances, falling back to refresh_from_db when a row is missing. (A sketch of this approach follows the dojo/importers/default_importer.py diff below.)

* docs: cite #12291 for close_old_findings status refresh origin

* perf: chunk close_old_findings status sync queries (1000 PKs per SELECT)

* fix(parsers): use unsaved_tags instead of tags= in the Finding constructor for performance. Passing tags= directly to the Finding() constructor triggers expensive tagulous processing for every finding. Using finding.unsaved_tags instead bypasses this overhead and lets the import pipeline handle tags efficiently. Affected parsers: jfrog_xray_unified, dependency_check, cargo_audit, anchore_grype, threat_composer. Benchmark on 14,219 findings: 99s -> 7.97s (12x faster). (The pattern is sketched directly below this message.)

* fix: resolve ruff D203 and COM812 lint errors from formatter conflict

* fix: update tests to check unsaved_tags instead of tags

* fix: correct unsaved_tags assertions to expect lists and fix tag ordering. Update tests for dependency_check and jfrog_xray_unified parsers to match the actual list format returned by unsaved_tags, and fix the expected order of tags for the suppressed-without-notes case in dependency_check.

* fix(reimport): do not update finding tags on reimport for matched findings. Tags from the report were being appended to matched findings via tags.add(), causing tags to accumulate across reimports instead of being left unchanged. This aligns tag handling with how other finding fields are treated on reimport. Closes #14606

Co-authored-by: Cody Maffucci <46459665+Maffooch@users.noreply.github.com>
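The unsaved_tags change described above touches parsers whose diffs are not shown on this page, so here is a minimal sketch of the pattern, assuming the import pipeline consumes finding.unsaved_tags the same way it consumes other unsaved_* attributes. The helper name build_finding and its parameters are illustrative only, and the snippet presumes a configured DefectDojo/Django environment.

from dojo.models import Finding


def build_finding(test, vuln_id, package_name, severity, matcher):
    # Slow pattern avoided by this commit: passing tags= to the constructor
    # triggers tagulous tag processing for every finding.
    #   finding = Finding(title=..., test=test, severity=severity, tags=[matcher])

    # Pattern described in the commit message: create the Finding without tags
    # and stash a plain list on unsaved_tags for the import pipeline to apply later.
    finding = Finding(
        title=f"{vuln_id} in {package_name}",
        test=test,
        severity=severity,
    )
    finding.unsaved_tags = [matcher.replace("-matcher", "")]
    return finding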
1 parent 0349f01 commit a00effe

7 files changed

Lines changed: 259 additions & 164 deletions


dojo/finding/deduplication.py

Lines changed: 135 additions & 13 deletions
@@ -217,14 +217,17 @@ def is_deduplication_on_engagement_mismatch(new_finding, to_duplicate_finding):
     return False


-def get_endpoints_as_url(finding):
-    # Fix for https://github.com/DefectDojo/django-DefectDojo/issues/10215
-    # When endpoints lack a protocol (scheme), str(e) returns a string like "10.20.197.218:6379"
-    # without the "//" prefix. hyperlink.parse() then misinterprets the hostname as the scheme.
-    # We replicate the behavior from dojo/endpoint/utils.py line 265: prepend "//" if "://" is missing
-    # to ensure hyperlink.parse() correctly identifies host, port, and path components.
+def get_endpoints_as_url(endpoints):
+    """
+    Convert a list of Endpoint objects to parsed hyperlink URLs.
+
+    Fix for https://github.com/DefectDojo/django-DefectDojo/issues/10215
+    When endpoints lack a protocol (scheme), str(e) returns a string like "10.20.197.218:6379"
+    without the "//" prefix. hyperlink.parse() then misinterprets the hostname as the scheme.
+    We prepend "//" if "://" is missing to ensure correct parsing.
+    """
     urls = []
-    for e in finding.endpoints.all():
+    for e in endpoints:
         endpoint_str = str(e)
         if "://" not in endpoint_str:
             endpoint_str = "//" + endpoint_str
@@ -242,8 +245,9 @@ def are_urls_equal(url1, url2, fields):
     return True


-def finding_locations(finding):
-    return [ref.location.url for ref in finding.locations.all()]
+def finding_locations(location_refs):
+    """Extract URLs from a list of location references."""
+    return [ref.location.url for ref in location_refs]


 def are_location_urls_equal(url1, url2, fields):
@@ -266,8 +270,11 @@ def are_locations_duplicates(new_finding, to_duplicate_finding):
        return True

    if settings.V3_FEATURE_LOCATIONS:
-        list1 = finding_locations(new_finding)
-        list2 = finding_locations(to_duplicate_finding)
+        # Use unsaved_locations for unsaved findings (preview mode), saved M2M otherwise
+        locs1 = new_finding.locations.all() if new_finding.pk else getattr(new_finding, "unsaved_locations", [])
+        locs2 = to_duplicate_finding.locations.all() if to_duplicate_finding.pk else getattr(to_duplicate_finding, "unsaved_locations", [])
+        list1 = finding_locations(locs1)
+        list2 = finding_locations(locs2)

        deduplicationLogger.debug(
            f"Starting deduplication by location fields for finding {new_finding.id} with locations {list1} and finding {to_duplicate_finding.id} with locations {list2}",
@@ -284,8 +291,11 @@ def are_locations_duplicates(new_finding, to_duplicate_finding):
        deduplicationLogger.debug(f"locations are not duplicates: {new_finding.id} and {to_duplicate_finding.id}")
        return False
    # TODO: Delete this after the move to Locations
-    list1 = get_endpoints_as_url(new_finding)
-    list2 = get_endpoints_as_url(to_duplicate_finding)
+    # Use unsaved_endpoints for unsaved findings (preview mode), saved M2M otherwise
+    eps1 = new_finding.endpoints.all() if new_finding.pk else getattr(new_finding, "unsaved_endpoints", [])
+    eps2 = to_duplicate_finding.endpoints.all() if to_duplicate_finding.pk else getattr(to_duplicate_finding, "unsaved_endpoints", [])
+    list1 = get_endpoints_as_url(eps1)
+    list2 = get_endpoints_as_url(eps2)

    deduplicationLogger.debug(
        f"Starting deduplication by endpoint fields for finding {new_finding.id} with urls {list1} and finding {to_duplicate_finding.id} with urls {list2}",
@@ -535,6 +545,9 @@ def find_candidates_for_reimport_legacy(test, findings, service=None):


 def _is_candidate_older(new_finding, candidate):
+    # Unsaved findings (e.g. preview mode) have no PK — all DB candidates are older by definition
+    if new_finding.pk is None:
+        return True
     # Ensure the newer finding is marked as duplicate of the older finding
     is_older = candidate.id < new_finding.id
     if not is_older:
@@ -715,7 +728,116 @@ def _flush_duplicate_changes(modified_new_findings):
     return modified_new_findings


+# ---------------------------------------------------------------------------
+# Match-only functions (read-only, no DB writes)
+# These return [(new_finding, matched_candidate), ...] without persisting.
+# Used by both the regular dedup pipeline and the Pro import/reimport preview engine.
+# ---------------------------------------------------------------------------
+
+
+def match_batch_hash_code(findings):
+    """Find dedup matches by hash_code without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
+    if not candidates_by_hash:
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_unique_id(findings):
+    """Find dedup matches by unique_id_from_tool without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
+    if not candidates_by_uid:
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_uid_or_hash(findings):
+    """Find dedup matches by uid or hash_code without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
+    if not (candidates_by_uid or existing_by_hash):
+        return []
+    matches = []
+    for new_finding in findings:
+        if new_finding.duplicate:
+            continue
+        for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_legacy(findings):
+    """Find dedup matches by legacy algorithm without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
+    if not (candidates_by_title or candidates_by_cwe):
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_of_findings(findings):
+    """
+    Batch match findings against existing candidates without persisting.
+
+    Returns list of (new_finding, matched_candidate) tuples.
+    Works with both saved and unsaved findings.
+    """
+    if not findings:
+        return []
+    enabled = System_Settings.objects.get().enable_deduplication
+    if not enabled:
+        return []
+    # Only sort by id for saved findings; unsaved findings have no id
+    if findings[0].pk is not None:
+        findings = sorted(findings, key=attrgetter("id"))
+    test = findings[0].test
+    dedup_alg = test.deduplication_algorithm
+    if dedup_alg == settings.DEDUPE_ALGO_HASH_CODE:
+        return match_batch_hash_code(findings)
+    if dedup_alg == settings.DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL:
+        return match_batch_unique_id(findings)
+    if dedup_alg == settings.DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE:
+        return match_batch_uid_or_hash(findings)
+    return match_batch_legacy(findings)
+
+
+# ---------------------------------------------------------------------------
+# Batch dedup functions (match + persist)
+# These call the match-only functions above and then persist the results.
+# ---------------------------------------------------------------------------
+
+
 def _dedupe_batch_hash_code(findings):
+    # NOTE: These functions intentionally interleave matching and set_duplicate()
+    # rather than calling the match_batch_*() functions above. This is because
+    # set_duplicate() modifies finding.duplicate in-memory, which affects the
+    # duplicate check in subsequent loop iterations (especially for uid_or_hash).
     if not findings:
         return []
     test = findings[0].test
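
As a rough illustration of how a caller might use the new read-only helpers above, a hedged usage sketch follows; the calling code and the unsaved_findings variable are hypothetical and not part of this commit.

# Hypothetical preview call: report which incoming (possibly unsaved) findings
# would be treated as duplicates, without persisting duplicate flags or notes.
matches = match_batch_of_findings(unsaved_findings)
for new_finding, candidate in matches:
    deduplicationLogger.debug(
        f"preview: {new_finding.title!r} would match existing finding id={candidate.id}",
    )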

dojo/importers/default_importer.py

Lines changed: 36 additions & 27 deletions
@@ -317,35 +317,12 @@ def process_findings(

        return new_findings

-    def close_old_findings(
-        self,
-        findings: list[Finding],
-        **kwargs: dict,
-    ) -> list[Finding]:
+    def get_close_old_findings_queryset(self, new_hash_codes, new_unique_ids_from_tool):
         """
-        Closes old findings based on a hash code match at either the product
-        or the engagement scope. Closing an old finding entails setting the
-        finding to mitigated status, setting all location statuses to mitigated,
-        as well as leaving a note on the finding indicating that it was mitigated
-        because the vulnerability is no longer present in the submitted scan report.
-        """
-        # First check if close old findings is desired
-        if not self.close_old_findings_toggle:
-            return []
+        Build queryset of findings that would be closed, without closing them.

-        logger.debug("IMPORT_SCAN: Closing findings no longer present in scan report")
-        # Remove all the findings that are coming from the report already mitigated
-        new_hash_codes = []
-        new_unique_ids_from_tool = []
-        for finding in findings.values():
-            # Do not process closed findings in the report
-            if finding.get("is_mitigated", False):
-                continue
-            # Grab the hash code
-            if (hash_code := finding.get("hash_code")) is not None:
-                new_hash_codes.append(hash_code)
-            if (unique_id_from_tool := finding.get("unique_id_from_tool")) is not None:
-                new_unique_ids_from_tool.append(unique_id_from_tool)
+        Reusable by preview engines to count findings that would be closed.
+        """
         # Get the initial filtered list of old findings to be closed without
         # considering the scope of the product or engagement
         # Include both active findings and risk-accepted findings (which have active=False)
@@ -382,6 +359,38 @@ def close_old_findings(
            old_findings = old_findings.filter(service=self.service)
        else:
            old_findings = old_findings.filter(Q(service__isnull=True) | Q(service__exact=""))
+        return old_findings
+
+    def close_old_findings(
+        self,
+        findings: list[Finding],
+        **kwargs: dict,
+    ) -> list[Finding]:
+        """
+        Closes old findings based on a hash code match at either the product
+        or the engagement scope. Closing an old finding entails setting the
+        finding to mitigated status, setting all location statuses to mitigated,
+        as well as leaving a note on the finding indicating that it was mitigated
+        because the vulnerability is no longer present in the submitted scan report.
+        """
+        # First check if close old findings is desired
+        if not self.close_old_findings_toggle:
+            return []
+
+        logger.debug("IMPORT_SCAN: Closing findings no longer present in scan report")
+        # Remove all the findings that are coming from the report already mitigated
+        new_hash_codes = []
+        new_unique_ids_from_tool = []
+        for finding in findings.values():
+            # Do not process closed findings in the report
+            if finding.get("is_mitigated", False):
+                continue
+            # Grab the hash code
+            if (hash_code := finding.get("hash_code")) is not None:
+                new_hash_codes.append(hash_code)
+            if (unique_id_from_tool := finding.get("unique_id_from_tool")) is not None:
+                new_unique_ids_from_tool.append(unique_id_from_tool)
+        old_findings = self.get_close_old_findings_queryset(new_hash_codes, new_unique_ids_from_tool)
         # Update the status of the findings and any locations
         for old_finding in old_findings:
             url = str(get_full_url(reverse("view_test", args=(self.test.id,))))
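
The batch status refresh and 1000-PK chunking mentioned in the commit message are not part of the hunks shown above. The following is a minimal sketch of that approach as described there, using the false_p, risk_accepted, and out_of_scope fields named in the commit message; the function name, chunk constant, and fallback handling are illustrative rather than the commit's actual implementation.

from dojo.models import Finding

CHUNK_SIZE = 1000  # the commit message cites 1000 PKs per SELECT


def sync_status_fields(old_findings):
    # Refresh false_p / risk_accepted / out_of_scope on already-loaded instances
    # with one values() query per chunk instead of one refresh_from_db() per finding.
    by_pk = {f.pk: f for f in old_findings}
    pks = list(by_pk)
    for start in range(0, len(pks), CHUNK_SIZE):
        chunk = pks[start:start + CHUNK_SIZE]
        rows = Finding.objects.filter(pk__in=chunk).values(
            "pk", "false_p", "risk_accepted", "out_of_scope",
        )
        seen = set()
        for row in rows:
            finding = by_pk[row["pk"]]
            finding.false_p = row["false_p"]
            finding.risk_accepted = row["risk_accepted"]
            finding.out_of_scope = row["out_of_scope"]
            seen.add(row["pk"])
        # Per the commit message, fall back to refresh_from_db for any instance
        # not covered by the values() result.
        for pk in chunk:
            if pk not in seen:
                by_pk[pk].refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])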

dojo/tools/anchore_grype/parser.py

Lines changed: 18 additions & 33 deletions
@@ -76,7 +76,8 @@ def get_findings(self, file, test):
                rel_epss = related_vulnerability.get("epss")
                rel_vuln_id = related_vulnerability.get("id")
            vulnerability_ids = self.get_vulnerability_ids(
-                vuln_id, related_vulnerabilities,
+                vuln_id,
+                related_vulnerabilities,
            )

            matches = item["matchDetails"]
@@ -87,37 +88,25 @@ def get_findings(self, file, test):
            artifact_purl = artifact.get("purl")
            artifact_location = artifact.get("locations")
            file_path = None
-            if (
-                artifact_location
-                and len(artifact_location) > 0
-                and artifact_location[0].get("path")
-            ):
+            if artifact_location and len(artifact_location) > 0 and artifact_location[0].get("path"):
                file_path = artifact_location[0].get("path")

            finding_title = f"{vuln_id} in {artifact_name}:{artifact_version}"

            finding_tags = None
            finding_description = ""
            if vuln_namespace:
-                finding_description += (
-                    f"**Vulnerability Namespace:** {vuln_namespace}"
-                )
+                finding_description += f"**Vulnerability Namespace:** {vuln_namespace}"
            if vuln_description:
-                finding_description += (
-                    f"\n**Vulnerability Description:** {vuln_description}"
-                )
+                finding_description += f"\n**Vulnerability Description:** {vuln_description}"
            if rel_description and rel_description != vuln_description:
                finding_description += f"\n**Related Vulnerability Description:** {rel_description}"
            if matches:
                if isinstance(item["matchDetails"], dict):
-                    finding_description += (
-                        f"\n**Matcher:** {matches['matcher']}"
-                    )
+                    finding_description += f"\n**Matcher:** {matches['matcher']}"
                    finding_tags = [matches["matcher"].replace("-matcher", "")]
                elif len(matches) == 1:
-                    finding_description += (
-                        f"\n**Matcher:** {matches[0]['matcher']}"
-                    )
+                    finding_description += f"\n**Matcher:** {matches[0]['matcher']}"
                    finding_tags = [
                        matches[0]["matcher"].replace("-matcher", ""),
                    ]
@@ -148,30 +137,22 @@ def get_findings(self, file, test):

            finding_references = ""
            if vuln_datasource:
-                finding_references += (
-                    f"**Vulnerability Datasource:** {vuln_datasource}\n"
-                )
+                finding_references += f"**Vulnerability Datasource:** {vuln_datasource}\n"
            if vuln_urls:
                if len(vuln_urls) == 1:
                    if vuln_urls[0] != vuln_datasource:
-                        finding_references += (
-                            f"**Vulnerability URL:** {vuln_urls[0]}\n"
-                        )
+                        finding_references += f"**Vulnerability URL:** {vuln_urls[0]}\n"
                else:
                    finding_references += "**Vulnerability URLs:**\n"
                    for url in vuln_urls:
                        if url != vuln_datasource:
                            finding_references += f"- {url}\n"
            if rel_datasource:
-                finding_references += (
-                    f"**Related Vulnerability Datasource:** {rel_datasource}\n"
-                )
+                finding_references += f"**Related Vulnerability Datasource:** {rel_datasource}\n"
            if rel_urls:
                if len(rel_urls) == 1:
                    if rel_urls[0] != vuln_datasource:
-                        finding_references += (
-                            f"**Related Vulnerability URL:** {rel_urls[0]}\n"
-                        )
+                        finding_references += f"**Related Vulnerability URL:** {rel_urls[0]}\n"
                else:
                    finding_references += "**Related Vulnerability URLs:**\n"
                    for url in rel_urls:
@@ -246,7 +227,8 @@ def get_cvss(self, cvss):
            vector = cvss_item["vector"]
            cvss_objects = cvss_parser.parse_cvss_from_text(vector)
            if len(cvss_objects) > 0 and isinstance(
-                cvss_objects[0], CVSS3,
+                cvss_objects[0],
+                CVSS3,
            ):
                return vector
        return None
@@ -276,8 +258,11 @@ def get_vulnerability_ids(self, vuln_id, related_vulnerabilities):
        if vuln_id:
            vulnerability_ids.append(vuln_id)
        if related_vulnerabilities:
-            vulnerability_ids.extend(related_vulnerability_id for related_vulnerability in related_vulnerabilities
-                                     if (related_vulnerability_id := related_vulnerability.get("id")))
+            vulnerability_ids.extend(
+                related_vulnerability_id
+                for related_vulnerability in related_vulnerabilities
+                if (related_vulnerability_id := related_vulnerability.get("id"))
+            )
        if vulnerability_ids:
            return vulnerability_ids
        return None
