
Commit a00effe

chore: reorganize dedupe code (#14641)

* Add match-only deduplication helpers and close-old queryset extraction. Expose match_batch_* and match_batch_of_findings for read-only matching. Support unsaved findings in location/endpoint comparison and _is_candidate_older. Refactor default_importer close_old_findings to use get_close_old_findings_queryset. Restore batch deduplication debug logging. (A usage sketch follows the dojo/finding/deduplication.py diff below.)

* Batch-refresh close_old_findings status fields to avoid N refresh_from_db queries. Replace per-finding refresh_from_db(false_p, risk_accepted, out_of_scope) with one values() query for all PKs and assign onto instances, falling back to refresh_from_db when a row is missing. (A sketch of this approach follows the dojo/importers/default_importer.py diff below.)

* docs: cite #12291 for close_old_findings status refresh origin

* perf: chunk close_old_findings status sync queries (1000 PKs per SELECT)

* fix(parsers): use unsaved_tags instead of tags= in the Finding constructor for performance. Passing tags= directly to the Finding() constructor triggers expensive tagulous processing for every finding. Using finding.unsaved_tags instead bypasses this overhead and lets the import pipeline handle tags efficiently. Affected parsers: jfrog_xray_unified, dependency_check, cargo_audit, anchore_grype, threat_composer. Benchmark on 14,219 findings: 99s -> 7.97s (12x faster). (The pattern is sketched directly below this message.)

* fix: resolve ruff D203 and COM812 lint errors from formatter conflict

* fix: update tests to check unsaved_tags instead of tags

* fix: correct unsaved_tags assertions to expect lists and fix tag ordering. Update tests for dependency_check and jfrog_xray_unified parsers to match the actual list format returned by unsaved_tags, and fix the expected order of tags for the suppressed-without-notes case in dependency_check.

* fix(reimport): do not update finding tags on reimport for matched findings. Tags from the report were being appended to matched findings via tags.add(), causing tags to accumulate across reimports instead of being left unchanged. This aligns tag handling with how other finding fields are treated on reimport. Closes #14606

Co-authored-by: Cody Maffucci <46459665+Maffooch@users.noreply.github.com>
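The unsaved_tags change described above touches parsers whose diffs are not shown on this page, so here is a minimal sketch of the pattern, assuming the import pipeline consumes finding.unsaved_tags the same way it consumes other unsaved_* attributes. The helper name build_finding and its parameters are illustrative only, and the snippet presumes a configured DefectDojo/Django environment.

from dojo.models import Finding


def build_finding(test, vuln_id, package_name, severity, matcher):
    # Slow pattern avoided by this commit: passing tags= to the constructor
    # triggers tagulous tag processing for every finding.
    #   finding = Finding(title=..., test=test, severity=severity, tags=[matcher])

    # Pattern described in the commit message: create the Finding without tags
    # and stash a plain list on unsaved_tags for the import pipeline to apply later.
    finding = Finding(
        title=f"{vuln_id} in {package_name}",
        test=test,
        severity=severity,
    )
    finding.unsaved_tags = [matcher.replace("-matcher", "")]
    return finding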
1 parent 0349f01 commit a00effe

7 files changed

Lines changed: 259 additions & 164 deletions


dojo/finding/deduplication.py

Lines changed: 135 additions & 13 deletions
@@ -217,14 +217,17 @@ def is_deduplication_on_engagement_mismatch(new_finding, to_duplicate_finding):
     return False


-def get_endpoints_as_url(finding):
-    # Fix for https://github.com/DefectDojo/django-DefectDojo/issues/10215
-    # When endpoints lack a protocol (scheme), str(e) returns a string like "10.20.197.218:6379"
-    # without the "//" prefix. hyperlink.parse() then misinterprets the hostname as the scheme.
-    # We replicate the behavior from dojo/endpoint/utils.py line 265: prepend "//" if "://" is missing
-    # to ensure hyperlink.parse() correctly identifies host, port, and path components.
+def get_endpoints_as_url(endpoints):
+    """
+    Convert a list of Endpoint objects to parsed hyperlink URLs.
+
+    Fix for https://github.com/DefectDojo/django-DefectDojo/issues/10215
+    When endpoints lack a protocol (scheme), str(e) returns a string like "10.20.197.218:6379"
+    without the "//" prefix. hyperlink.parse() then misinterprets the hostname as the scheme.
+    We prepend "//" if "://" is missing to ensure correct parsing.
+    """
     urls = []
-    for e in finding.endpoints.all():
+    for e in endpoints:
         endpoint_str = str(e)
         if "://" not in endpoint_str:
             endpoint_str = "//" + endpoint_str
@@ -242,8 +245,9 @@ def are_urls_equal(url1, url2, fields):
     return True


-def finding_locations(finding):
-    return [ref.location.url for ref in finding.locations.all()]
+def finding_locations(location_refs):
+    """Extract URLs from a list of location references."""
+    return [ref.location.url for ref in location_refs]


 def are_location_urls_equal(url1, url2, fields):
@@ -266,8 +270,11 @@ def are_locations_duplicates(new_finding, to_duplicate_finding):
        return True

    if settings.V3_FEATURE_LOCATIONS:
-        list1 = finding_locations(new_finding)
-        list2 = finding_locations(to_duplicate_finding)
+        # Use unsaved_locations for unsaved findings (preview mode), saved M2M otherwise
+        locs1 = new_finding.locations.all() if new_finding.pk else getattr(new_finding, "unsaved_locations", [])
+        locs2 = to_duplicate_finding.locations.all() if to_duplicate_finding.pk else getattr(to_duplicate_finding, "unsaved_locations", [])
+        list1 = finding_locations(locs1)
+        list2 = finding_locations(locs2)

        deduplicationLogger.debug(
            f"Starting deduplication by location fields for finding {new_finding.id} with locations {list1} and finding {to_duplicate_finding.id} with locations {list2}",
@@ -284,8 +291,11 @@ def are_locations_duplicates(new_finding, to_duplicate_finding):
        deduplicationLogger.debug(f"locations are not duplicates: {new_finding.id} and {to_duplicate_finding.id}")
        return False
    # TODO: Delete this after the move to Locations
-    list1 = get_endpoints_as_url(new_finding)
-    list2 = get_endpoints_as_url(to_duplicate_finding)
+    # Use unsaved_endpoints for unsaved findings (preview mode), saved M2M otherwise
+    eps1 = new_finding.endpoints.all() if new_finding.pk else getattr(new_finding, "unsaved_endpoints", [])
+    eps2 = to_duplicate_finding.endpoints.all() if to_duplicate_finding.pk else getattr(to_duplicate_finding, "unsaved_endpoints", [])
+    list1 = get_endpoints_as_url(eps1)
+    list2 = get_endpoints_as_url(eps2)

    deduplicationLogger.debug(
        f"Starting deduplication by endpoint fields for finding {new_finding.id} with urls {list1} and finding {to_duplicate_finding.id} with urls {list2}",
@@ -535,6 +545,9 @@ def find_candidates_for_reimport_legacy(test, findings, service=None):


 def _is_candidate_older(new_finding, candidate):
+    # Unsaved findings (e.g. preview mode) have no PK — all DB candidates are older by definition
+    if new_finding.pk is None:
+        return True
     # Ensure the newer finding is marked as duplicate of the older finding
     is_older = candidate.id < new_finding.id
     if not is_older:
@@ -715,7 +728,116 @@ def _flush_duplicate_changes(modified_new_findings):
     return modified_new_findings


+# ---------------------------------------------------------------------------
+# Match-only functions (read-only, no DB writes)
+# These return [(new_finding, matched_candidate), ...] without persisting.
+# Used by both the regular dedup pipeline and the Pro import/reimport preview engine.
+# ---------------------------------------------------------------------------
+
+
+def match_batch_hash_code(findings):
+    """Find dedup matches by hash_code without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_hash = find_candidates_for_deduplication_hash(test, findings)
+    if not candidates_by_hash:
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_hash_candidates(new_finding, candidates_by_hash):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_unique_id(findings):
+    """Find dedup matches by unique_id_from_tool without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_uid = find_candidates_for_deduplication_unique_id(test, findings)
+    if not candidates_by_uid:
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_unique_id_candidates(new_finding, candidates_by_uid):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_uid_or_hash(findings):
+    """Find dedup matches by uid or hash_code without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_uid, existing_by_hash = find_candidates_for_deduplication_uid_or_hash(test, findings)
+    if not (candidates_by_uid or existing_by_hash):
+        return []
+    matches = []
+    for new_finding in findings:
+        if new_finding.duplicate:
+            continue
+        for match in get_matches_from_uid_or_hash_candidates(new_finding, candidates_by_uid, existing_by_hash):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_legacy(findings):
+    """Find dedup matches by legacy algorithm without persisting. Returns [(finding, candidate), ...]."""
+    if not findings:
+        return []
+    test = findings[0].test
+    candidates_by_title, candidates_by_cwe = find_candidates_for_deduplication_legacy(test, findings)
+    if not (candidates_by_title or candidates_by_cwe):
+        return []
+    matches = []
+    for new_finding in findings:
+        for match in get_matches_from_legacy_candidates(new_finding, candidates_by_title, candidates_by_cwe):
+            matches.append((new_finding, match))
+            break
+    return matches
+
+
+def match_batch_of_findings(findings):
+    """
+    Batch match findings against existing candidates without persisting.
+
+    Returns list of (new_finding, matched_candidate) tuples.
+    Works with both saved and unsaved findings.
+    """
+    if not findings:
+        return []
+    enabled = System_Settings.objects.get().enable_deduplication
+    if not enabled:
+        return []
+    # Only sort by id for saved findings; unsaved findings have no id
+    if findings[0].pk is not None:
+        findings = sorted(findings, key=attrgetter("id"))
+    test = findings[0].test
+    dedup_alg = test.deduplication_algorithm
+    if dedup_alg == settings.DEDUPE_ALGO_HASH_CODE:
+        return match_batch_hash_code(findings)
+    if dedup_alg == settings.DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL:
+        return match_batch_unique_id(findings)
+    if dedup_alg == settings.DEDUPE_ALGO_UNIQUE_ID_FROM_TOOL_OR_HASH_CODE:
+        return match_batch_uid_or_hash(findings)
+    return match_batch_legacy(findings)
+
+
+# ---------------------------------------------------------------------------
+# Batch dedup functions (match + persist)
+# These call the match-only functions above and then persist the results.
+# ---------------------------------------------------------------------------
+
+
 def _dedupe_batch_hash_code(findings):
+    # NOTE: These functions intentionally interleave matching and set_duplicate()
+    # rather than calling the match_batch_*() functions above. This is because
+    # set_duplicate() modifies finding.duplicate in-memory, which affects the
+    # duplicate check in subsequent loop iterations (especially for uid_or_hash).
     if not findings:
         return []
     test = findings[0].test
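
As a rough illustration of how a caller might use the new read-only helpers above, a hedged usage sketch follows; the calling code and the unsaved_findings variable are hypothetical and not part of this commit.

# Hypothetical preview call: report which incoming (possibly unsaved) findings
# would be treated as duplicates, without persisting duplicate flags or notes.
matches = match_batch_of_findings(unsaved_findings)
for new_finding, candidate in matches:
    deduplicationLogger.debug(
        f"preview: {new_finding.title!r} would match existing finding id={candidate.id}",
    )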

dojo/importers/default_importer.py

Lines changed: 36 additions & 27 deletions
@@ -317,35 +317,12 @@ def process_findings(

        return new_findings

-    def close_old_findings(
-        self,
-        findings: list[Finding],
-        **kwargs: dict,
-    ) -> list[Finding]:
+    def get_close_old_findings_queryset(self, new_hash_codes, new_unique_ids_from_tool):
         """
-        Closes old findings based on a hash code match at either the product
-        or the engagement scope. Closing an old finding entails setting the
-        finding to mitigated status, setting all location statuses to mitigated,
-        as well as leaving a note on the finding indicating that it was mitigated
-        because the vulnerability is no longer present in the submitted scan report.
-        """
-        # First check if close old findings is desired
-        if not self.close_old_findings_toggle:
-            return []
+        Build queryset of findings that would be closed, without closing them.

-        logger.debug("IMPORT_SCAN: Closing findings no longer present in scan report")
-        # Remove all the findings that are coming from the report already mitigated
-        new_hash_codes = []
-        new_unique_ids_from_tool = []
-        for finding in findings.values():
-            # Do not process closed findings in the report
-            if finding.get("is_mitigated", False):
-                continue
-            # Grab the hash code
-            if (hash_code := finding.get("hash_code")) is not None:
-                new_hash_codes.append(hash_code)
-            if (unique_id_from_tool := finding.get("unique_id_from_tool")) is not None:
-                new_unique_ids_from_tool.append(unique_id_from_tool)
+        Reusable by preview engines to count findings that would be closed.
+        """
         # Get the initial filtered list of old findings to be closed without
         # considering the scope of the product or engagement
         # Include both active findings and risk-accepted findings (which have active=False)
@@ -382,6 +359,38 @@ def close_old_findings(
            old_findings = old_findings.filter(service=self.service)
        else:
            old_findings = old_findings.filter(Q(service__isnull=True) | Q(service__exact=""))
+        return old_findings
+
+    def close_old_findings(
+        self,
+        findings: list[Finding],
+        **kwargs: dict,
+    ) -> list[Finding]:
+        """
+        Closes old findings based on a hash code match at either the product
+        or the engagement scope. Closing an old finding entails setting the
+        finding to mitigated status, setting all location statuses to mitigated,
+        as well as leaving a note on the finding indicating that it was mitigated
+        because the vulnerability is no longer present in the submitted scan report.
+        """
+        # First check if close old findings is desired
+        if not self.close_old_findings_toggle:
+            return []
+
+        logger.debug("IMPORT_SCAN: Closing findings no longer present in scan report")
+        # Remove all the findings that are coming from the report already mitigated
+        new_hash_codes = []
+        new_unique_ids_from_tool = []
+        for finding in findings.values():
+            # Do not process closed findings in the report
+            if finding.get("is_mitigated", False):
+                continue
+            # Grab the hash code
+            if (hash_code := finding.get("hash_code")) is not None:
+                new_hash_codes.append(hash_code)
+            if (unique_id_from_tool := finding.get("unique_id_from_tool")) is not None:
+                new_unique_ids_from_tool.append(unique_id_from_tool)
+        old_findings = self.get_close_old_findings_queryset(new_hash_codes, new_unique_ids_from_tool)
         # Update the status of the findings and any locations
         for old_finding in old_findings:
             url = str(get_full_url(reverse("view_test", args=(self.test.id,))))
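
The batch status refresh and 1000-PK chunking mentioned in the commit message are not part of the hunks shown above. The following is a minimal sketch of that approach as described there, using the false_p, risk_accepted, and out_of_scope fields named in the commit message; the function name, chunk constant, and fallback handling are illustrative rather than the commit's actual implementation.

from dojo.models import Finding

CHUNK_SIZE = 1000  # the commit message cites 1000 PKs per SELECT


def sync_status_fields(old_findings):
    # Refresh false_p / risk_accepted / out_of_scope on already-loaded instances
    # with one values() query per chunk instead of one refresh_from_db() per finding.
    by_pk = {f.pk: f for f in old_findings}
    pks = list(by_pk)
    for start in range(0, len(pks), CHUNK_SIZE):
        chunk = pks[start:start + CHUNK_SIZE]
        rows = Finding.objects.filter(pk__in=chunk).values(
            "pk", "false_p", "risk_accepted", "out_of_scope",
        )
        seen = set()
        for row in rows:
            finding = by_pk[row["pk"]]
            finding.false_p = row["false_p"]
            finding.risk_accepted = row["risk_accepted"]
            finding.out_of_scope = row["out_of_scope"]
            seen.add(row["pk"])
        # Per the commit message, fall back to refresh_from_db for any instance
        # not covered by the values() result.
        for pk in chunk:
            if pk not in seen:
                by_pk[pk].refresh_from_db(fields=["false_p", "risk_accepted", "out_of_scope"])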

dojo/tools/anchore_grype/parser.py

Lines changed: 18 additions & 33 deletions
@@ -76,7 +76,8 @@ def get_findings(self, file, test):
                rel_epss = related_vulnerability.get("epss")
                rel_vuln_id = related_vulnerability.get("id")
            vulnerability_ids = self.get_vulnerability_ids(
-                vuln_id, related_vulnerabilities,
+                vuln_id,
+                related_vulnerabilities,
            )

            matches = item["matchDetails"]
@@ -87,37 +88,25 @@ def get_findings(self, file, test):
            artifact_purl = artifact.get("purl")
            artifact_location = artifact.get("locations")
            file_path = None
-            if (
-                artifact_location
-                and len(artifact_location) > 0
-                and artifact_location[0].get("path")
-            ):
+            if artifact_location and len(artifact_location) > 0 and artifact_location[0].get("path"):
                file_path = artifact_location[0].get("path")

            finding_title = f"{vuln_id} in {artifact_name}:{artifact_version}"

            finding_tags = None
            finding_description = ""
            if vuln_namespace:
-                finding_description += (
-                    f"**Vulnerability Namespace:** {vuln_namespace}"
-                )
+                finding_description += f"**Vulnerability Namespace:** {vuln_namespace}"
            if vuln_description:
-                finding_description += (
-                    f"\n**Vulnerability Description:** {vuln_description}"
-                )
+                finding_description += f"\n**Vulnerability Description:** {vuln_description}"
            if rel_description and rel_description != vuln_description:
                finding_description += f"\n**Related Vulnerability Description:** {rel_description}"
            if matches:
                if isinstance(item["matchDetails"], dict):
-                    finding_description += (
-                        f"\n**Matcher:** {matches['matcher']}"
-                    )
+                    finding_description += f"\n**Matcher:** {matches['matcher']}"
                    finding_tags = [matches["matcher"].replace("-matcher", "")]
                elif len(matches) == 1:
-                    finding_description += (
-                        f"\n**Matcher:** {matches[0]['matcher']}"
-                    )
+                    finding_description += f"\n**Matcher:** {matches[0]['matcher']}"
                    finding_tags = [
                        matches[0]["matcher"].replace("-matcher", ""),
                    ]
@@ -148,30 +137,22 @@ def get_findings(self, file, test):

            finding_references = ""
            if vuln_datasource:
-                finding_references += (
-                    f"**Vulnerability Datasource:** {vuln_datasource}\n"
-                )
+                finding_references += f"**Vulnerability Datasource:** {vuln_datasource}\n"
            if vuln_urls:
                if len(vuln_urls) == 1:
                    if vuln_urls[0] != vuln_datasource:
-                        finding_references += (
-                            f"**Vulnerability URL:** {vuln_urls[0]}\n"
-                        )
+                        finding_references += f"**Vulnerability URL:** {vuln_urls[0]}\n"
                else:
                    finding_references += "**Vulnerability URLs:**\n"
                    for url in vuln_urls:
                        if url != vuln_datasource:
                            finding_references += f"- {url}\n"
            if rel_datasource:
-                finding_references += (
-                    f"**Related Vulnerability Datasource:** {rel_datasource}\n"
-                )
+                finding_references += f"**Related Vulnerability Datasource:** {rel_datasource}\n"
            if rel_urls:
                if len(rel_urls) == 1:
                    if rel_urls[0] != vuln_datasource:
-                        finding_references += (
-                            f"**Related Vulnerability URL:** {rel_urls[0]}\n"
-                        )
+                        finding_references += f"**Related Vulnerability URL:** {rel_urls[0]}\n"
                else:
                    finding_references += "**Related Vulnerability URLs:**\n"
                    for url in rel_urls:
@@ -246,7 +227,8 @@ def get_cvss(self, cvss):
            vector = cvss_item["vector"]
            cvss_objects = cvss_parser.parse_cvss_from_text(vector)
            if len(cvss_objects) > 0 and isinstance(
-                cvss_objects[0], CVSS3,
+                cvss_objects[0],
+                CVSS3,
            ):
                return vector
        return None
@@ -276,8 +258,11 @@ def get_vulnerability_ids(self, vuln_id, related_vulnerabilities):
        if vuln_id:
            vulnerability_ids.append(vuln_id)
        if related_vulnerabilities:
-            vulnerability_ids.extend(related_vulnerability_id for related_vulnerability in related_vulnerabilities
-                                     if (related_vulnerability_id := related_vulnerability.get("id")))
+            vulnerability_ids.extend(
+                related_vulnerability_id
+                for related_vulnerability in related_vulnerabilities
+                if (related_vulnerability_id := related_vulnerability.get("id"))
+            )
        if vulnerability_ids:
            return vulnerability_ids
        return None
