Skip to content

Commit b89788c

Browse files
committed
[Test] Improve stability, debuggability and coverage of test_dcv_configuration:
* debuggability: retrieve, print and analyze a comprehensive report of crashes (not only the crash filename, but the stack trace of the crash). Also, moved from hard assertions to soft assertions to have a final report of all the observed failures. * stability: prevent false positive failures, by ignoring harmless crashes related to gnome, unrelated to nvidia or dcv. Also fixed a gap that was causing failures when multiple instances of this test are executed in parallel by serializing the modifications to ssh known_hosts. * coverage: the test is now able to detect crashes on all supported OSs, not only Ubuntu.
1 parent f06ab20 commit b89788c

3 files changed

Lines changed: 283 additions & 56 deletions

File tree

tests/integration-tests/tests/dcv/test_dcv.py

Lines changed: 129 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# or in the "LICENSE.txt" file accompanying this file.
1010
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111
# See the License for the specific language governing permissions and limitations under the License.
12+
import contextlib
13+
import fcntl
14+
import json
1215
import logging
1316
import os as operating_system
1417
import re
@@ -35,6 +38,18 @@
3538
SERVER_URL = "https://localhost"
3639
DCV_CONNECT_SCRIPT = "/opt/parallelcluster/scripts/pcluster_dcv_connect.sh"
3740

41+
# Crashes matching any of these patterns are never tolerated, regardless of TOLERATED_CRASH_PATTERNS.
# They take precedence: a crash mentioning dcv or nvidia must fail the test even if it also
# matches a tolerated pattern below.
UNTOLERATED_CRASH_PATTERNS = [
    re.compile(r"dcv|nvidia", re.IGNORECASE),
]

# Tolerated crash patterns: list of regex patterns.
# A crash is tolerated if it is unrelated to DCV and the software stack owned by ParallelCluster.
TOLERATED_CRASH_PATTERNS = [
    # gnome-software segfaults in libadwaita related to animated scrolling of UI widget, observed on RHEL9/Rocky9
    # DOTALL lets ".*" span the multiple report lines between the process name and the stack frame.
    re.compile(r"gnome-software.*scroll_to \(libadwaita", re.DOTALL),
]
52+
3853

3954
def test_dcv_configuration(region, instance, os, scheduler, pcluster_config_reader, clusters_factory, test_datadir):
4055
host_ip = get_local_ip()
@@ -78,36 +93,67 @@ def _test_dcv_configuration(
7893
head_node_remote_command_executor = RemoteCommandExecutor(cluster)
7994
login_node_remote_command_executor = RemoteCommandExecutor(cluster, use_login_node=True)
8095

81-
# check configuration parameters of the head and login nodes
82-
check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from)
83-
check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from, login_pool_name="pool")
84-
8596
shared_dir = f"/home/{get_username_for_os(os)}"
8697

87-
# test dcv connect show url for head and login node
88-
_test_show_url(cluster, region, dcv_port, access_from)
89-
_test_show_url(cluster, region, dcv_port, access_from, use_login_node=True)
90-
91-
# launch a session and verify the authenticator works
92-
_test_authenticator(head_node_remote_command_executor, dcv_authenticator_port, shared_dir, os)
93-
_test_authenticator(login_node_remote_command_executor, dcv_authenticator_port, shared_dir, os)
94-
95-
# check error cases
96-
_check_error_cases(head_node_remote_command_executor, dcv_authenticator_port)
97-
_check_error_cases(login_node_remote_command_executor, dcv_authenticator_port)
98-
99-
# check shared dir configuration
100-
_check_shared_dir(head_node_remote_command_executor, shared_dir)
101-
_check_shared_dir(login_node_remote_command_executor, shared_dir)
102-
103-
# Ensure no system programs crashed
104-
_check_no_crashes(head_node_remote_command_executor, test_datadir)
105-
_check_no_crashes(login_node_remote_command_executor, test_datadir)
106-
107-
# Check that logs are stored in CloudWatch as expected
108-
FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
109-
cluster, scheduler, os, "dcv_enabled", region, shared_dir
110-
)
98+
checks = [
99+
(
100+
"check_node_security_group (head node)",
101+
lambda: check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from),
102+
),
103+
(
104+
"check_node_security_group (login node)",
105+
lambda: check_node_security_group(
106+
region, cluster, dcv_port, expected_cidr=access_from, login_pool_name="pool"
107+
),
108+
),
109+
("dcv connect show url (head node)", lambda: _test_show_url(cluster, region, dcv_port, access_from)),
110+
(
111+
"dcv connect show url (login node)",
112+
lambda: _test_show_url(cluster, region, dcv_port, access_from, use_login_node=True),
113+
),
114+
(
115+
"authenticator (head node)",
116+
lambda: _test_authenticator(head_node_remote_command_executor, dcv_authenticator_port, shared_dir, os),
117+
),
118+
(
119+
"authenticator (login node)",
120+
lambda: _test_authenticator(login_node_remote_command_executor, dcv_authenticator_port, shared_dir, os),
121+
),
122+
(
123+
"error cases (head node)",
124+
lambda: _check_error_cases(head_node_remote_command_executor, dcv_authenticator_port),
125+
),
126+
(
127+
"error cases (login node)",
128+
lambda: _check_error_cases(login_node_remote_command_executor, dcv_authenticator_port),
129+
),
130+
("shared dir (head node)", lambda: _check_shared_dir(head_node_remote_command_executor, shared_dir)),
131+
("shared dir (login node)", lambda: _check_shared_dir(login_node_remote_command_executor, shared_dir)),
132+
("no crashes (head node)", lambda: _assert_no_crashes(head_node_remote_command_executor, test_datadir)),
133+
("no crashes (login node)", lambda: _assert_no_crashes(login_node_remote_command_executor, test_datadir)),
134+
(
135+
"cloudwatch logs",
136+
lambda: FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
137+
cluster, scheduler, os, "dcv_enabled", region, shared_dir
138+
),
139+
),
140+
]
141+
142+
failures = []
143+
for check_name, check_fn in checks:
144+
try:
145+
check_fn()
146+
except Exception as e:
147+
logging.error("Soft assertion failed for '%s': %s", check_name, e)
148+
failures.append(f"{check_name}: {e}")
149+
150+
if failures:
151+
formatted = []
152+
for i, f in enumerate(failures):
153+
# Unescape literal \n and \t sequences so the output is human-readable
154+
readable = f.replace("\\n", "\n").replace("\\t", "\t")
155+
formatted.append(f" [{i+1}] {readable}")
156+
pytest.fail(f"{len(failures)} DCV configuration check(s) failed:\n" + "\n".join(formatted))
111157

112158

113159
def _check_auth_ko(remote_command_executor, dcv_authenticator_port, params, expected_message):
@@ -138,9 +184,37 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session
138184
).is_equal_to('<auth result="yes"><username>{0}</username></auth>'.format(username))
139185

140186

141-
def _check_no_crashes(remote_command_executor, test_datadir):
142-
"""Verify no core files in /var/crash, which on ubuntu18 causes a popup when logging into the 1st session."""
143-
remote_command_executor.run_remote_script(str(test_datadir / "verify_no_core_files.sh"))
187+
def _get_crash_report(remote_command_executor, test_datadir):
    """Collect crash information from the node as a dictionary.

    Executes a remote script that inspects the crash locations used by every
    pcluster-supported OS (Ubuntu, AL2, AL2023, RHEL8/9, Rocky8/9) and prints
    a JSON object mapping each crash file path to its human-readable content.

    Returns an empty dict when the node has no recorded crashes.
    """
    script_path = str(test_datadir / "get_crash_report.sh")
    # pty=False keeps stdout clean so it can be parsed as JSON.
    execution_result = remote_command_executor.run_remote_script(script_path, pty=False)
    return json.loads(execution_result.stdout)
198+
199+
200+
def _is_tolerated_crash(content):
    """Return True when the crash content matches a tolerated pattern.

    Untolerated patterns take precedence: content matching any of them is
    rejected even if it also matches a tolerated pattern.
    """
    if any(pattern.search(content) for pattern in UNTOLERATED_CRASH_PATTERNS):
        return False
    return any(pattern.search(content) for pattern in TOLERATED_CRASH_PATTERNS)
209+
210+
211+
def _assert_no_crashes(remote_command_executor, test_datadir):
    """Get crash report, log all crashes, and fail only on non-tolerated ones.

    Every crash found on the node is logged (tolerated ones included) so the
    test output always carries the full report for debugging; the assertion
    fails only when at least one crash does not match a tolerated pattern.
    """
    crash_report = _get_crash_report(remote_command_executor, test_datadir)
    if crash_report:
        logging.warning("Crash report for %s:\n%s", remote_command_executor.target, json.dumps(crash_report, indent=2))
        untolerated = {path: content for path, content in crash_report.items() if not _is_tolerated_crash(content)}
        # described_as makes the failure message list the offending crash files,
        # instead of the bare "expected empty" produced by is_empty() alone.
        assert_that(untolerated).described_as(
            f"Untolerated crashes found on {remote_command_executor.target}: {sorted(untolerated)}"
        ).is_empty()
144218

145219

146220
def _get_known_hosts_content(host_keys_file):
@@ -153,6 +227,7 @@ def _get_known_hosts_content(host_keys_file):
153227

154228
def _check_error_cases(remote_command_executor, dcv_authenticator_port):
155229
"""Check DCV errors for both head and login nodes."""
230+
logging.info(f"Checking expected authentication failure on {remote_command_executor.target}")
156231
_check_auth_ko(
157232
remote_command_executor,
158233
dcv_authenticator_port,
@@ -165,6 +240,20 @@ def _check_error_cases(remote_command_executor, dcv_authenticator_port):
165240
_check_auth_ko(
166241
remote_command_executor, dcv_authenticator_port, "-d action=requestToken -d authUser=centos", "Wrong parameters"
167242
)
243+
logging.info(f"Completed checks for authentication failure on {remote_command_executor.target}")
244+
245+
246+
@contextlib.contextmanager
def _temporary_known_host(hostname, host_keys_file, env):
    """Temporarily register SSH host keys for *hostname* in *host_keys_file*.

    Keys are added on entry and removed on exit, even on error. An exclusive
    flock on a sibling ".lock" file serializes concurrent modifications of the
    same known_hosts file across test processes; the lock is released
    automatically when the lock file handle is closed.
    """
    with open(f"{host_keys_file}.lock", "w") as lock_handle:
        fcntl.flock(lock_handle, fcntl.LOCK_EX)
        try:
            add_keys_to_known_hosts(hostname, host_keys_file)
            yield
        finally:
            # Clean up even if adding the keys (or the caller's body) failed.
            remove_keys_from_known_hosts(hostname, host_keys_file, env=env)
168257

169258

170259
def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False): # noqa: C901
@@ -174,7 +263,6 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
174263

175264
node_ip = cluster.get_login_node_public_ip() if use_login_node else cluster.head_node_ip
176265

177-
# add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt
178266
# Ensure known_hosts path exists to avoid `cat` command returning non-zero exit when testing in ADC region.
179267
host_keys_file = operating_system.path.expanduser("~/.ssh/known_hosts")
180268
host_keys_path = Path(host_keys_file)
@@ -186,18 +274,18 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
186274
except Exception as e:
187275
logging.warning(f"Failed to prepare known_hosts file {host_keys_file}: {e}")
188276

189-
add_keys_to_known_hosts(node_ip, host_keys_file)
190-
191277
dcv_connect_args = ["pcluster", "dcv-connect", "--cluster-name", cluster.name, "--show-url"]
192278

193279
if use_login_node:
194280
dcv_connect_args.extend(["--login-node-ip", node_ip])
195281

196-
try:
197-
result = run_pcluster_command(dcv_connect_args, env=env)
198-
finally:
199-
# remove ssh key from jenkins user known hosts file
200-
remove_keys_from_known_hosts(node_ip, host_keys_file, env=env)
282+
with _temporary_known_host(node_ip, host_keys_file, env):
283+
try:
284+
result = run_pcluster_command(dcv_connect_args, env=env)
285+
except subprocess.CalledProcessError as e:
286+
raise AssertionError(
287+
f"Command {e.cmd} failed (exit {e.returncode}).\nstderr: {e.stderr}\nstdout: {e.stdout}"
288+
) from e
201289

202290
assert_that(result.stdout).matches(
203291
r"Please use the following one-time URL in your browser within 30 seconds:\n"
@@ -221,9 +309,8 @@ def _test_authenticator(remote_command_executor, dcv_authenticator_port, shared_
221309
dcv_session_token = dcv_parameters.group(3)
222310
_check_auth_ok(remote_command_executor, dcv_authenticator_port, dcv_session_id, dcv_session_token, os)
223311
else:
224-
print(
312+
assert_that(dcv_parameters).described_as(
225313
"Command '{0} {1}' fails, output: {2}, error: {3}".format(
226314
DCV_CONNECT_SCRIPT, shared_dir, command_execution.stdout, command_execution.stderr
227315
)
228-
)
229-
raise AssertionError
316+
).is_not_none()
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/bin/bash
2+
# Scans for crash files based on the detected OS and outputs a JSON
3+
# dictionary: { "crash_file_path": "content", ... }
4+
# Always exits 0 — the caller decides whether to fail.
5+
# Log lines are emitted to stderr so they don't pollute the JSON on stdout.
6+
7+
SCRIPT_NAME=$(basename "$0")

# Log helpers: all diagnostics go to stderr so stdout stays valid JSON.
function log_info() { echo "[${SCRIPT_NAME}] INFO: $*" >&2; }

function log_error() { echo "[${SCRIPT_NAME}] ERROR: $*" >&2; }

# --- JSON helper using python3 for safe escaping ---
# Accumulates comma-separated '"path": "content"' pairs; the script wraps them
# in braces when printing the final JSON object.
crash_entries=""

# add_entry PATH CONTENT
# Appends one JSON key/value pair to crash_entries. python3's json.dumps is
# used so arbitrary content (quotes, newlines, non-ASCII) is escaped correctly.
function add_entry() {
    local path="$1"
    local content="$2"
    log_info "Adding crash entry: ${path}"
    if [ -n "${crash_entries}" ]; then
        crash_entries="${crash_entries},"
    fi
    crash_entries="${crash_entries}$(python3 -c "
import json, sys
print(json.dumps(sys.argv[1]) + ': ' + json.dumps(sys.argv[2]))
" "${path}" "${content}")"
}
28+
29+
# read_crash_content FILEPATH
# Prints a human-readable excerpt of a crash file, or of an ABRT crash
# directory (one subdirectory per crash, containing per-field files).
function read_crash_content() {
    local filepath="$1"
    if [ -d "${filepath}" ]; then
        # ABRT directory: concatenate the text fields, skipping the binary coredump.
        local content=""
        for field in "${filepath}"/*; do
            [ -f "${field}" ] || continue
            fname=$(basename "${field}")
            [ "${fname}" = "coredump" ] && continue
            if file "${field}" 2>/dev/null | grep -q "text"; then
                content="${content}${fname}:\n$(tail -100 "${field}")\n\n"
            fi
        done
        # -e expands the literal \n separators accumulated above.
        echo -e "${content}"
    elif file "${filepath}" 2>/dev/null | grep -q "text"; then
        # Plain-text crash report: the last 100 lines are enough for a stack trace.
        tail -100 "${filepath}"
    else
        # Binary file (e.g. a raw core): fall back to its printable strings.
        strings "${filepath}" 2>/dev/null | head -100
    fi
}
49+
50+
# scan_directory CRASHDIR
# Adds one crash entry per file/subdirectory found in CRASHDIR (used for
# Apport's /var/crash and ABRT's /var/spool/abrt). No-op if the directory is
# missing or empty.
function scan_directory() {
    local crashdir="$1"
    log_info "Scanning directory ${crashdir}..."
    if [ ! -d "${crashdir}" ]; then
        log_info "Directory ${crashdir} does not exist, skipping."
        return
    fi
    files="$(ls -A "${crashdir}" 2>/dev/null)"
    if [ -z "${files}" ]; then
        log_info "No crash files found in ${crashdir}."
        return
    fi
    log_info "Found files in ${crashdir}: ${files}"
    # Iterate with find instead of word-splitting ${files}: crash file names may
    # contain spaces, which would previously split one path into several bogus
    # entries. Process substitution keeps the loop in the current shell so
    # add_entry's updates to crash_entries are preserved.
    while IFS= read -r filepath; do
        log_info "Reading crash file: ${filepath}"
        content=$(read_crash_content "${filepath}")
        add_entry "${filepath}" "${content}"
    done < <(find "${crashdir}" -mindepth 1 -maxdepth 1 2>/dev/null)
}
70+
71+
# scan_coredumpctl
# Adds a single "coredumpctl" crash entry when systemd-coredump has recorded
# any crashes. No-op if coredumpctl is unavailable or the journal is empty.
function scan_coredumpctl() {
    log_info "Checking coredumpctl for systemd-coredump entries..."
    if ! command -v coredumpctl > /dev/null 2>&1; then
        log_info "coredumpctl not found, skipping."
        return
    fi
    dump_list=$(coredumpctl list --no-pager --no-legend 2>/dev/null)
    if [ -z "${dump_list}" ]; then
        log_info "No coredump entries found via coredumpctl."
        return
    fi
    log_info "Found coredump entries via coredumpctl."
    # Cap the detailed info output so a flood of dumps can't bloat the JSON.
    content=$(coredumpctl info --no-pager 2>/dev/null | head -100)
    add_entry "coredumpctl" "${content}"
}
86+
87+
# --- Detect OS from /etc/os-release ---
88+
# --- Detect OS from /etc/os-release ---
log_info "Reading /etc/os-release..."
if [ -f /etc/os-release ]; then
    # Sourcing os-release defines ID and VERSION_ID.
    . /etc/os-release
else
    log_error "/etc/os-release not found. Cannot determine OS."
    exit 1
fi

if [ -z "${ID}" ]; then
    log_error "Could not determine OS from /etc/os-release (ID is empty)."
    exit 1
fi

log_info "Detected OS: ID=${ID}, VERSION_ID=${VERSION_ID}"

# --- Select crash scan logic based on OS ---
# Each unsupported OS/version fails loudly: silently reporting "no crashes"
# on an OS with an unknown crash-collection mechanism would be a false negative.
log_info "Selecting crash scan strategy for OS '${ID}' version '${VERSION_ID}'..."
case "${ID}" in
    ubuntu)
        log_info "Using Apport strategy (Ubuntu)."
        scan_directory /var/crash
        ;;
    amzn)
        case "${VERSION_ID}" in
            2)
                log_info "Using ABRT strategy (Amazon Linux 2)."
                scan_directory /var/spool/abrt
                ;;
            2023)
                log_info "Using systemd-coredump strategy (Amazon Linux 2023)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported Amazon Linux version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    rhel)
        case "${VERSION_ID%%.*}" in
            8)
                log_info "Using ABRT strategy (RHEL 8)."
                scan_directory /var/spool/abrt
                ;;
            9)
                log_info "Using systemd-coredump strategy (RHEL 9)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported RHEL version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    rocky)
        case "${VERSION_ID%%.*}" in
            8)
                log_info "Using ABRT strategy (Rocky 8)."
                scan_directory /var/spool/abrt
                ;;
            9)
                log_info "Using systemd-coredump strategy (Rocky 9)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported Rocky version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    *)
        log_error "Unsupported OS '${ID}' (VERSION_ID='${VERSION_ID}'). Cannot determine crash file locations."
        exit 1
        ;;
esac

log_info "Crash scan complete."
# Always exit 0 on a successful scan; the caller interprets the JSON.
echo "{${crash_entries}}"
exit 0

0 commit comments

Comments
 (0)