DRAFT [Test] Make test_dcv_configuration able to print out the content of crash files if detected.

gmarciani · gmarciani · commit 9816039e3869 · 2026-03-30T18:30:23.000-04:00
diff --git a/tests/integration-tests/tests/dcv/test_dcv.py b/tests/integration-tests/tests/dcv/test_dcv.py
@@ -9,6 +9,7 @@
 # or in the "LICENSE.txt" file accompanying this file.
 # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import json
 import logging
 import os as operating_system
 import re
@@ -100,9 +101,11 @@ def _test_dcv_configuration(
     _check_shared_dir(head_node_remote_command_executor, shared_dir)
     _check_shared_dir(login_node_remote_command_executor, shared_dir)
 
-    # Ensure no system programs crashed
-    _check_no_crashes(head_node_remote_command_executor, test_datadir)
-    _check_no_crashes(login_node_remote_command_executor, test_datadir)
+    # Check for system crashes on both nodes
+    head_node_crash_report = _check_no_crashes(head_node_remote_command_executor, test_datadir)
+    login_node_crash_report = _check_no_crashes(login_node_remote_command_executor, test_datadir)
+    assert_that(head_node_crash_report).is_empty()
+    assert_that(login_node_crash_report).is_empty()
 
     # Check that logs are stored in CloudWatch as expected
     FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
@@ -139,8 +142,23 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session
 
 
 def _check_no_crashes(remote_command_executor, test_datadir):
-    """Verify no core files in /var/crash, which on ubuntu18 causes a popup when logging into the 1st session."""
-    remote_command_executor.run_remote_script(str(test_datadir / "verify_no_core_files.sh"))
+    """Collect crash files found on a node and return them as a crash report dictionary.
+
+    Runs the get_crash_report.sh script from test_datadir on the node behind
+    remote_command_executor and parses the script's stdout as JSON. The script scans
+    the crash locations of the pcluster-supported OSes (Ubuntu, AL2, AL2023, RHEL8/9,
+    Rocky8/9) and prints a JSON object mapping each crash file path to its
+    human-readable content.
+
+    This function only logs what it finds; it is up to the caller to assert on the result.
+
+    Returns an empty dict if no crashes were found, otherwise the parsed report.
+    Raises json.JSONDecodeError if the script's stdout is not valid JSON.
+    """
+    result = remote_command_executor.run_remote_script(str(test_datadir / "get_crash_report.sh"))
+    try:
+        crash_report = json.loads(result.stdout)
+    except json.JSONDecodeError:
+        # The decode error alone does not show what the script printed; log it before failing.
+        logging.error("Crash report script did not return valid JSON. Raw stdout:\n%s", result.stdout)
+        raise
+    # NOTE(review): assumes the executor exposes a `target` attribute - confirm against its class definition.
+    if crash_report:
+        logging.warning("Crash files detected on %s:", remote_command_executor.target)
+        for crash_path, content in crash_report.items():
+            logging.warning("  %s:\n%s", crash_path, content)
+    else:
+        logging.info("No crash files found on %s.", remote_command_executor.target)
+    return crash_report
 
 
 def _get_known_hosts_content(host_keys_file):
diff --git a/tests/integration-tests/tests/dcv/test_dcv/test_dcv_configuration/get_crash_report.sh b/tests/integration-tests/tests/dcv/test_dcv/test_dcv_configuration/get_crash_report.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# Scans for crash files based on the detected OS and outputs a JSON
+# dictionary: { "crash_file_path": "content", ... }
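+# Exits 0 whenever the scan completes, even if crash files were found — the caller decides whether to fail.
+# Exits 1 (without printing JSON) if the OS cannot be determined or is not supported.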
+# Log lines are emitted to stderr so they don't pollute the JSON on stdout.
+
+# Logging helpers: each writes one tagged line to stderr, keeping stdout reserved for the JSON report.
+log_info() {
+  printf '%s\n' "[get_crash_report] INFO: $*" >&2
+}
+log_error() {
+  printf '%s\n' "[get_crash_report] ERROR: $*" >&2
+}
+
+# --- JSON helper using python3 for safe escaping ---
+# Accumulates comma-separated '"path": "content"' JSON members; the script wraps
+# them in braces when it prints the final report.
+crash_entries=""
+
+# Append one crash entry to the global crash_entries accumulator.
+#   $1 - crash file path (or source name), used as the JSON key
+#   $2 - human-readable crash content, used as the JSON value
+# Exits the script with status 1 if the entry cannot be JSON-encoded.
+function add_entry() {
+  local path="$1"
+  local content="$2"
+  local entry
+  log_info "Adding crash entry: ${path}"
+  # Encode before touching the accumulator: a failed python3 call must not leave a
+  # dangling comma or an empty member behind, which would make the final JSON invalid.
+  # 'local entry' is declared separately so the exit status of python3 is not masked.
+  if ! entry="$(python3 -c "
+import json, sys
+print(json.dumps(sys.argv[1]) + ': ' + json.dumps(sys.argv[2]))
+" "${path}" "${content}")" || [ -z "${entry}" ]; then
+    # Dropping the entry would hide a detected crash from the caller, so fail loudly instead.
+    log_error "Could not JSON-encode crash entry: ${path}"
+    exit 1
+  fi
+  # ${var:+...} inserts the separating comma only when there are previous entries.
+  crash_entries="${crash_entries:+${crash_entries},}${entry}"
+}
+
+# Print a human-readable excerpt of a crash artifact to stdout.
+#   $1 - path to a crash file, or to a directory of crash fields (ABRT layout)
+# Directory: every regular text file in it except "coredump" is emitted as
+#   "<name>:" followed by its last 100 lines and a blank line.
+# Text file: its last 100 lines.
+# Anything else: the first 100 lines of its printable strings.
+function read_crash_content() {
+  local filepath="$1"
+  local field fname
+  if [ -d "${filepath}" ]; then
+    # ABRT directory: concatenate text fields
+    local content=""
+    for field in "${filepath}"/*; do
+      [ -f "${field}" ] || continue
+      fname=$(basename "${field}")
+      [ "${fname}" = "coredump" ] && continue
+      # -b drops the "<path>: " prefix, so a path that contains "text" cannot cause a false match.
+      if file -b "${field}" 2>/dev/null | grep -q "text"; then
+        # Use real newlines instead of "\n" + echo -e: echo -e would also expand
+        # backslash sequences that appear inside the crash content itself.
+        content="${content}${fname}:"$'\n'"$(tail -100 "${field}")"$'\n\n'
+      fi
+    done
+    printf '%s\n' "${content}"
+  elif file -b "${filepath}" 2>/dev/null | grep -q "text"; then
+    tail -100 "${filepath}"
+  else
+    strings "${filepath}" 2>/dev/null | head -100
+  fi
+}
+
+# Record every entry directly inside a crash directory as a crash report entry.
+#   $1 - directory to scan
+# A missing or empty directory is logged and skipped.
+function scan_directory() {
+  local crashdir="$1"
+  local files filepath content
+  log_info "Scanning directory ${crashdir}..."
+  if [ ! -d "${crashdir}" ]; then
+    log_info "Directory ${crashdir} does not exist, skipping."
+    return
+  fi
+  # The listing is used only for the emptiness check and the log line below.
+  files="$(ls -A "${crashdir}" 2>/dev/null)"
+  if [ -z "${files}" ]; then
+    log_info "No crash files found in ${crashdir}."
+    return
+  fi
+  log_info "Found files in ${crashdir}: ${files}"
+  # Iterate with globs instead of word-splitting the ls output, so names containing
+  # whitespace or glob characters stay intact. The two extra patterns pick up
+  # dotfiles (which ls -A lists) while excluding "." and "..".
+  for filepath in "${crashdir}"/* "${crashdir}"/.[!.]* "${crashdir}"/..?*; do
+    # A pattern with no match is left as a literal string; skip it.
+    [ -e "${filepath}" ] || [ -L "${filepath}" ] || continue
+    log_info "Reading crash file: ${filepath}"
+    content=$(read_crash_content "${filepath}")
+    add_entry "${filepath}" "${content}"
+  done
+}
+
+# Record systemd-coredump entries, if any, as a single crash report entry keyed "coredumpctl".
+# Skipped when coredumpctl is not installed or when it lists no dumps.
+function scan_coredumpctl() {
+  # Keep scratch variables out of the global scope, like the other functions in this script.
+  local dump_list content
+  log_info "Checking coredumpctl for systemd-coredump entries..."
+  if ! command -v coredumpctl > /dev/null 2>&1; then
+    log_info "coredumpctl not found, skipping."
+    return
+  fi
+  dump_list=$(coredumpctl list --no-pager --no-legend 2>/dev/null)
+  if [ -z "${dump_list}" ]; then
+    log_info "No coredump entries found via coredumpctl."
+    return
+  fi
+  log_info "Found coredump entries via coredumpctl."
+  # Only the first 100 lines of the details are kept in the report.
+  content=$(coredumpctl info --no-pager 2>/dev/null | head -100)
+  add_entry "coredumpctl" "${content}"
+}
+
+# --- Detect OS from /etc/os-release ---
+log_info "Reading /etc/os-release..."
+if [ -f /etc/os-release ]; then
+  # Sourcing the file defines ID and VERSION_ID (used below) in this shell.
+  . /etc/os-release
+else
+  log_error "/etc/os-release not found. Cannot determine OS."
+  exit 1
+fi
+
+if [ -z "${ID}" ]; then
+  log_error "Could not determine OS from /etc/os-release (ID is empty)."
+  exit 1
+fi
+
+log_info "Detected OS: ID=${ID}, VERSION_ID=${VERSION_ID}"
+
+# Abort on an OS or OS version that has no crash scan strategy.
+# Printing "{}" instead would be indistinguishable from "no crashes found".
+function fail_unsupported_os() {
+  log_error "Unsupported OS '${ID}' (VERSION_ID='${VERSION_ID}'). Cannot determine crash file locations."
+  exit 1
+}
+
+# --- Select crash scan logic based on OS ---
+log_info "Selecting crash scan strategy for OS '${ID}' version '${VERSION_ID}'..."
+case "${ID}" in
+  ubuntu)
+    log_info "Using Apport strategy (Ubuntu)."
+    scan_directory /var/crash
+    ;;
+  amzn)
+    case "${VERSION_ID}" in
+      2)
+        log_info "Using ABRT strategy (Amazon Linux 2)."
+        scan_directory /var/spool/abrt
+        ;;
+      2023)
+        log_info "Using systemd-coredump strategy (Amazon Linux 2023)."
+        scan_coredumpctl
+        ;;
+      *)
+        fail_unsupported_os
+        ;;
+    esac
+    ;;
+  rhel)
+    # ${VERSION_ID%%.*} keeps only the major version, e.g. "8.10" -> "8".
+    case "${VERSION_ID%%.*}" in
+      8)
+        log_info "Using ABRT strategy (RHEL 8)."
+        scan_directory /var/spool/abrt
+        ;;
+      9)
+        log_info "Using systemd-coredump strategy (RHEL 9)."
+        scan_coredumpctl
+        ;;
+      *)
+        fail_unsupported_os
+        ;;
+    esac
+    ;;
+  rocky)
+    case "${VERSION_ID%%.*}" in
+      8)
+        log_info "Using ABRT strategy (Rocky 8)."
+        scan_directory /var/spool/abrt
+        ;;
+      9)
+        log_info "Using systemd-coredump strategy (Rocky 9)."
+        scan_coredumpctl
+        ;;
+      *)
+        fail_unsupported_os
+        ;;
+    esac
+    ;;
+  *)
+    fail_unsupported_os
+    ;;
+esac
+
+log_info "Crash scan complete."
+# Wrap the accumulated members into a single JSON object; "{}" means no crashes were found.
+echo "{${crash_entries}}"
+exit 0