Skip to content

Commit 9816039

Browse files
committed
DRAFT [Test] Make test_dcv_configuration able to print out the content of crash files if detected.
1 parent c9ad85c commit 9816039

2 files changed

Lines changed: 174 additions & 5 deletions

File tree

tests/integration-tests/tests/dcv/test_dcv.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
# or in the "LICENSE.txt" file accompanying this file.
1010
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111
# See the License for the specific language governing permissions and limitations under the License.
12+
import json
1213
import logging
1314
import os as operating_system
1415
import re
@@ -100,9 +101,11 @@ def _test_dcv_configuration(
100101
_check_shared_dir(head_node_remote_command_executor, shared_dir)
101102
_check_shared_dir(login_node_remote_command_executor, shared_dir)
102103

103-
# Ensure no system programs crashed
104-
_check_no_crashes(head_node_remote_command_executor, test_datadir)
105-
_check_no_crashes(login_node_remote_command_executor, test_datadir)
104+
# Check for system crashes on both nodes
105+
head_node_crash_report = _check_no_crashes(head_node_remote_command_executor, test_datadir)
106+
login_node_crash_report = _check_no_crashes(login_node_remote_command_executor, test_datadir)
107+
assert_that(head_node_crash_report).is_empty()
108+
assert_that(login_node_crash_report).is_empty()
106109

107110
# Check that logs are stored in CloudWatch as expected
108111
FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
@@ -139,8 +142,23 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session
139142

140143

141144
def _check_no_crashes(remote_command_executor, test_datadir):
    """Collect crash files from a node and return them as a crash report.

    Runs the ``get_crash_report.sh`` script found in ``test_datadir`` through
    ``remote_command_executor`` and parses the script's stdout as JSON. The
    script is expected to print a JSON object mapping each crash file path to
    a human-readable excerpt of its content.

    This helper only logs what it finds; it does not assert. The caller decides
    whether a non-empty report is a failure.

    Returns the parsed report: an empty dict when no crash was found, otherwise
    the mapping of crash file path to content (each entry is logged as a warning).

    Raises json.JSONDecodeError if stdout is not valid JSON. The raw stdout and
    the node are logged first so the failure can be diagnosed from the test log.
    """
    result = remote_command_executor.run_remote_script(str(test_datadir / "get_crash_report.sh"))
    target = remote_command_executor.target

    try:
        crash_report = json.loads(result.stdout)
    except json.JSONDecodeError:
        # A bare JSONDecodeError says nothing about which node produced the bad
        # output or what the output was, so record both before propagating.
        logging.error("Could not parse the crash report from %s as JSON. Raw stdout: %r", target, result.stdout)
        raise

    if not crash_report:
        logging.info("No crash files found on %s.", target)
        return crash_report

    logging.warning("Crash files detected on %s:", target)
    for crash_path, content in crash_report.items():
        logging.warning(" %s:\n%s", crash_path, content)
    return crash_report
144162

145163

146164
def _get_known_hosts_content(host_keys_file):
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/bin/bash
2+
# Scans for crash files based on the detected OS and outputs a JSON
3+
# dictionary: { "crash_file_path": "content", ... }
4+
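# Exits 0 with the JSON report on stdout once the scan completes; the caller decides whether detected crashes are a failure. Exits 1 without printing JSON if the OS cannot be determined or is unsupported.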
5+
# Log lines are emitted to stderr so they don't pollute the JSON on stdout.
6+
7+
# Logging helpers. Both write one prefixed line to stderr, which keeps stdout
# reserved for the JSON report printed at the end of the script.
log_info() {
    printf '%s\n' "[get_crash_report] INFO: $*" >&2
}

log_error() {
    printf '%s\n' "[get_crash_report] ERROR: $*" >&2
}
9+
10+
# --- JSON helper using python3 for safe escaping ---
11+
crash_entries=""
12+
13+
# Append one `"path": "content"` pair to the global crash_entries string.
#   $1 - crash file path (used as the JSON key)
#   $2 - human-readable content (used as the JSON value)
# python3 performs the JSON string escaping; pairs are comma-separated so the
# accumulated string can be wrapped in braces to form a JSON object.
function add_entry() {
    local path="$1"
    local content="$2"
    local entry
    log_info "Adding crash entry: ${path}"
    entry="$(python3 -c "
import json, sys
print(json.dumps(sys.argv[1]) + ': ' + json.dumps(sys.argv[2]))
" "${path}" "${content}")"
    # ${var:+...} emits the existing entries plus a separating comma only when
    # something has already been accumulated.
    crash_entries="${crash_entries:+${crash_entries},}${entry}"
}
25+
26+
# Print a human-readable excerpt of a crash artifact on stdout.
#   $1 - path to a crash file, or to a directory of crash fields (ABRT layout)
# Directory: for every regular file inside it that `file` reports as text
#            (skipping the one named "coredump"), print "<name>:" followed by
#            the last 100 lines of that file and a blank line.
# Text file: print its last 100 lines.
# Otherwise: print the first 100 printable strings extracted by `strings`.
function read_crash_content() {
    local filepath="$1"
    local content="" field fname
    if [ -d "${filepath}" ]; then
        # ABRT directory: concatenate text fields
        for field in "${filepath}"/*; do
            [ -f "${field}" ] || continue
            fname=$(basename "${field}")
            [ "${fname}" = "coredump" ] && continue
            # -b drops the file name from the output of `file`, so a path that
            # merely contains the word "text" is not mistaken for a text file.
            if file -b "${field}" 2>/dev/null | grep -q "text"; then
                # Real newlines are appended instead of "\n" escapes so the
                # result can be printed verbatim below.
                content+="${fname}:"$'\n'"$(tail -n 100 "${field}")"$'\n\n'
            fi
        done
        # printf '%s' leaves backslashes in the crash data untouched; `echo -e`
        # would interpret them (e.g. "\c" silently truncates the output).
        printf '%s\n' "${content}"
    elif file -b "${filepath}" 2>/dev/null | grep -q "text"; then
        tail -n 100 "${filepath}"
    else
        strings "${filepath}" 2>/dev/null | head -n 100
    fi
}
46+
47+
# Record every entry found directly inside a crash directory.
#   $1 - directory to scan (e.g. /var/crash or /var/spool/abrt)
# Each entry (file or sub-directory, hidden ones included) is passed to
# read_crash_content and the result is stored with add_entry, keyed by the
# entry's full path. A missing or empty directory is logged and skipped.
function scan_directory() {
    local crashdir="$1"
    local filepath content
    local -a crash_paths=()
    log_info "Scanning directory ${crashdir}..."
    if [ ! -d "${crashdir}" ]; then
        log_info "Directory ${crashdir} does not exist, skipping."
        return
    fi
    # NUL-delimited listing keeps names containing spaces, newlines or glob
    # characters intact; iterating over unquoted `ls` output would split them.
    # Process substitution keeps the loop in the current shell so the array
    # (and later the global crash_entries) is not lost in a subshell.
    while IFS= read -r -d '' filepath; do
        crash_paths+=("${filepath}")
    done < <(find "${crashdir}" -mindepth 1 -maxdepth 1 -print0 2>/dev/null | sort -z)
    if [ "${#crash_paths[@]}" -eq 0 ]; then
        log_info "No crash files found in ${crashdir}."
        return
    fi
    # ${array[*]##*/} strips the directory part from every element for logging.
    log_info "Found files in ${crashdir}: ${crash_paths[*]##*/}"
    for filepath in "${crash_paths[@]}"; do
        log_info "Reading crash file: ${filepath}"
        content=$(read_crash_content "${filepath}")
        add_entry "${filepath}" "${content}"
    done
}
67+
68+
# Record core dumps known to systemd-coredump.
# Does nothing (beyond logging) when coredumpctl is not installed or lists no
# entries. Otherwise stores the first 100 lines of `coredumpctl info` as a
# single report entry under the key "coredumpctl".
function scan_coredumpctl() {
    log_info "Checking coredumpctl for systemd-coredump entries..."
    command -v coredumpctl > /dev/null 2>&1 || {
        log_info "coredumpctl not found, skipping."
        return
    }
    dump_list=$(coredumpctl list --no-pager --no-legend 2>/dev/null)
    [ -n "${dump_list}" ] || {
        log_info "No coredump entries found via coredumpctl."
        return
    }
    log_info "Found coredump entries via coredumpctl."
    content=$(coredumpctl info --no-pager 2>/dev/null | head -100)
    add_entry "coredumpctl" "${content}"
}
83+
84+
# --- Detect OS from /etc/os-release ---
log_info "Reading /etc/os-release..."
if [ ! -f /etc/os-release ]; then
    log_error "/etc/os-release not found. Cannot determine OS."
    exit 1
fi
# Sourcing the file is expected to define ID and VERSION_ID, which drive the
# strategy selection further down.
. /etc/os-release

[ -n "${ID}" ] || {
    log_error "Could not determine OS from /etc/os-release (ID is empty)."
    exit 1
}

log_info "Detected OS: ID=${ID}, VERSION_ID=${VERSION_ID}"
99+
100+
# --- Select crash scan logic based on OS ---
101+
log_info "Selecting crash scan strategy for OS '${ID}' version '${VERSION_ID}'..."
102+
case "${ID}" in
103+
ubuntu)
104+
log_info "Using Apport strategy (Ubuntu)."
105+
scan_directory /var/crash
106+
;;
107+
amzn)
108+
case "${VERSION_ID}" in
109+
2)
110+
log_info "Using ABRT strategy (Amazon Linux 2)."
111+
scan_directory /var/spool/abrt
112+
;;
113+
2023)
114+
log_info "Using systemd-coredump strategy (Amazon Linux 2023)."
115+
scan_coredumpctl
116+
;;
117+
esac
118+
;;
119+
rhel)
120+
case "${VERSION_ID%%.*}" in
121+
8)
122+
log_info "Using ABRT strategy (RHEL 8)."
123+
scan_directory /var/spool/abrt
124+
;;
125+
9)
126+
log_info "Using systemd-coredump strategy (RHEL 9)."
127+
scan_coredumpctl
128+
;;
129+
esac
130+
;;
131+
rocky)
132+
case "${VERSION_ID%%.*}" in
133+
8)
134+
log_info "Using ABRT strategy (Rocky 8)."
135+
scan_directory /var/spool/abrt
136+
;;
137+
9)
138+
log_info "Using systemd-coredump strategy (Rocky 9)."
139+
scan_coredumpctl
140+
;;
141+
esac
142+
;;
143+
*)
144+
log_error "Unsupported OS '${ID}' (VERSION_ID='${VERSION_ID}'). Cannot determine crash file locations."
145+
exit 1
146+
;;
147+
esac
148+
149+
log_info "Crash scan complete."
150+
echo "{${crash_entries}}"
151+
exit 0

0 commit comments

Comments
 (0)