Skip to content

Commit b89788c

Browse files
committed
[Test] Improve stability, debuggability and coverage of test_dcv_configuration:
* debuggability: retrieve, print and analyze a comprehensive report of crashes (not only the crash filename, but the stack trace of the crash). Also, moved from hard assertions to soft assertions to have a final report of all the observed failures. * stability: prevent false positive failures, by ignoring harmless crashes related to gnome, unrelated to nvidia or dcv. Also fixed a gap that was causing failures when multiple instances of this test are executed in parallel by serializing the modifications to ssh known_hosts. * coverage: the test is now able to detect crashes on all supported OSs, not only Ubuntu.
1 parent f06ab20 commit b89788c

3 files changed

Lines changed: 283 additions & 56 deletions

File tree

tests/integration-tests/tests/dcv/test_dcv.py

Lines changed: 129 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
# or in the "LICENSE.txt" file accompanying this file.
1010
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111
# See the License for the specific language governing permissions and limitations under the License.
12+
import contextlib
13+
import fcntl
14+
import json
1215
import logging
1316
import os as operating_system
1417
import re
@@ -35,6 +38,18 @@
3538
SERVER_URL = "https://localhost"
3639
DCV_CONNECT_SCRIPT = "/opt/parallelcluster/scripts/pcluster_dcv_connect.sh"
3740

41+
# Crashes matching any of these patterns are never tolerated, regardless of TOLERATED_CRASH_PATTERNS.
# They take precedence: a crash mentioning dcv or nvidia must fail the test even if it also
# matches a tolerated pattern below.
UNTOLERATED_CRASH_PATTERNS = [
    re.compile(r"dcv|nvidia", re.IGNORECASE),
]

# Tolerated crash patterns: list of regex patterns.
# A crash is tolerated if it is unrelated to DCV and the software stack owned by ParallelCluster.
TOLERATED_CRASH_PATTERNS = [
    # gnome-software segfaults in libadwaita related to animated scrolling of UI widget, observed on RHEL9/Rocky9
    # DOTALL lets ".*" span the multiple report lines between the process name and the stack frame.
    re.compile(r"gnome-software.*scroll_to \(libadwaita", re.DOTALL),
]
52+
3853

3954
def test_dcv_configuration(region, instance, os, scheduler, pcluster_config_reader, clusters_factory, test_datadir):
4055
host_ip = get_local_ip()
@@ -78,36 +93,67 @@ def _test_dcv_configuration(
7893
head_node_remote_command_executor = RemoteCommandExecutor(cluster)
7994
login_node_remote_command_executor = RemoteCommandExecutor(cluster, use_login_node=True)
8095

81-
# check configuration parameters of the head and login nodes
82-
check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from)
83-
check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from, login_pool_name="pool")
84-
8596
shared_dir = f"/home/{get_username_for_os(os)}"
8697

87-
# test dcv connect show url for head and login node
88-
_test_show_url(cluster, region, dcv_port, access_from)
89-
_test_show_url(cluster, region, dcv_port, access_from, use_login_node=True)
90-
91-
# launch a session and verify the authenticator works
92-
_test_authenticator(head_node_remote_command_executor, dcv_authenticator_port, shared_dir, os)
93-
_test_authenticator(login_node_remote_command_executor, dcv_authenticator_port, shared_dir, os)
94-
95-
# check error cases
96-
_check_error_cases(head_node_remote_command_executor, dcv_authenticator_port)
97-
_check_error_cases(login_node_remote_command_executor, dcv_authenticator_port)
98-
99-
# check shared dir configuration
100-
_check_shared_dir(head_node_remote_command_executor, shared_dir)
101-
_check_shared_dir(login_node_remote_command_executor, shared_dir)
102-
103-
# Ensure no system programs crashed
104-
_check_no_crashes(head_node_remote_command_executor, test_datadir)
105-
_check_no_crashes(login_node_remote_command_executor, test_datadir)
106-
107-
# Check that logs are stored in CloudWatch as expected
108-
FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
109-
cluster, scheduler, os, "dcv_enabled", region, shared_dir
110-
)
98+
checks = [
99+
(
100+
"check_node_security_group (head node)",
101+
lambda: check_node_security_group(region, cluster, dcv_port, expected_cidr=access_from),
102+
),
103+
(
104+
"check_node_security_group (login node)",
105+
lambda: check_node_security_group(
106+
region, cluster, dcv_port, expected_cidr=access_from, login_pool_name="pool"
107+
),
108+
),
109+
("dcv connect show url (head node)", lambda: _test_show_url(cluster, region, dcv_port, access_from)),
110+
(
111+
"dcv connect show url (login node)",
112+
lambda: _test_show_url(cluster, region, dcv_port, access_from, use_login_node=True),
113+
),
114+
(
115+
"authenticator (head node)",
116+
lambda: _test_authenticator(head_node_remote_command_executor, dcv_authenticator_port, shared_dir, os),
117+
),
118+
(
119+
"authenticator (login node)",
120+
lambda: _test_authenticator(login_node_remote_command_executor, dcv_authenticator_port, shared_dir, os),
121+
),
122+
(
123+
"error cases (head node)",
124+
lambda: _check_error_cases(head_node_remote_command_executor, dcv_authenticator_port),
125+
),
126+
(
127+
"error cases (login node)",
128+
lambda: _check_error_cases(login_node_remote_command_executor, dcv_authenticator_port),
129+
),
130+
("shared dir (head node)", lambda: _check_shared_dir(head_node_remote_command_executor, shared_dir)),
131+
("shared dir (login node)", lambda: _check_shared_dir(login_node_remote_command_executor, shared_dir)),
132+
("no crashes (head node)", lambda: _assert_no_crashes(head_node_remote_command_executor, test_datadir)),
133+
("no crashes (login node)", lambda: _assert_no_crashes(login_node_remote_command_executor, test_datadir)),
134+
(
135+
"cloudwatch logs",
136+
lambda: FeatureSpecificCloudWatchLoggingTestRunner.run_tests_for_feature(
137+
cluster, scheduler, os, "dcv_enabled", region, shared_dir
138+
),
139+
),
140+
]
141+
142+
failures = []
143+
for check_name, check_fn in checks:
144+
try:
145+
check_fn()
146+
except Exception as e:
147+
logging.error("Soft assertion failed for '%s': %s", check_name, e)
148+
failures.append(f"{check_name}: {e}")
149+
150+
if failures:
151+
formatted = []
152+
for i, f in enumerate(failures):
153+
# Unescape literal \n and \t sequences so the output is human-readable
154+
readable = f.replace("\\n", "\n").replace("\\t", "\t")
155+
formatted.append(f" [{i+1}] {readable}")
156+
pytest.fail(f"{len(failures)} DCV configuration check(s) failed:\n" + "\n".join(formatted))
111157

112158

113159
def _check_auth_ko(remote_command_executor, dcv_authenticator_port, params, expected_message):
@@ -138,9 +184,37 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session
138184
).is_equal_to('<auth result="yes"><username>{0}</username></auth>'.format(username))
139185

140186

141-
def _check_no_crashes(remote_command_executor, test_datadir):
142-
"""Verify no core files in /var/crash, which on ubuntu18 causes a popup when logging into the 1st session."""
143-
remote_command_executor.run_remote_script(str(test_datadir / "verify_no_core_files.sh"))
187+
def _get_crash_report(remote_command_executor, test_datadir):
    """Collect crash information from the node as a dictionary.

    Executes a remote script that inspects the crash locations used by every
    pcluster-supported OS (Ubuntu, AL2, AL2023, RHEL8/9, Rocky8/9) and prints
    a JSON object mapping each crash file path to its human-readable content.

    Returns an empty dict when the node has no recorded crashes.
    """
    script_path = str(test_datadir / "get_crash_report.sh")
    # pty=False keeps stdout clean so it can be parsed as JSON.
    execution_result = remote_command_executor.run_remote_script(script_path, pty=False)
    return json.loads(execution_result.stdout)
198+
199+
200+
def _is_tolerated_crash(content):
    """Return True when the crash content matches a tolerated pattern.

    Untolerated patterns take precedence: content matching any of them is
    rejected even if it also matches a tolerated pattern.
    """
    if any(pattern.search(content) for pattern in UNTOLERATED_CRASH_PATTERNS):
        return False
    return any(pattern.search(content) for pattern in TOLERATED_CRASH_PATTERNS)
209+
210+
211+
def _assert_no_crashes(remote_command_executor, test_datadir):
    """Get crash report, log all crashes, and fail only on non-tolerated ones.

    Every crash found on the node is logged (tolerated ones included) so the
    test output always carries the full report for debugging; the assertion
    fails only when at least one crash does not match a tolerated pattern.
    """
    crash_report = _get_crash_report(remote_command_executor, test_datadir)
    if crash_report:
        logging.warning("Crash report for %s:\n%s", remote_command_executor.target, json.dumps(crash_report, indent=2))
        untolerated = {path: content for path, content in crash_report.items() if not _is_tolerated_crash(content)}
        # described_as makes the failure message list the offending crash files,
        # instead of the bare "expected empty" produced by is_empty() alone.
        assert_that(untolerated).described_as(
            f"Untolerated crashes found on {remote_command_executor.target}: {sorted(untolerated)}"
        ).is_empty()
144218

145219

146220
def _get_known_hosts_content(host_keys_file):
@@ -153,6 +227,7 @@ def _get_known_hosts_content(host_keys_file):
153227

154228
def _check_error_cases(remote_command_executor, dcv_authenticator_port):
155229
"""Check DCV errors for both head and login nodes."""
230+
logging.info(f"Checking expected authentication failure on {remote_command_executor.target}")
156231
_check_auth_ko(
157232
remote_command_executor,
158233
dcv_authenticator_port,
@@ -165,6 +240,20 @@ def _check_error_cases(remote_command_executor, dcv_authenticator_port):
165240
_check_auth_ko(
166241
remote_command_executor, dcv_authenticator_port, "-d action=requestToken -d authUser=centos", "Wrong parameters"
167242
)
243+
logging.info(f"Completed checks for authentication failure on {remote_command_executor.target}")
244+
245+
246+
@contextlib.contextmanager
def _temporary_known_host(hostname, host_keys_file, env):
    """Temporarily register SSH host keys for *hostname* in *host_keys_file*.

    Keys are added on entry and removed on exit, even on error. An exclusive
    flock on a sibling ".lock" file serializes concurrent modifications of the
    same known_hosts file across test processes; the lock is released
    automatically when the lock file handle is closed.
    """
    with open(f"{host_keys_file}.lock", "w") as lock_handle:
        fcntl.flock(lock_handle, fcntl.LOCK_EX)
        try:
            add_keys_to_known_hosts(hostname, host_keys_file)
            yield
        finally:
            # Clean up even if adding the keys (or the caller's body) failed.
            remove_keys_from_known_hosts(hostname, host_keys_file, env=env)
168257

169258

170259
def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False): # noqa: C901
@@ -174,7 +263,6 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
174263

175264
node_ip = cluster.get_login_node_public_ip() if use_login_node else cluster.head_node_ip
176265

177-
# add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt
178266
# Ensure known_hosts path exists to avoid `cat` command returning non-zero exit when testing in ADC region.
179267
host_keys_file = operating_system.path.expanduser("~/.ssh/known_hosts")
180268
host_keys_path = Path(host_keys_file)
@@ -186,18 +274,18 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
186274
except Exception as e:
187275
logging.warning(f"Failed to prepare known_hosts file {host_keys_file}: {e}")
188276

189-
add_keys_to_known_hosts(node_ip, host_keys_file)
190-
191277
dcv_connect_args = ["pcluster", "dcv-connect", "--cluster-name", cluster.name, "--show-url"]
192278

193279
if use_login_node:
194280
dcv_connect_args.extend(["--login-node-ip", node_ip])
195281

196-
try:
197-
result = run_pcluster_command(dcv_connect_args, env=env)
198-
finally:
199-
# remove ssh key from jenkins user known hosts file
200-
remove_keys_from_known_hosts(node_ip, host_keys_file, env=env)
282+
with _temporary_known_host(node_ip, host_keys_file, env):
283+
try:
284+
result = run_pcluster_command(dcv_connect_args, env=env)
285+
except subprocess.CalledProcessError as e:
286+
raise AssertionError(
287+
f"Command {e.cmd} failed (exit {e.returncode}).\nstderr: {e.stderr}\nstdout: {e.stdout}"
288+
) from e
201289

202290
assert_that(result.stdout).matches(
203291
r"Please use the following one-time URL in your browser within 30 seconds:\n"
@@ -221,9 +309,8 @@ def _test_authenticator(remote_command_executor, dcv_authenticator_port, shared_
221309
dcv_session_token = dcv_parameters.group(3)
222310
_check_auth_ok(remote_command_executor, dcv_authenticator_port, dcv_session_id, dcv_session_token, os)
223311
else:
224-
print(
312+
assert_that(dcv_parameters).described_as(
225313
"Command '{0} {1}' fails, output: {2}, error: {3}".format(
226314
DCV_CONNECT_SCRIPT, shared_dir, command_execution.stdout, command_execution.stderr
227315
)
228-
)
229-
raise AssertionError
316+
).is_not_none()
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
#!/bin/bash
2+
# Scans for crash files based on the detected OS and outputs a JSON
3+
# dictionary: { "crash_file_path": "content", ... }
4+
# Always exits 0 — the caller decides whether to fail.
5+
# Log lines are emitted to stderr so they don't pollute the JSON on stdout.
6+
7+
SCRIPT_NAME=$(basename "$0")

# Log helpers: all diagnostics go to stderr so stdout stays valid JSON.
function log_info() { echo "[${SCRIPT_NAME}] INFO: $*" >&2; }

function log_error() { echo "[${SCRIPT_NAME}] ERROR: $*" >&2; }

# --- JSON helper using python3 for safe escaping ---
# Accumulates comma-separated '"path": "content"' pairs; the script wraps them
# in braces when printing the final JSON object.
crash_entries=""

# add_entry PATH CONTENT
# Appends one JSON key/value pair to crash_entries. python3's json.dumps is
# used so arbitrary content (quotes, newlines, non-ASCII) is escaped correctly.
function add_entry() {
    local path="$1"
    local content="$2"
    log_info "Adding crash entry: ${path}"
    if [ -n "${crash_entries}" ]; then
        crash_entries="${crash_entries},"
    fi
    crash_entries="${crash_entries}$(python3 -c "
import json, sys
print(json.dumps(sys.argv[1]) + ': ' + json.dumps(sys.argv[2]))
" "${path}" "${content}")"
}
28+
29+
# read_crash_content FILEPATH
# Prints a human-readable excerpt of a crash file, or of an ABRT crash
# directory (one subdirectory per crash, containing per-field files).
function read_crash_content() {
    local filepath="$1"
    if [ -d "${filepath}" ]; then
        # ABRT directory: concatenate the text fields, skipping the binary coredump.
        local content=""
        for field in "${filepath}"/*; do
            [ -f "${field}" ] || continue
            fname=$(basename "${field}")
            [ "${fname}" = "coredump" ] && continue
            if file "${field}" 2>/dev/null | grep -q "text"; then
                content="${content}${fname}:\n$(tail -100 "${field}")\n\n"
            fi
        done
        # -e expands the literal \n separators accumulated above.
        echo -e "${content}"
    elif file "${filepath}" 2>/dev/null | grep -q "text"; then
        # Plain-text crash report: the last 100 lines are enough for a stack trace.
        tail -100 "${filepath}"
    else
        # Binary file (e.g. a raw core): fall back to its printable strings.
        strings "${filepath}" 2>/dev/null | head -100
    fi
}
49+
50+
# scan_directory CRASHDIR
# Adds one crash entry per file/subdirectory found in CRASHDIR (used for
# Apport's /var/crash and ABRT's /var/spool/abrt). No-op if the directory is
# missing or empty.
function scan_directory() {
    local crashdir="$1"
    log_info "Scanning directory ${crashdir}..."
    if [ ! -d "${crashdir}" ]; then
        log_info "Directory ${crashdir} does not exist, skipping."
        return
    fi
    files="$(ls -A "${crashdir}" 2>/dev/null)"
    if [ -z "${files}" ]; then
        log_info "No crash files found in ${crashdir}."
        return
    fi
    log_info "Found files in ${crashdir}: ${files}"
    # Iterate with find instead of word-splitting ${files}: crash file names may
    # contain spaces, which would previously split one path into several bogus
    # entries. Process substitution keeps the loop in the current shell so
    # add_entry's updates to crash_entries are preserved.
    while IFS= read -r filepath; do
        log_info "Reading crash file: ${filepath}"
        content=$(read_crash_content "${filepath}")
        add_entry "${filepath}" "${content}"
    done < <(find "${crashdir}" -mindepth 1 -maxdepth 1 2>/dev/null)
}
70+
71+
# scan_coredumpctl
# Adds a single "coredumpctl" crash entry when systemd-coredump has recorded
# any crashes. No-op if coredumpctl is unavailable or the journal is empty.
function scan_coredumpctl() {
    log_info "Checking coredumpctl for systemd-coredump entries..."
    if ! command -v coredumpctl > /dev/null 2>&1; then
        log_info "coredumpctl not found, skipping."
        return
    fi
    dump_list=$(coredumpctl list --no-pager --no-legend 2>/dev/null)
    if [ -z "${dump_list}" ]; then
        log_info "No coredump entries found via coredumpctl."
        return
    fi
    log_info "Found coredump entries via coredumpctl."
    # Cap the detailed info output so a flood of dumps can't bloat the JSON.
    content=$(coredumpctl info --no-pager 2>/dev/null | head -100)
    add_entry "coredumpctl" "${content}"
}
86+
87+
# --- Detect OS from /etc/os-release ---
88+
# --- Detect OS from /etc/os-release ---
log_info "Reading /etc/os-release..."
if [ -f /etc/os-release ]; then
    # Sourcing os-release defines ID and VERSION_ID.
    . /etc/os-release
else
    log_error "/etc/os-release not found. Cannot determine OS."
    exit 1
fi

if [ -z "${ID}" ]; then
    log_error "Could not determine OS from /etc/os-release (ID is empty)."
    exit 1
fi

log_info "Detected OS: ID=${ID}, VERSION_ID=${VERSION_ID}"

# --- Select crash scan logic based on OS ---
# Each unsupported OS/version fails loudly: silently reporting "no crashes"
# on an OS with an unknown crash-collection mechanism would be a false negative.
log_info "Selecting crash scan strategy for OS '${ID}' version '${VERSION_ID}'..."
case "${ID}" in
    ubuntu)
        log_info "Using Apport strategy (Ubuntu)."
        scan_directory /var/crash
        ;;
    amzn)
        case "${VERSION_ID}" in
            2)
                log_info "Using ABRT strategy (Amazon Linux 2)."
                scan_directory /var/spool/abrt
                ;;
            2023)
                log_info "Using systemd-coredump strategy (Amazon Linux 2023)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported Amazon Linux version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    rhel)
        case "${VERSION_ID%%.*}" in
            8)
                log_info "Using ABRT strategy (RHEL 8)."
                scan_directory /var/spool/abrt
                ;;
            9)
                log_info "Using systemd-coredump strategy (RHEL 9)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported RHEL version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    rocky)
        case "${VERSION_ID%%.*}" in
            8)
                log_info "Using ABRT strategy (Rocky 8)."
                scan_directory /var/spool/abrt
                ;;
            9)
                log_info "Using systemd-coredump strategy (Rocky 9)."
                scan_coredumpctl
                ;;
            *)
                log_error "Unsupported Rocky version '${VERSION_ID}'. Cannot determine crash file locations."
                exit 1
                ;;
        esac
        ;;
    *)
        log_error "Unsupported OS '${ID}' (VERSION_ID='${VERSION_ID}'). Cannot determine crash file locations."
        exit 1
        ;;
esac

log_info "Crash scan complete."
# Always exit 0 on a successful scan; the caller interprets the JSON.
echo "{${crash_entries}}"
exit 0

0 commit comments

Comments
 (0)