99# or in the "LICENSE.txt" file accompanying this file.
1010# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
1111# See the License for the specific language governing permissions and limitations under the License.
12+ import contextlib
13+ import fcntl
14+ import json
1215import logging
1316import os as operating_system
1417import re
3538SERVER_URL = "https://localhost"
3639DCV_CONNECT_SCRIPT = "/opt/parallelcluster/scripts/pcluster_dcv_connect.sh"
3740
# Patterns that always turn a crash into a failure: a crash whose content mentions DCV or NVIDIA
# is never tolerated, even when it also matches an entry of TOLERATED_CRASH_PATTERNS.
UNTOLERATED_CRASH_PATTERNS = [
    re.compile(r"dcv|nvidia", flags=re.IGNORECASE),
]

# Regex patterns describing crashes that may be ignored.
# A crash qualifies only when it is unrelated to DCV and to the software stack owned by ParallelCluster.
TOLERATED_CRASH_PATTERNS = [
    # Segfault of gnome-software inside libadwaita (animated scrolling of a UI widget), seen on RHEL9/Rocky9.
    re.compile(r"gnome-software.*scroll_to \(libadwaita", flags=re.DOTALL),
]
52+
3853
3954def test_dcv_configuration (region , instance , os , scheduler , pcluster_config_reader , clusters_factory , test_datadir ):
4055 host_ip = get_local_ip ()
@@ -78,36 +93,67 @@ def _test_dcv_configuration(
7893 head_node_remote_command_executor = RemoteCommandExecutor (cluster )
7994 login_node_remote_command_executor = RemoteCommandExecutor (cluster , use_login_node = True )
8095
81- # check configuration parameters of the head and login nodes
82- check_node_security_group (region , cluster , dcv_port , expected_cidr = access_from )
83- check_node_security_group (region , cluster , dcv_port , expected_cidr = access_from , login_pool_name = "pool" )
84-
8596 shared_dir = f"/home/{ get_username_for_os (os )} "
8697
87- # test dcv connect show url for head and login node
88- _test_show_url (cluster , region , dcv_port , access_from )
89- _test_show_url (cluster , region , dcv_port , access_from , use_login_node = True )
90-
91- # launch a session and verify the authenticator works
92- _test_authenticator (head_node_remote_command_executor , dcv_authenticator_port , shared_dir , os )
93- _test_authenticator (login_node_remote_command_executor , dcv_authenticator_port , shared_dir , os )
94-
95- # check error cases
96- _check_error_cases (head_node_remote_command_executor , dcv_authenticator_port )
97- _check_error_cases (login_node_remote_command_executor , dcv_authenticator_port )
98-
99- # check shared dir configuration
100- _check_shared_dir (head_node_remote_command_executor , shared_dir )
101- _check_shared_dir (login_node_remote_command_executor , shared_dir )
102-
103- # Ensure no system programs crashed
104- _check_no_crashes (head_node_remote_command_executor , test_datadir )
105- _check_no_crashes (login_node_remote_command_executor , test_datadir )
106-
107- # Check that logs are stored in CloudWatch as expected
108- FeatureSpecificCloudWatchLoggingTestRunner .run_tests_for_feature (
109- cluster , scheduler , os , "dcv_enabled" , region , shared_dir
110- )
98+ checks = [
99+ (
100+ "check_node_security_group (head node)" ,
101+ lambda : check_node_security_group (region , cluster , dcv_port , expected_cidr = access_from ),
102+ ),
103+ (
104+ "check_node_security_group (login node)" ,
105+ lambda : check_node_security_group (
106+ region , cluster , dcv_port , expected_cidr = access_from , login_pool_name = "pool"
107+ ),
108+ ),
109+ ("dcv connect show url (head node)" , lambda : _test_show_url (cluster , region , dcv_port , access_from )),
110+ (
111+ "dcv connect show url (login node)" ,
112+ lambda : _test_show_url (cluster , region , dcv_port , access_from , use_login_node = True ),
113+ ),
114+ (
115+ "authenticator (head node)" ,
116+ lambda : _test_authenticator (head_node_remote_command_executor , dcv_authenticator_port , shared_dir , os ),
117+ ),
118+ (
119+ "authenticator (login node)" ,
120+ lambda : _test_authenticator (login_node_remote_command_executor , dcv_authenticator_port , shared_dir , os ),
121+ ),
122+ (
123+ "error cases (head node)" ,
124+ lambda : _check_error_cases (head_node_remote_command_executor , dcv_authenticator_port ),
125+ ),
126+ (
127+ "error cases (login node)" ,
128+ lambda : _check_error_cases (login_node_remote_command_executor , dcv_authenticator_port ),
129+ ),
130+ ("shared dir (head node)" , lambda : _check_shared_dir (head_node_remote_command_executor , shared_dir )),
131+ ("shared dir (login node)" , lambda : _check_shared_dir (login_node_remote_command_executor , shared_dir )),
132+ ("no crashes (head node)" , lambda : _assert_no_crashes (head_node_remote_command_executor , test_datadir )),
133+ ("no crashes (login node)" , lambda : _assert_no_crashes (login_node_remote_command_executor , test_datadir )),
134+ (
135+ "cloudwatch logs" ,
136+ lambda : FeatureSpecificCloudWatchLoggingTestRunner .run_tests_for_feature (
137+ cluster , scheduler , os , "dcv_enabled" , region , shared_dir
138+ ),
139+ ),
140+ ]
141+
142+ failures = []
143+ for check_name , check_fn in checks :
144+ try :
145+ check_fn ()
146+ except Exception as e :
147+ logging .error ("Soft assertion failed for '%s': %s" , check_name , e )
148+ failures .append (f"{ check_name } : { e } " )
149+
150+ if failures :
151+ formatted = []
152+ for i , f in enumerate (failures ):
153+ # Unescape literal \n and \t sequences so the output is human-readable
154+ readable = f .replace ("\\ n" , "\n " ).replace ("\\ t" , "\t " )
155+ formatted .append (f" [{ i + 1 } ] { readable } " )
156+ pytest .fail (f"{ len (failures )} DCV configuration check(s) failed:\n " + "\n " .join (formatted ))
111157
112158
113159def _check_auth_ko (remote_command_executor , dcv_authenticator_port , params , expected_message ):
@@ -138,9 +184,37 @@ def _check_auth_ok(remote_command_executor, external_authenticator_port, session
138184 ).is_equal_to ('<auth result="yes"><username>{0}</username></auth>' .format (username ))
139185
140186
141- def _check_no_crashes (remote_command_executor , test_datadir ):
142- """Verify no core files in /var/crash, which on ubuntu18 causes a popup when logging into the 1st session."""
143- remote_command_executor .run_remote_script (str (test_datadir / "verify_no_core_files.sh" ))
def _get_crash_report(remote_command_executor, test_datadir):
    """Collect the crash report of a node as a dictionary.

    Runs the ``get_crash_report.sh`` script found in ``test_datadir`` on the node and parses its
    standard output as JSON. The script is expected to scan the crash locations of all
    pcluster-supported OSes (Ubuntu, AL2, AL2023, RHEL8/9, Rocky8/9) and to print a JSON
    dictionary mapping crash file paths to their human-readable content.

    :param remote_command_executor: executor used to run the script on the target node.
    :param test_datadir: path-like directory containing ``get_crash_report.sh``.
    :return: dictionary of crash file path -> content; empty when no crashes were found.
    :raises json.JSONDecodeError: if the script output is not valid JSON (e.g. empty output);
        the message names the script and includes the raw output.
    :raises ValueError: if the script output is valid JSON but not a dictionary.
    """
    script_path = str(test_datadir / "get_crash_report.sh")
    # NOTE(review): pty=False presumably keeps terminal artifacts out of the JSON payload -- confirm.
    result = remote_command_executor.run_remote_script(script_path, pty=False)
    try:
        crash_report = json.loads(result.stdout)
    except json.JSONDecodeError as err:
        # Re-raise with the same exception type but an actionable message: the bare decoder
        # error ("Expecting value: ...") does not say which script produced the bad output.
        raise json.JSONDecodeError(
            f"{script_path} did not print valid JSON ({err.msg}); stdout was: {result.stdout!r}",
            err.doc,
            err.pos,
        ) from err
    if not isinstance(crash_report, dict):
        # Fail here rather than later on `.items()`, where the cause would be obscured.
        raise ValueError(
            f"{script_path} must print a JSON dictionary, got {type(crash_report).__name__}: {result.stdout!r}"
        )
    return crash_report
198+
199+
def _is_tolerated_crash(content):
    """Tell whether a crash may be ignored, judging by its textual content.

    UNTOLERATED_CRASH_PATTERNS take precedence: a single match there makes the crash
    untolerated no matter what TOLERATED_CRASH_PATTERNS says. Otherwise the crash is
    tolerated only if at least one tolerated pattern matches.

    :param content: text of the crash report entry to inspect.
    :return: True if the crash is tolerated, False otherwise.
    """
    if any(regex.search(content) for regex in UNTOLERATED_CRASH_PATTERNS):
        return False
    return any(regex.search(content) for regex in TOLERATED_CRASH_PATTERNS)
209+
210+
def _assert_no_crashes(remote_command_executor, test_datadir):
    """Fail if the node has crashes that are not tolerated.

    Every crash found is logged as a warning (tolerated or not); the assertion is made
    only on the crashes rejected by `_is_tolerated_crash`.

    :param remote_command_executor: executor bound to the node to inspect.
    :param test_datadir: directory holding the crash report script.
    """
    crash_report = _get_crash_report(remote_command_executor, test_datadir)
    if crash_report:
        node = remote_command_executor.target
        pretty_report = json.dumps(crash_report, indent=2)
        logging.warning("Crash report for %s:\n%s", node, pretty_report)

    # Keep only the entries that no tolerated pattern excuses.
    untolerated = {}
    for path, content in crash_report.items():
        if not _is_tolerated_crash(content):
            untolerated[path] = content
    assert_that(untolerated).is_empty()
144218
145219
146220def _get_known_hosts_content (host_keys_file ):
@@ -153,6 +227,7 @@ def _get_known_hosts_content(host_keys_file):
153227
154228def _check_error_cases (remote_command_executor , dcv_authenticator_port ):
155229 """Check DCV errors for both head and login nodes."""
230+ logging .info (f"Checking expected authentication failure on { remote_command_executor .target } " )
156231 _check_auth_ko (
157232 remote_command_executor ,
158233 dcv_authenticator_port ,
@@ -165,6 +240,20 @@ def _check_error_cases(remote_command_executor, dcv_authenticator_port):
165240 _check_auth_ko (
166241 remote_command_executor , dcv_authenticator_port , "-d action=requestToken -d authUser=centos" , "Wrong parameters"
167242 )
243+ logging .info (f"Completed checks for authentication failure on { remote_command_executor .target } " )
244+
245+
@contextlib.contextmanager
def _temporary_known_host(hostname, host_keys_file, env):
    """Context manager that keeps the SSH host keys of `hostname` in `host_keys_file` while active.

    The keys are added on entry and removed on exit, even if the body raises. The whole
    add/use/remove sequence runs under an exclusive lock on "<host_keys_file>.lock", so
    processes sharing the same known_hosts file are serialized.

    :param hostname: host whose keys are temporarily trusted.
    :param host_keys_file: path (string) of the known_hosts file to edit.
    :param env: environment passed to the key removal helper.
    """
    lock_path = host_keys_file + ".lock"
    with open(lock_path, "w") as lock_handle:
        # Blocks until the exclusive lock is granted; it is held until lock_handle is closed
        # at the end of this `with` block.
        fcntl.flock(lock_handle, fcntl.LOCK_EX)
        try:
            add_keys_to_known_hosts(hostname, host_keys_file)
            yield
        finally:
            # Runs even when adding the keys or the managed body fails.
            remove_keys_from_known_hosts(hostname, host_keys_file, env=env)
168257
169258
170259def _test_show_url (cluster , region , dcv_port , access_from , use_login_node = False ): # noqa: C901
@@ -174,7 +263,6 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
174263
175264 node_ip = cluster .get_login_node_public_ip () if use_login_node else cluster .head_node_ip
176265
177- # add ssh key to jenkins user known hosts file to avoid ssh keychecking prompt
178266 # Ensure known_hosts path exists to avoid `cat` command returning non-zero exit when testing in ADC region.
179267 host_keys_file = operating_system .path .expanduser ("~/.ssh/known_hosts" )
180268 host_keys_path = Path (host_keys_file )
@@ -186,18 +274,18 @@ def _test_show_url(cluster, region, dcv_port, access_from, use_login_node=False)
186274 except Exception as e :
187275 logging .warning (f"Failed to prepare known_hosts file { host_keys_file } : { e } " )
188276
189- add_keys_to_known_hosts (node_ip , host_keys_file )
190-
191277 dcv_connect_args = ["pcluster" , "dcv-connect" , "--cluster-name" , cluster .name , "--show-url" ]
192278
193279 if use_login_node :
194280 dcv_connect_args .extend (["--login-node-ip" , node_ip ])
195281
196- try :
197- result = run_pcluster_command (dcv_connect_args , env = env )
198- finally :
199- # remove ssh key from jenkins user known hosts file
200- remove_keys_from_known_hosts (node_ip , host_keys_file , env = env )
282+ with _temporary_known_host (node_ip , host_keys_file , env ):
283+ try :
284+ result = run_pcluster_command (dcv_connect_args , env = env )
285+ except subprocess .CalledProcessError as e :
286+ raise AssertionError (
287+ f"Command { e .cmd } failed (exit { e .returncode } ).\n stderr: { e .stderr } \n stdout: { e .stdout } "
288+ ) from e
201289
202290 assert_that (result .stdout ).matches (
203291 r"Please use the following one-time URL in your browser within 30 seconds:\n"
@@ -221,9 +309,8 @@ def _test_authenticator(remote_command_executor, dcv_authenticator_port, shared_
221309 dcv_session_token = dcv_parameters .group (3 )
222310 _check_auth_ok (remote_command_executor , dcv_authenticator_port , dcv_session_id , dcv_session_token , os )
223311 else :
224- print (
312+ assert_that ( dcv_parameters ). described_as (
225313 "Command '{0} {1}' fails, output: {2}, error: {3}" .format (
226314 DCV_CONNECT_SCRIPT , shared_dir , command_execution .stdout , command_execution .stderr
227315 )
228- )
229- raise AssertionError
316+ ).is_not_none ()
0 commit comments