Skip to content

Commit 63a8c89

Browse files
authored
Merge pull request #12 from EESSI/gpu
Add support for detection of GPU software
2 parents 6732e5c + cffc123 commit 63a8c89

4 files changed

Lines changed: 100 additions & 31 deletions

File tree

.github/workflows/docs.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ jobs:
3333
module purge
3434
module unuse $MODULEPATH
3535
module use /cvmfs/software.eessi.io/init/modules/
36+
# Set CPU override for the stack to query
37+
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake"
38+
# Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check)
39+
export EESSI_ACCELERATOR_TARGET_OVERRIDE="accel/nvidia/cc90"
40+
export EESSI_OVERRIDE_GPU_CHECK=1
3641
# First do 2023.06 for EB 4
3742
( module load EESSI/2023.06 && module load EasyBuild/4 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) &
3843
# then 2023.06 for EB 5

.github/workflows/prs.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ jobs:
2828
module purge
2929
module unuse $MODULEPATH
3030
module use /cvmfs/software.eessi.io/init/modules/
31+
# Set CPU override for the stack to query
32+
export EESSI_ARCHDETECT_OPTIONS_OVERRIDE="x86_64/intel/icelake"
33+
# Set GPU overrides for the stack to query (we have no GPU, so also override the GPU check)
34+
export EESSI_ACCELERATOR_TARGET_OVERRIDE="accel/nvidia/cc90"
35+
export EESSI_OVERRIDE_GPU_CHECK=1
3136
# Only do 2023.06 for EB 5 since this is just a test
3237
( module load EESSI/2023.06 && module load EasyBuild/5 && module load EESSI-extend && python scripts/generate_data_files.py --eessi-version=2023.06 ) &
3338
# Merge all these results together

scripts/generate_data_files.py

Lines changed: 56 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,16 @@
1010
from collections import defaultdict, OrderedDict
1111
from datetime import datetime, timezone
1212
from easybuild.tools.version import VERSION as EASYBUILD_VERSION
13-
from easybuild.framework.easyconfig.easyconfig import process_easyconfig, get_toolchain_hierarchy
13+
from easybuild.framework.easyconfig.easyconfig import (
14+
process_easyconfig,
15+
get_toolchain_hierarchy,
16+
)
1417
from easybuild.tools.options import set_up_configuration
1518
from easybuild.tools.include import include_easyblocks
1619
from contextlib import contextmanager
1720

1821
VALID_EESSI_VERSIONS = ["2025.06", "2023.06"]
1922

20-
EESSI_REFERENCE_ARCHITECTURE = "x86_64/intel/icelake"
21-
2223
# Give order to my toolchains so I can easily figure out what "latest" means
2324
EESSI_SUPPORTED_TOP_LEVEL_TOOLCHAINS = OrderedDict(
2425
{
@@ -49,7 +50,11 @@ def suppress_stdout():
4950

5051
def module_dict_from_module_string(module):
5152
module_name, module_version = module.split("/", 1)
52-
module_dict = {"module_name": module_name, "module_version": module_version, "full_module_name": module}
53+
module_dict = {
54+
"module_name": module_name,
55+
"module_version": module_version,
56+
"full_module_name": module,
57+
}
5358

5459
return module_dict
5560

@@ -182,6 +187,16 @@ def collect_eb_files(base_path):
182187
return dict(eb_files_by_version)
183188

184189

190+
def merge_dicts(d1, d2):
    """Merge two dicts whose values are lists into one new dict.

    For every key that appears in either input, the resulting list holds the
    items from ``d1`` first, followed by the items from ``d2``.

    Args:
        d1: Mapping of key -> iterable of items.
        d2: Mapping of key -> iterable of items.

    Returns:
        A plain ``dict`` mapping each key to a newly built list of items.
    """
    merged = defaultdict(list)

    for d in (d1, d2):
        for key, value in d.items():
            # extend() copies the items into a list owned by `merged`, so the
            # lists held by d1 and d2 are neither shared nor modified.
            merged[key].extend(value)

    # Convert back to a regular dict so that looking up a missing key raises
    # KeyError instead of silently inserting an empty list.
    return dict(merged)
198+
199+
185200
if __name__ == "__main__":
186201
# The EESSI version is provided as an argument
187202
parser = argparse.ArgumentParser(description="EESSI version to scan.")
@@ -199,10 +214,23 @@ def collect_eb_files(base_path):
199214
print(f"Using EESSI version: {eessi_version}")
200215

201216
# We use a single architecture path to gather information about the software versions
202-
base_path = (
203-
f"/cvmfs/software.eessi.io/versions/{eessi_version}/software/linux/{EESSI_REFERENCE_ARCHITECTURE}/software/"
204-
)
205-
result = collect_eb_files(base_path)
217+
eessi_reference_architecture = os.getenv("EESSI_ARCHDETECT_OPTIONS_OVERRIDE", False)
218+
if not eessi_reference_architecture:
219+
print("You must have selected a CPU architecture via EESSI_ARCHDETECT_OPTIONS_OVERRIDE")
220+
exit()
221+
base_path = f"/cvmfs/software.eessi.io/versions/{eessi_version}/software/linux/{eessi_reference_architecture}"
222+
cpu_easyconfig_files_dict = collect_eb_files(os.path.join(base_path, "software"))
223+
# We also gather all the accelerator installations for NVIDIA-enabled packages
224+
# We're not typically running this script on a node with a GPU so an override must have been set
225+
eessi_reference_nvidia_architecture = os.getenv("EESSI_ACCELERATOR_TARGET_OVERRIDE", False)
226+
if not eessi_reference_nvidia_architecture:
227+
print("You must have selected a GPU architecture via EESSI_ACCELERATOR_TARGET_OVERRIDE")
228+
exit()
229+
accel_base_path = os.path.join(base_path, eessi_reference_nvidia_architecture)
230+
accel_easyconfig_files_dict = collect_eb_files(os.path.join(accel_base_path, "software"))
231+
232+
# Merge the easyconfig files
233+
easyconfig_files_dict = merge_dicts(cpu_easyconfig_files_dict, accel_easyconfig_files_dict)
206234

207235
set_up_configuration(args="")
208236
tmpdir = tempfile.mkdtemp()
@@ -224,23 +252,23 @@ def collect_eb_files(base_path):
224252
{"name": "system", "version": "system"}
225253
] + get_toolchain_hierarchy(top_level_toolchain)
226254

227-
for eb_version_of_install, files in sorted(result.items()):
255+
for eb_version_of_install, easyconfigs in sorted(easyconfig_files_dict.items()):
228256
print(f"Major version {eb_version_of_install}:")
229257
if eb_version_of_install == str(EASYBUILD_VERSION.version[0]):
230-
total_files = len(files)
231-
for i, file in enumerate(files, start=1):
232-
percent = (i / total_files) * 100
233-
print(f"{percent:.1f}% - {file}")
258+
total_easyconfigs = len(easyconfigs)
259+
for i, easyconfig in enumerate(easyconfigs, start=1):
260+
percent = (i / total_easyconfigs) * 100
261+
print(f"{percent:.1f}% - {easyconfig}")
234262

235263
# Don't try to parse an EasyBuild easyconfig that is not the same major release
236-
if "/software/EasyBuild/" in file and f"/EasyBuild/{eb_version_of_install}" not in file:
264+
if "/software/EasyBuild/" in easyconfig and f"/EasyBuild/{eb_version_of_install}" not in easyconfig:
237265
continue
238266
# print(process_easyconfig(path)[0]['ec'].asdict())
239267

240-
eb_hooks_path = use_timestamped_reprod_if_exists(f"{os.path.dirname(file)}/reprod/easyblocks")
268+
eb_hooks_path = use_timestamped_reprod_if_exists(f"{os.path.dirname(easyconfig)}/reprod/easyblocks")
241269
easyblocks_dir = include_easyblocks(tmpdir, [eb_hooks_path + "/*.py"])
242270
with suppress_stdout():
243-
parsed_ec = process_easyconfig(file)[0]
271+
parsed_ec = process_easyconfig(easyconfig)[0]
244272
# included easyblocks are the first entry in sys.path, so just pop them but keep a list of what was used
245273
sys.path.pop(0)
246274
easyblocks_used = [
@@ -252,26 +280,29 @@ def collect_eb_files(base_path):
252280

253281
# Store everything we now know about the installation as a dict
254282
# Use the path as the key since we know it is unique
255-
eessi_software["eessi_version"][eessi_version][file] = parsed_ec["ec"].asdict()
256-
eessi_software["eessi_version"][eessi_version][file]["mtime"] = os.path.getmtime(file)
283+
eessi_software["eessi_version"][eessi_version][easyconfig] = parsed_ec["ec"].asdict()
284+
eessi_software["eessi_version"][eessi_version][easyconfig]["mtime"] = os.path.getmtime(easyconfig)
257285

258286
# Make sure we can load the module before adding its information to the main dict
259287
try:
260-
eessi_software["eessi_version"][eessi_version][file]["required_modules"] = load_and_list_modules(
261-
parsed_ec["full_mod_name"]
288+
eessi_software["eessi_version"][eessi_version][easyconfig]["required_modules"] = (
289+
load_and_list_modules(parsed_ec["full_mod_name"])
262290
)
263291
except RuntimeError as e:
264-
print(f"Ignoring {file} due to error processing module: {e}")
265-
eessi_software["eessi_version"][eessi_version].pop(file)
292+
print(f"Ignoring {easyconfig} due to error processing module: {e}")
293+
eessi_software["eessi_version"][eessi_version].pop(easyconfig)
266294
continue
267295

268296
# Add important data that is related to the module environment
269-
eessi_software["eessi_version"][eessi_version][file]["module"] = module_dict_from_module_string(
297+
eessi_software["eessi_version"][eessi_version][easyconfig]["module"] = module_dict_from_module_string(
270298
parsed_ec["full_mod_name"]
271299
)
272300
# Retain the easyblocks used so we can use a heuristic to figure out the type of extensions (R, Python, Perl)
273-
eessi_software["eessi_version"][eessi_version][file]["easyblocks"] = easyblocks_used
301+
eessi_software["eessi_version"][eessi_version][easyconfig]["easyblocks"] = easyblocks_used
274302

275303
# Store the result
276-
with open(f"eessi_software_{eessi_version}-eb{str(EASYBUILD_VERSION.version[0])}.yaml", "w") as f:
304+
with open(
305+
f"eessi_software_{eessi_version}-eb{str(EASYBUILD_VERSION.version[0])}.yaml",
306+
"w",
307+
) as f:
277308
yaml.dump(eessi_software, f)

scripts/process_eessi_software_metadata.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@
2424
"x86_64/intel/cascadelake",
2525
]
2626

27+
NVIDIA_ARCHITECTURES = [
28+
"accel/nvidia/cc70",
29+
"accel/nvidia/cc80",
30+
"accel/nvidia/cc90",
31+
"accel/nvidia/cc100",
32+
"accel/nvidia/cc120",
33+
]
34+
2735
TOOLCHAIN_FAMILIES = [
2836
"2025b_foss",
2937
"2025a_foss",
@@ -59,13 +67,25 @@ def get_software_information_by_filename(file_metadata, original_path=None, tool
5967
if f"/{arch}/" in original_path:
6068
detected_arch = arch
6169
break
62-
6370
if detected_arch is None:
6471
raise RuntimeError("No known architecture matched in the input path.")
6572

73+
# also detect the GPU arch (this one may not exist)
74+
# needs to be a dict as we can filter on associated cpu arch
75+
base_version_dict["gpu_arch"] = {}
76+
detected_accel_arch = None
77+
for accel_arch in NVIDIA_ARCHITECTURES:
78+
if f"/{accel_arch}/" in original_path:
79+
detected_accel_arch = accel_arch
80+
break
81+
if detected_accel_arch is None:
82+
# Not having a GPU is not an error (we can just leave it empty, which is falsey)
83+
detected_accel_arch = ""
84+
6685
# 2) Construct the modulefile path
6786
before_arch, _, _ = original_path.partition(detected_arch)
68-
modulefile = before_arch + detected_arch + "/modules/all/" + file_metadata["module"]["full_module_name"] + ".lua"
87+
# Remember, detected_accel_arch can be an empty string
88+
modulefile = os.path.join(before_arch, detected_arch, detected_accel_arch, "modules/all", file_metadata["module"]["full_module_name"] + ".lua")
6989
spider_cache = before_arch + detected_arch + "/.lmod/cache/spiderT.lua"
7090

7191
# 3) Substitute each architecture and test module file existence in spider cache
@@ -76,13 +96,21 @@ def get_software_information_by_filename(file_metadata, original_path=None, tool
7696
found = subprocess.run(["grep", "-q", substituted_modulefile, substituted_spider_cache]).returncode == 0
7797
if found:
7898
base_version_dict["cpu_arch"].append(arch)
99+
# If we have an accelerator module let's check which architectures are supported
100+
if detected_accel_arch:
101+
base_version_dict["gpu_arch"][arch] = []
102+
for accel_arch in NVIDIA_ARCHITECTURES:
103+
accel_substituted_modulefile = substituted_modulefile.replace(detected_accel_arch, accel_arch)
104+
found = subprocess.run(["grep", "-q", accel_substituted_modulefile, substituted_spider_cache]).returncode == 0
105+
if found:
106+
base_version_dict["gpu_arch"][arch].append(accel_arch)
107+
else:
108+
print(f"No module {accel_substituted_modulefile}...not adding software for architecture {arch}/{accel_arch}")
109+
continue
79110
else:
80-
print(f"No module {substituted_modulefile}...not adding software for archtecture {arch}")
111+
print(f"No module {substituted_modulefile}...not adding software for architecture {arch}")
81112
continue
82113

83-
# TODO: Handle GPU arch later, but it is going to need to be a dict as we will filter on cpu arch
84-
base_version_dict["gpu_arch"] = {}
85-
86114
# Now we can cycle through the possibilities
87115
# - software application itself
88116
software = {}

0 commit comments

Comments
 (0)