Skip to content

Commit 671560b

Browse files
committed
updated cuda memory handling ...
* now all gunicorn workers are discarded after processing 1 request * this will prevent VRAM leak caused by zombie workers holding VRAM doing nothing * but in exchange of some loading time at worker initialization (e.g. model loading) * can be overridden by setting `max_requests` passed to `serve_production()` in app.py * number of gunicorn workers are set reasonably low when app declares GPU usage * app metadata now has two new fields can be used to declare GPU usage * for GPU apps, first-time VRAM usage for each parameter combination is now recorded in local cache directory see `$XDG_CACHE_HOME/clams`
1 parent 69bdca9 commit 671560b

9 files changed

Lines changed: 478 additions & 41 deletions

File tree

clams/app/__init__.py

Lines changed: 203 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
from typing import Union, Any, Optional, Dict, List, Tuple
1414

1515
from mmif import Mmif, Document, DocumentTypes, View
16+
from mmif.utils.cli.describe import generate_param_hash # pytype: disable=import-error
1617
from clams.appmetadata import AppMetadata, real_valued_primitives, python_type, map_param_kv_delimiter
1718

1819
logging.basicConfig(
19-
level=logging.WARNING,
20+
level=getattr(logging, os.environ.get('CLAMS_LOGLEVEL', 'WARNING').upper(), logging.WARNING),
2021
format="%(asctime)s %(name)s %(levelname)-8s %(thread)d %(message)s",
2122
datefmt="%Y-%m-%d %H:%M:%S")
2223

@@ -47,7 +48,7 @@ class ClamsApp(ABC):
4748
'description': 'The JSON body of the HTTP response will be re-formatted with 2-space indentation',
4849
},
4950
{
50-
'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': False, 'multivalued': False,
51+
'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': True, 'multivalued': False,
5152
'description': 'The running time of the app will be recorded in the view metadata',
5253
},
5354
{
@@ -160,20 +161,19 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) ->
160161
hwFetch = refined.get('hwFetch', False)
161162
runtime_recs = {}
162163
if hwFetch:
164+
import multiprocessing
163165
import platform, shutil, subprocess
164-
runtime_recs['architecture'] = platform.machine()
165-
# runtime_recs['processor'] = platform.processor() # this only works on Windows
166+
runtime_recs['cpu'] = f"{platform.machine()}, {multiprocessing.cpu_count()} cores"
166167
runtime_recs['cuda'] = []
167168
# Use cuda_profiler data if available, otherwise fallback to nvidia-smi
168169
if cuda_profiler:
169-
for gpu_info, peak_memory_bytes in cuda_profiler.items():
170-
# Convert peak memory to human-readable format
171-
peak_memory_mb = peak_memory_bytes / (1000 * 1000)
172-
if peak_memory_mb >= 1000:
173-
peak_memory_str = f"{peak_memory_mb / 1000:.2f} GiB"
174-
else:
175-
peak_memory_str = f"{peak_memory_mb:.1f} MiB"
176-
runtime_recs['cuda'].append(f"{gpu_info}, Used {self._cuda_memory_to_str(peak_memory_bytes)}")
170+
for gpu_name, mem_info in cuda_profiler.items():
171+
total_str = self._cuda_memory_to_str(mem_info['total'])
172+
available_str = self._cuda_memory_to_str(mem_info['available_before'])
173+
peak_str = self._cuda_memory_to_str(mem_info['peak'])
174+
runtime_recs['cuda'].append(
175+
f"{gpu_name}, {total_str} total, {available_str} available, {peak_str} peak used"
176+
)
177177
elif shutil.which('nvidia-smi'):
178178
for gpu in subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
179179
stdout=subprocess.PIPE).stdout.decode('utf-8').strip().split('\n'):
@@ -345,50 +345,224 @@ def _cuda_device_name_concat(name, mem):
345345
mem = ClamsApp._cuda_memory_to_str(mem)
346346
return f"{name}, With {mem}"
347347

348+
def _get_profile_path(self, param_hash: str) -> pathlib.Path:
349+
"""
350+
Get filesystem path for memory profile file.
351+
352+
Profile files are stored in a per-app directory under user's cache.
353+
354+
:param param_hash: Hash of parameters from :func:`mmif.utils.cli.describe.generate_param_hash`
355+
:return: Path to the profile file
356+
"""
357+
# Sanitize app identifier for filesystem use
358+
app_id = self.metadata.identifier.replace('/', '-').replace(':', '-')
359+
cache_base = pathlib.Path(os.environ.get('XDG_CACHE_HOME', pathlib.Path.home() / '.cache'))
360+
cache_dir = cache_base / 'clams' / 'memory_profiles' / app_id
361+
return cache_dir / f"memory_{param_hash}.json"
362+
363+
@staticmethod
364+
def _get_available_vram() -> int:
365+
"""
366+
Get currently available VRAM in bytes (GPU-wide, across all processes).
367+
368+
Uses nvidia-smi to get actual available memory, not just current process.
369+
370+
:return: Available VRAM in bytes, or 0 if unavailable
371+
"""
372+
try:
373+
import subprocess
374+
import shutil
375+
if shutil.which('nvidia-smi'):
376+
# Get free memory from nvidia-smi (reports GPU-wide, not per-process)
377+
result = subprocess.run(
378+
['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', '0'],
379+
capture_output=True, text=True, timeout=5
380+
)
381+
if result.returncode == 0 and result.stdout.strip():
382+
free_mb = float(result.stdout.strip())
383+
return int(free_mb * 1024 * 1024) # Convert MB to bytes
384+
except Exception:
385+
pass
386+
387+
# Fallback to torch (only sees current process memory)
388+
try:
389+
import torch # pytype: disable=import-error
390+
if not torch.cuda.is_available():
391+
return 0
392+
393+
device = torch.cuda.current_device()
394+
total = torch.cuda.get_device_properties(device).total_memory
395+
used = max(torch.cuda.memory_allocated(device),
396+
torch.cuda.memory_reserved(device))
397+
return total - used
398+
except Exception:
399+
return 0
400+
401+
def _record_vram_usage(self, parameters: dict, peak_bytes: int) -> None:
402+
"""
403+
Record peak memory usage to profile file.
404+
405+
Uses atomic write (temp + rename) to avoid corruption from
406+
concurrent writes. Only updates if new value is higher.
407+
408+
Profile files are JSON containing:
409+
- peak_bytes: Peak VRAM usage by the torch process
410+
- parameters: Original parameters for human readability
411+
412+
:param parameters: Request parameters (for hash and recording)
413+
:param peak_bytes: Measured peak VRAM usage
414+
"""
415+
import json
416+
417+
if peak_bytes <= 0:
418+
return
419+
420+
param_hash = generate_param_hash(parameters)
421+
profile_path = self._get_profile_path(param_hash)
422+
423+
try:
424+
profile_path.parent.mkdir(parents=True, exist_ok=True)
425+
426+
# Check if we should update
427+
should_write = True
428+
if profile_path.exists():
429+
try:
430+
existing_data = json.loads(profile_path.read_text())
431+
existing = existing_data.get('peak_bytes', 0)
432+
if peak_bytes <= existing:
433+
should_write = False # Existing value is sufficient
434+
else:
435+
self.logger.debug(
436+
f"Updating peak memory for {param_hash}: "
437+
f"{existing/1024**3:.2f}GB -> {peak_bytes/1024**3:.2f}GB"
438+
)
439+
except (ValueError, IOError, json.JSONDecodeError):
440+
pass # Corrupted file, overwrite
441+
442+
if should_write:
443+
# Prepare profile data with original parameters for readability
444+
# Filter out internal keys and non-serializable values
445+
clean_params = {
446+
k: v for k, v in parameters.items()
447+
if k != self._RAW_PARAMS_KEY and not k.startswith('#')
448+
}
449+
profile_data = {
450+
'peak_bytes': peak_bytes,
451+
'parameters': clean_params
452+
}
453+
454+
# Atomic write: write to temp, then rename
455+
temp_path = profile_path.with_suffix('.tmp')
456+
temp_path.write_text(json.dumps(profile_data, indent=2))
457+
temp_path.rename(profile_path) # Atomic on POSIX
458+
459+
self.logger.info(
460+
f"Recorded peak memory for {param_hash}: "
461+
f"{peak_bytes/1024**3:.2f}GB"
462+
)
463+
except Exception as e:
464+
self.logger.warning(f"Failed to record memory profile: {e}")
465+
348466
@staticmethod
349467
def _profile_cuda_memory(func):
350468
"""
351-
Decorator for profiling CUDA memory usage during _annotate execution.
352-
469+
Decorator for profiling CUDA memory usage and managing VRAM availability.
470+
471+
This decorator:
472+
1. Checks VRAM requirements before execution (if conditions met)
473+
2. Rejects requests if insufficient VRAM
474+
3. Records peak memory usage after execution
475+
4. Calls empty_cache() for cleanup
476+
353477
:param func: The function to wrap (typically _annotate)
354478
:return: Decorated function that returns (result, cuda_profiler)
355479
where cuda_profiler is dict with "<GPU_NAME>, <GPU_TOTAL_MEMORY>" keys
356-
and peak memory usage values
480+
and dict values containing 'available_before' and 'peak' memory in bytes
357481
"""
358482
def wrapper(*args, **kwargs):
483+
# Get the ClamsApp instance from the bound method
484+
app_instance = getattr(func, '__self__', None)
485+
359486
cuda_profiler = {}
360487
torch_available = False
361488
cuda_available = False
362489
device_count = 0
363-
490+
available_before = {}
491+
364492
try:
365493
import torch # pytype: disable=import-error
366494
torch_available = True
367495
cuda_available = torch.cuda.is_available()
368496
device_count = torch.cuda.device_count()
369-
if cuda_available:
370-
# Reset peak memory stats for all devices
371-
torch.cuda.reset_peak_memory_stats('cuda')
372497
except ImportError:
373498
pass
374-
499+
500+
# Capture available VRAM before execution and reset stats
501+
if torch_available and cuda_available:
502+
for device_id in range(device_count):
503+
device_id_str = f'cuda:{device_id}'
504+
# Get GPU-wide available memory via nvidia-smi
505+
try:
506+
import subprocess
507+
import shutil
508+
if shutil.which('nvidia-smi'):
509+
result = subprocess.run(
510+
['nvidia-smi', '--query-gpu=memory.free',
511+
'--format=csv,noheader,nounits', '-i', str(device_id)],
512+
capture_output=True, text=True, timeout=5
513+
)
514+
if result.returncode == 0 and result.stdout.strip():
515+
free_mb = float(result.stdout.strip())
516+
available_before[device_id] = int(free_mb * 1024 * 1024)
517+
else:
518+
# Fallback to torch (process-specific)
519+
total = torch.cuda.get_device_properties(device_id_str).total_memory
520+
allocated = torch.cuda.memory_allocated(device_id_str)
521+
available_before[device_id] = total - allocated
522+
else:
523+
# Fallback to torch (process-specific)
524+
total = torch.cuda.get_device_properties(device_id_str).total_memory
525+
allocated = torch.cuda.memory_allocated(device_id_str)
526+
available_before[device_id] = total - allocated
527+
except Exception:
528+
# Fallback to torch (process-specific)
529+
total = torch.cuda.get_device_properties(device_id_str).total_memory
530+
allocated = torch.cuda.memory_allocated(device_id_str)
531+
available_before[device_id] = total - allocated
532+
# Reset peak memory stats for all devices
533+
torch.cuda.reset_peak_memory_stats('cuda')
534+
375535
try:
376536
result = func(*args, **kwargs)
377-
537+
538+
# Record peak memory usage
539+
total_peak = 0
378540
if torch_available and cuda_available and device_count > 0:
379541
for device_id in range(device_count):
380-
device_id = f'cuda:{device_id}'
381-
peak_memory = torch.cuda.max_memory_allocated(device_id)
382-
gpu_name = torch.cuda.get_device_name(device_id)
383-
gpu_total_memory = torch.cuda.get_device_properties(device_id).total_memory
384-
key = ClamsApp._cuda_device_name_concat(gpu_name, gpu_total_memory)
385-
cuda_profiler[key] = peak_memory
386-
542+
device_id_str = f'cuda:{device_id}'
543+
peak_memory = torch.cuda.max_memory_allocated(device_id_str)
544+
total_peak = max(total_peak, peak_memory)
545+
gpu_name = torch.cuda.get_device_name(device_id_str)
546+
gpu_total_memory = torch.cuda.get_device_properties(device_id_str).total_memory
547+
cuda_profiler[gpu_name] = {
548+
'total': gpu_total_memory,
549+
'available_before': available_before.get(device_id, 0),
550+
'peak': peak_memory
551+
}
552+
553+
# Record peak memory for future requests (if GPU app)
554+
gpu_app = (
555+
hasattr(app_instance, 'metadata') and
556+
getattr(app_instance.metadata, 'est_gpu_mem_min', 0) > 0
557+
)
558+
if gpu_app and total_peak > 0:
559+
app_instance._record_vram_usage(kwargs, total_peak)
560+
387561
return result, cuda_profiler
388562
finally:
389563
if torch_available and cuda_available:
390564
torch.cuda.empty_cache()
391-
565+
392566
return wrapper
393567

394568
@staticmethod

clams/appmetadata/__init__.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,9 +352,20 @@ class AppMetadata(pydantic.BaseModel):
352352
"a package name and its version in the string value at the minimum (e.g., ``clams-python==1.2.3``)."
353353
)
354354
more: Optional[Dict[str, str]] = pydantic.Field(
355-
None,
355+
None,
356356
description="(optional) A string-to-string map that can be used to store any additional metadata of the app."
357357
)
358+
est_gpu_mem_min: int = pydantic.Field(
359+
0,
360+
description="(optional) Minimum GPU memory required to run the app, in megabytes (MB). "
361+
"Set to 0 (default) if the app does not use GPU."
362+
)
363+
est_gpu_mem_typ: int = pydantic.Field(
364+
0,
365+
description="(optional) Typical GPU memory usage for default parameters, in megabytes (MB). "
366+
"Must be equal or larger than est_gpu_mem_min. "
367+
"Set to 0 (default) if the app does not use GPU."
368+
)
358369

359370
model_config = {
360371
'title': 'CLAMS AppMetadata',
@@ -372,6 +383,21 @@ def assign_versions(cls, data):
372383
data.mmif_version = get_mmif_specver()
373384
return data
374385

386+
@pydantic.model_validator(mode='after')
387+
@classmethod
388+
def validate_gpu_memory(cls, data):
389+
import warnings
390+
if data.est_gpu_mem_typ > 0 and data.est_gpu_mem_min > 0:
391+
if data.est_gpu_mem_typ < data.est_gpu_mem_min:
392+
warnings.warn(
393+
f"est_gpu_mem_typ ({data.est_gpu_mem_typ} MB) is less than "
394+
f"est_gpu_mem_min ({data.est_gpu_mem_min} MB). "
395+
f"Setting est_gpu_mem_typ to {data.est_gpu_mem_min} MB.",
396+
UserWarning
397+
)
398+
data.est_gpu_mem_typ = data.est_gpu_mem_min
399+
return data
400+
375401
@pydantic.field_validator('identifier', mode='before')
376402
@classmethod
377403
def append_version(cls, val):

clams/develop/templates/app/metadata.py.template

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ def appmetadata() -> AppMetadata:
3939
# this trick can also be useful (replace ANALYZER_NAME with the pypi dist name)
4040
analyzer_version=[l.strip().rsplit('==')[-1] for l in open(pathlib.Path(__file__).parent / 'requirements.txt').readlines() if re.match(r'^ANALYZER_NAME==', l)][0],
4141
analyzer_license="", # short name for a software license
42+
# GPU memory estimates (in MB). Set to 0 if the app does not use GPU.
43+
est_gpu_mem_min=0, # estimated memory usage with minimal computation parameters
44+
est_gpu_mem_typ=0, # estimated memory usage with default parameters, must be >= est_gpu_mem_min
4245
)
4346
# and then add I/O specifications: an app must have at least one input and one output
4447
metadata.add_input(DocumentTypes.Document)

0 commit comments

Comments
 (0)