diff --git a/examples/benchmarks/ort_inference_performance.py b/examples/benchmarks/ort_inference_performance.py index 18bda2043..73c258387 100644 --- a/examples/benchmarks/ort_inference_performance.py +++ b/examples/benchmarks/ort_inference_performance.py @@ -4,13 +4,28 @@ """Micro benchmark example for ONNXRuntime inference performance. Commands to run: + In-house models: python3 examples/benchmarks/ort_inference_performance.py + python3 examples/benchmarks/ort_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run ORT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context( 'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16' ) @@ -21,3 +36,57 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=32, seq_length=512): + """Run ORT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('float32', 'float16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length}' + ) + + logger.info(f'Running ORT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context('ort-inference', platform=Platform.CUDA, parameters=parameters) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ORT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark(args.model_identifier, args.precision, args.batch_size, args.seq_length) + else: + run_inhouse_benchmark() diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py index cacbf1177..bb48dfb3f 100644 --- a/examples/benchmarks/tensorrt_inference_performance.py +++ b/examples/benchmarks/tensorrt_inference_performance.py @@ -4,13 +4,28 @@ """Micro benchmark example for TensorRT inference performance. 
Commands to run: + In-house models: python3 examples/benchmarks/tensorrt_inference_performance.py + python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run TensorRT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context('tensorrt-inference', platform=Platform.CUDA) benchmark = BenchmarkRegistry.launch_benchmark(context) if benchmark: @@ -19,3 +34,64 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32, seq_length=512, iterations=2048): + """Run TensorRT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('fp32', 'fp16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. + iterations: Number of inference iterations. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length} ' + f'--iterations {iterations}' + ) + + logger.info(f'Running TensorRT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context( + 'tensorrt-inference', platform=Platform.CUDA, parameters=parameters + ) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='TensorRT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + parser.add_argument('--iterations', type=int, default=2048) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark( + args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations + ) + else: + run_inhouse_benchmark() diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 876d2ccfe..77a9629b8 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -9,22 +9,23 @@ import torch.hub import torch.onnx import 
torchvision.models -from transformers import BertConfig, GPT2Config, LlamaConfig -from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel +import traceback -if MixtralBenchmarkModel is not None: - from transformers import MixtralConfig +from superbench.common.utils import logger class torch2onnxExporter(): """PyTorch model to ONNX exporter.""" def __init__(self): """Constructor.""" + from transformers import BertConfig, GPT2Config, LlamaConfig + from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel + self.num_classes = 100 self.lstm_input_size = 256 self.benchmark_models = { @@ -129,6 +130,7 @@ def __init__(self): # Only include Mixtral models if MixtralBenchmarkModel is available if MixtralBenchmarkModel is not None: + from transformers import MixtralConfig self.benchmark_models.update( { 'mixtral-8x7b': @@ -270,3 +272,151 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): del dummy_input torch.cuda.empty_cache() return file_name + + def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=512, output_dir=None): + """Export a HuggingFace model to ONNX format. + + Args: + model: HuggingFace model instance to export. + model_name (str): Name for the exported ONNX model file. 
+ batch_size (int): Batch size of input. Defaults to 1. + seq_length (int): Sequence length of input. Defaults to 512. + output_dir (str): Output directory path. If None, uses default path. + + Returns: + str: Exported ONNX model file path, or empty string if export fails. + """ + try: + # Use custom output directory if provided + output_path = Path(output_dir) if output_dir else self._onnx_model_path + file_name = str(output_path / (model_name + '.onnx')) + + # Put model in eval mode and move to CUDA if available + model.eval() + + # Disable cache to avoid DynamicCache issues with ONNX export + if hasattr(model.config, 'use_cache'): + model.config.use_cache = False + + if torch.cuda.is_available(): + model = model.cuda() + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + # Get model's dtype for inputs + model_dtype = next(model.parameters()).dtype + + # Detect model type and create appropriate inputs + # Vision models use pixel_values, NLP models use input_ids + # Use HuggingFace's main_input_name property for automatic detection + main_input = getattr(model, 'main_input_name', 'input_ids') + is_vision_model = main_input == 'pixel_values' + + if is_vision_model: + # Vision models: use pixel_values (batch_size, channels, height, width) + # Standard ImageNet size is 224x224, 3 channels + # Match the dtype of the model + dummy_input = torch.randn(batch_size, 3, 224, 224, dtype=model_dtype, device=device) + input_names = ['pixel_values'] + dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + + # Wrapper for vision models + class VisionModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, pixel_values): + outputs = self.model(pixel_values=pixel_values) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else 
outputs + + wrapped_model = VisionModelWrapper(model) + export_args = (dummy_input, ) + else: + # NLP models: use input_ids and attention_mask + dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + input_names = ['input_ids', 'attention_mask'] + dynamic_axes = { + 'input_ids': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'attention_mask': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'output': { + 0: 'batch_size' + }, + } + + # Wrapper for NLP models + class NLPModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids, attention_mask): + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else outputs + + wrapped_model = NLPModelWrapper(model) + export_args = (dummy_input, attention_mask) + + # Export to ONNX for large models (>2GB), use external data format + model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3) + use_external_data = model_size_gb > 2.0 + + if use_external_data: + logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export') + + torch.onnx.export( + wrapped_model, + export_args, + file_name, + opset_version=14, + do_constant_folding=True, + input_names=input_names, + output_names=['output'], + dynamic_axes=dynamic_axes, + ) + + # If using external data, convert to external data format + if use_external_data: + import onnx + from onnx.external_data_helper import convert_model_to_external_data + + onnx_model = onnx.load(file_name) + external_data_path = model_name + '_data.bin' + convert_model_to_external_data( + onnx_model, + all_tensors_to_one_file=True, + 
location=external_data_path, + size_threshold=1024, + convert_attribute=False + ) + onnx.save(onnx_model, file_name) + logger.info(f'Converted ONNX model to external data format: {external_data_path}') + + # Clean up + del dummy_input + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return file_name + + except Exception as e: + logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}') + logger.error(traceback.format_exc()) + return '' diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py new file mode 100644 index 000000000..95ade4815 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -0,0 +1,421 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Hugging Face model loader for benchmarking.""" + +import os +from pathlib import Path +from typing import Optional, Tuple + +import torch +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoConfig, + AutoTokenizer, + PreTrainedModel, + PretrainedConfig, +) + +from superbench.common.utils import logger +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class ModelLoadError(Exception): + """Exception raised when model loading fails.""" + pass + + +class ModelNotFoundError(ModelLoadError): + """Exception raised when model is not found.""" + pass + + +class ModelIncompatibleError(ModelLoadError): + """Exception raised when model is incompatible with ONNX export.""" + pass + + +class HuggingFaceModelLoader: + """Loads models from Hugging Face Hub for benchmarking. + + This class handles downloading, caching, and loading models from + Hugging Face Hub with support for authentication, device mapping, + and compatibility validation. + + Attributes: + cache_dir: Directory to cache downloaded models. + token: HuggingFace authentication token for private/gated models. 
+ """ + def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None): + """Initialize the HuggingFace model loader. + + Args: + cache_dir: Directory to cache downloaded models. If None, uses HF default. + token: HuggingFace authentication token for private/gated models. + """ + self.cache_dir = cache_dir or os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface') + self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN') + + # Ensure cache directory exists + Path(self.cache_dir).mkdir(parents=True, exist_ok=True) + + logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}') + if self.token: + logger.info('Authentication token provided for private/gated models (token not logged)') + + def load_model( + self, + model_identifier: str, + torch_dtype: Optional[str] = None, + device: str = 'cuda', + revision: Optional[str] = None, + device_map: Optional[str] = None, + config: Optional[PretrainedConfig] = None, + **kwargs + ) -> Tuple[PreTrainedModel, PretrainedConfig, AutoTokenizer]: + """Load a model from Hugging Face Hub. + + Args: + model_identifier: HF model ID (e.g., 'meta-llama/Llama-2-7b-hf'). + torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16'). + device: Device to load model on ('cuda', 'cpu'). + revision: Specific model version/commit/tag to use. + device_map: Device mapping strategy for large models. + config: Pre-downloaded model config. If None, downloads from Hub. + **kwargs: Additional arguments passed to from_pretrained(). + + Returns: + Tuple of (model, config, tokenizer). + + Raises: + ModelNotFoundError: If model doesn't exist on HF Hub. + ModelLoadError: If model loading fails for any reason. 
+ """ + logger.info(f'Loading model: {model_identifier}') + + try: + # Convert torch_dtype string to torch dtype + dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None + + # Prepare loading kwargs + load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs} + + # Add token if available + if self.token: + load_kwargs['token'] = self.token + + # Add dtype if specified + if dtype: + load_kwargs['torch_dtype'] = dtype + + # Load config (use pre-downloaded config if provided) + if config is None: + logger.info('Loading model configuration...') + config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + else: + logger.info('Using pre-downloaded model configuration.') + + # Load tokenizer (may fail for some models, that's ok) + tokenizer = None + try: + logger.info('Loading tokenizer...') + tokenizer = AutoTokenizer.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + except Exception as e: + logger.warning(f'Could not load tokenizer: {e}. 
Continuing without tokenizer.') + + # Load model + logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...') + model_kwargs = load_kwargs.copy() + model_kwargs['trust_remote_code'] = True + + # Handle device mapping for large models + if device_map: + model_kwargs['device_map'] = device_map + elif device == 'cuda' and torch.cuda.is_available(): + # Don't set device_map if device is explicitly cuda + pass + elif device != 'cpu': + model_kwargs['device_map'] = device + + # Pass pre-downloaded config to from_pretrained so any overrides take effect + if config is not None: + model_kwargs['config'] = config + + try: + model = AutoModel.from_pretrained(model_identifier, **model_kwargs) + except ValueError: + logger.info('AutoModel failed, trying AutoModelForCausalLM...') + model = AutoModelForCausalLM.from_pretrained(model_identifier, **model_kwargs) + + # Move to device if not using device_map + if not device_map and device != 'auto': + model = model.to(device) + + logger.info( + f'Successfully loaded model: {model_identifier} ' + f'({self._get_model_size(model):.2f}M parameters)' + ) + + return model, config, tokenizer + + except OSError as e: + if 'not found' in str(e).lower() or '404' in str(e): + raise ModelNotFoundError( + f"Model '{model_identifier}' not found on Hugging Face Hub. " + f'Please check the model ID at https://huggingface.co/models' + ) from e + raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e + except Exception as e: + raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e + + def load_model_from_config( + self, + config: ModelSourceConfig, + device: Optional[str] = None, + config_pretrained: Optional[PretrainedConfig] = None, + ) -> Tuple[PreTrainedModel, PretrainedConfig, AutoTokenizer]: + """Load a model using ModelSourceConfig. + + Args: + config: ModelSourceConfig instance with loading parameters. + device: Device to load model on. 
If None, uses CUDA when available, else CPU. + config_pretrained: Pre-downloaded HF model config. If provided, skips redundant download. + + Returns: + Tuple of (model, config, tokenizer). + + Raises: + ValueError: If config source is not 'huggingface'. + ModelLoadError: If model loading fails. + """ + if not config.is_huggingface(): + raise ValueError(f"Cannot load model with source '{config.source}'. Use 'huggingface' source.") + + # Validate config + is_valid, error = config.validate() + if not is_valid: + raise ValueError(f'Invalid configuration: {error}') + + if device is None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + # Extract loading parameters + return self.load_model( + model_identifier=config.identifier, + torch_dtype=config.torch_dtype, + device=device, + revision=config.revision, + device_map=config.device_map, + config=config_pretrained, + **config.additional_kwargs + ) + + def _get_torch_dtype(self, dtype_str: str) -> torch.dtype: + """Convert dtype string to torch.dtype. + + Args: + dtype_str: String representation of dtype ('float32', 'float16', etc.). + + Returns: + Corresponding torch.dtype. + + Raises: + ValueError: If dtype string is invalid. + """ + dtype_map = { + 'float32': torch.float32, + 'float16': torch.float16, + 'bfloat16': torch.bfloat16, + 'int8': torch.int8, + 'fp32': torch.float32, + 'fp16': torch.float16, + 'bf16': torch.bfloat16, + } + + if dtype_str.lower() not in dtype_map: + raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}") + + return dtype_map[dtype_str.lower()] + + def _get_model_size(self, model: PreTrainedModel) -> float: + """Calculate model size in millions of parameters. + + Args: + model: The model to measure. + + Returns: + Number of parameters in millions. 
+ """ + return sum(p.numel() for p in model.parameters()) / 1_000_000 + + @staticmethod + def estimate_param_count_from_config(hf_config) -> Optional[int]: + """Estimate parameter count from a HuggingFace config without instantiating the model. + + This avoids allocating tens/hundreds of GB of CPU RAM for large models (e.g. 70B). + The estimate covers embedding + transformer layers + LM head for common architectures. + + Args: + hf_config: A HuggingFace PretrainedConfig object. + + Returns: + int: Estimated number of parameters, or None if estimation is not possible. + """ + try: + vocab = getattr(hf_config, 'vocab_size', 0) + hidden = getattr(hf_config, 'hidden_size', 0) + layers = getattr(hf_config, 'num_hidden_layers', 0) + intermediate = getattr(hf_config, 'intermediate_size', hidden * 4) + num_heads = getattr(hf_config, 'num_attention_heads', 0) + num_kv_heads = getattr(hf_config, 'num_key_value_heads', num_heads) + head_dim = hidden // num_heads if num_heads > 0 else 0 + + if vocab == 0 or hidden == 0 or layers == 0: + return None + + # Embeddings: token + (optional) position + max_pos = getattr(hf_config, 'max_position_embeddings', 0) + has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None) + embed_params = vocab * hidden + if has_pos_embed and max_pos > 0: + embed_params += max_pos * hidden + + # Per transformer layer: + # Self-attention: Q, K, V projections + output projection + # MLP: gate_proj + up_proj + down_proj (LLaMA-style) or fc1 + fc2 + # Layer norms: 2 * hidden + qkv_params = (num_heads * head_dim + 2 * num_kv_heads * head_dim) * hidden + attn_out = hidden * hidden + # For gated MLPs (LLaMA/Mistral), there are 3 matrices; otherwise 2 + has_gate = getattr(hf_config, 'hidden_act', 'gelu') in ('silu', 'swiglu') + mlp_params = (3 if has_gate else 2) * hidden * intermediate + norm_params = 2 * hidden + layer_params = qkv_params + attn_out + mlp_params + norm_params + + # MoE: if num_local_experts > 1, MLP is 
replicated per expert + num_experts = getattr(hf_config, 'num_local_experts', 1) + if num_experts > 1: + # Router + replicated MLP experts (attention is shared) + router_params = hidden * num_experts + layer_params = qkv_params + attn_out + norm_params + \ + num_experts * mlp_params + router_params + + total_params = embed_params + layers * layer_params + # LM head (often tied to embedding, but count it for safety) + total_params += vocab * hidden + # Final layer norm + total_params += hidden + + return total_params + except Exception as e: + logger.warning(f'Could not estimate param count from config: {e}') + return None + + @staticmethod + def estimate_memory(param_count, precision_str, mode='training'): + """Estimate GPU memory required for a model. + + For training: weights + gradients + optimizer states (Adam uses 2x) = 4x multiplier. + For inference: weights only + overhead for runtime buffers = ~1.2x multiplier. + + Args: + param_count (int): Number of model parameters. + precision_str (str): Precision string ('float32', 'float16', 'bfloat16', 'fp16', 'fp32', 'int8'). + mode (str): 'training' or 'inference'. + + Returns: + tuple: (estimated_bytes, gpu_total_bytes, fits) where fits is True if + the model is estimated to fit in available memory. + """ + precision_lower = precision_str.lower() + if precision_lower in ('float16', 'fp16', 'bfloat16', 'bf16'): + bytes_per_param = 2 + elif precision_lower in ('int8', ): + bytes_per_param = 1 + else: + bytes_per_param = 4 + + if mode == 'training': + # weights + gradients + 2x Adam optimizer states = 4x + multiplier = 4 + else: + # inference: weights + runtime overhead (~20%) + multiplier = 1.2 + + estimated_bytes = int(param_count * bytes_per_param * multiplier) + + gpu_available = torch.cuda.is_available() + if not gpu_available: + try: + import psutil + sys_mem = psutil.virtual_memory().total + except ImportError: + logger.warning('psutil not installed — cannot check system memory. 
Skipping memory check.') + return 0, 0, True + max_gpu_mem = 80 * (1024**3) # 80GB — largest common single-GPU memory + effective_mem = min(sys_mem, max_gpu_mem) + fits = (estimated_bytes / effective_mem) < 0.85 + return estimated_bytes, effective_mem, fits + + gpu_mem = torch.cuda.get_device_properties(0).total_memory + # Use 85% threshold to leave headroom for activations, framework overhead, etc. + fits = (estimated_bytes / gpu_mem) < 0.85 + return estimated_bytes, gpu_mem, fits + + @staticmethod + def check_memory_fits(model_identifier, hf_config, precision_str, mode='training', token=None): + """Check if a model fits in GPU memory before downloading weights. + + Downloads only the config (few KB) via hf_config, estimates memory, and returns + whether the model fits. Use this before calling load_model() to avoid wasting + time downloading large models that won't fit. + + Args: + model_identifier (str): HF model ID (for logging). + hf_config: A HuggingFace PretrainedConfig object. + precision_str (str): Precision string ('float32', 'float16', etc.). + mode (str): 'training' or 'inference'. + token (str, optional): HF token (unused, kept for API consistency). + + Returns: + tuple: (fits, param_count_millions, estimated_gb, available_gb) + fits is True if model is estimated to fit. + """ + param_count = HuggingFaceModelLoader.estimate_param_count_from_config(hf_config) + if param_count is None: + logger.warning( + f'Could not estimate param count from config for {model_identifier}. ' + f'Proceeding with download — memory check skipped.' 
+ ) + return True, 0, 0, 0 + + estimated_bytes, available_bytes, fits = HuggingFaceModelLoader.estimate_memory( + param_count, precision_str, mode=mode + ) + + param_millions = param_count / 1e6 + estimated_gb = estimated_bytes / 1e9 + available_gb = available_bytes / 1e9 + + if fits: + logger.info( + f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need ' + f'~{estimated_gb:.1f}GB for {mode}, fits in available memory ({available_gb:.1f}GB).' + ) + else: + mem_type = 'GPU memory' if torch.cuda.is_available() else 'system RAM' + logger.error( + f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need ' + f'~{estimated_gb:.1f}GB for {mode} (weights' + f'{" + gradients + optimizer states" if mode == "training" else " + runtime overhead"}), ' + f'which exceeds available {mem_type} ({available_gb:.1f}GB). ' + f'Skipping benchmark. Use a smaller model variant or a machine with more memory.' + ) + + return fits, param_millions, estimated_gb, available_gb + + def __repr__(self) -> str: + """String representation of the loader.""" + token_status = 'authenticated' if self.token else 'no authentication' + return f"HuggingFaceModelLoader(cache_dir='{self.cache_dir}', {token_status})" diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py new file mode 100644 index 000000000..48af35962 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/model_source_config.py @@ -0,0 +1,89 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Configuration classes for model source and loading.""" + +from dataclasses import dataclass, field +from typing import Optional, Dict, Any, Tuple + + +@dataclass +class ModelSourceConfig: + """Configuration for model source and loading parameters. + + This class encapsulates all configuration needed to load a model + from either in-house definitions or Hugging Face Hub. 
+ + Attributes: + source: Source of the model ('in-house' or 'huggingface'). + identifier: Model name (in-house) or model ID (HuggingFace). + hf_token: Optional HuggingFace authentication token for private/gated models. + torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16'). + revision: Specific model version/commit/tag to use. + cache_dir: Directory to cache downloaded models. + device_map: Device mapping strategy for model loading. + use_auth_token: Deprecated, use hf_token instead. + additional_kwargs: Additional keyword arguments for model loading. + """ + + source: str = 'in-house' + identifier: str = '' + hf_token: Optional[str] = None + torch_dtype: str = 'float32' + revision: Optional[str] = None + cache_dir: Optional[str] = None + device_map: Optional[str] = None + use_auth_token: Optional[str] = None # Deprecated + additional_kwargs: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + """Post-initialization validation and normalization.""" + # Handle deprecated use_auth_token + if self.use_auth_token is not None and self.hf_token is None: + self.hf_token = self.use_auth_token + + # Normalize and validate source + self.source = self.source.lower() + if self.source not in ['in-house', 'huggingface']: + raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.") + + # Validate torch_dtype + valid_dtypes = ['float32', 'float16', 'bfloat16', 'int8'] + if self.torch_dtype not in valid_dtypes: + raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {valid_dtypes}.") + + # Validate identifier is provided + if not self.identifier: + raise ValueError('Model identifier must be provided.') + + def validate(self) -> Tuple[bool, str]: + """Validate configuration parameters. + + Returns: + Tuple of (is_valid, error_message). + If is_valid is True, error_message is empty. 
+ """ + # Check identifier is not empty for HuggingFace models + if self.source == 'huggingface': + if not self.identifier or not self.identifier.strip(): + return (False, 'HuggingFace model identifier cannot be empty') + + return (True, '') + + def is_huggingface(self) -> bool: + """Check if this configuration is for a HuggingFace model. + + Returns: + True if source is 'huggingface', False otherwise. + """ + return self.source == 'huggingface' + + def __repr__(self) -> str: + """String representation of the configuration.""" + token_status = 'set' if self.hf_token else 'not set' + return ( + f"ModelSourceConfig(source='{self.source}', " + f"identifier='{self.identifier}', " + f"torch_dtype='{self.torch_dtype}', " + f'hf_token={token_status})' + ) diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index a472af121..8caf95df9 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -14,6 +14,8 @@ from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, Platform, Precision from superbench.benchmarks.micro_benchmarks import MicroBenchmark +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader class ORTInferenceBenchmark(MicroBenchmark): @@ -96,6 +98,32 @@ def add_parser_arguments(self): help='The number of test step for benchmarking.', ) + # HuggingFace model arguments + self._parser.add_argument( + '--model_source', + type=str, + choices=['in-house', 'huggingface'], + default='in-house', + required=False, + help='Source of the model: inhouse (default) or huggingface.', + ) + + self._parser.add_argument( + '--model_identifier', + type=str, + default=None, + required=False, + help='Model identifier 
for HuggingFace models (e.g., bert-base-uncased).', + ) + + self._parser.add_argument( + '--seq_length', + type=int, + default=512, + required=False, + help='Sequence length for transformer models.', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -113,6 +141,11 @@ def _preprocess(self): 3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL, } + # Handle HuggingFace models if specified + if self._args.model_source == 'huggingface': + return self._preprocess_huggingface_models() + + # Original in-house model processing for model in self._args.pytorch_models: if hasattr(torchvision.models, model): data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \ @@ -136,11 +169,118 @@ def _preprocess(self): return True + def _preprocess_huggingface_models(self): + """Preprocess HuggingFace models for ONNX Runtime inference. + + Returns: + bool: True if preprocessing succeeds. + """ + import os + + if not self._args.model_identifier: + logger.error('--model_identifier is required when using --model_source huggingface') + return False + + try: + logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') + + # Step 1: Pre-download memory check — download config only (few KB) + from transformers import AutoConfig + hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') + load_kwargs = {} + if hf_token: + load_kwargs['token'] = hf_token + hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + + precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32' + fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token + ) + if not fits: + return False + + # Step 2: Proceed with model download and ONNX export + + # Get GPU rank to create unique file paths and avoid race conditions + # 
when multiple processes export the same model simultaneously + gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0') + proc_rank = os.getenv('PROC_RANK', gpu_rank) + + # Create model source config - load on CPU to avoid accelerate dispatching + # model across multiple GPUs which causes device mismatch during ONNX export + model_config = ModelSourceConfig( + source='huggingface', + identifier=self._args.model_identifier, + hf_token=hf_token, + torch_dtype=self._args.precision.value if self._args.precision != Precision.INT8 else 'float32', + device_map=None, + ) + + # Load model from HuggingFace on CPU + loader = HuggingFaceModelLoader() + hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') + from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter + exporter = torch2onnxExporter() + + model_name = self._args.model_identifier.replace('/', '_') + + # Prepare output path - use proc_rank subdirectory to avoid race conditions + # when multiple processes export the same model simultaneously + proc_output_path = self.__model_cache_path / f'rank_{proc_rank}' + proc_output_path.mkdir(parents=True, exist_ok=True) + + # For INT8, export as float32 first then quantize (matching in-house model behavior). + # For other precisions, include precision in the model name directly. 
+ if self._args.precision == Precision.INT8: + export_precision = Precision.FLOAT32.value + else: + export_precision = self._args.precision.value + model_name_with_precision = f'{model_name}.{export_precision}' + + # Export directly to final destination to avoid path issues with external data + onnx_path = exporter.export_huggingface_model( + model=hf_model, + model_name=model_name_with_precision, + batch_size=self._args.batch_size, + seq_length=self._args.seq_length, + output_dir=str(proc_output_path), + ) + + if not onnx_path: + logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + return False + + # Apply INT8 quantization if requested (matching in-house model behavior) + if self._args.precision == Precision.INT8: + from onnxruntime.quantization import quantize_dynamic + quantized_path = str(proc_output_path / f'{model_name}.{Precision.INT8.value}.onnx') + quantize_dynamic(onnx_path, quantized_path) + logger.info('Applied INT8 quantization to HuggingFace model') + + # Update model list and cache path for benchmarking + self._args.pytorch_models = [model_name] + self.__model_cache_path = proc_output_path + + logger.info('Successfully prepared HuggingFace model for ORT inference') + return True + + except Exception as e: + logger.error(f'Failed to prepare HuggingFace model: {str(e)}') + import traceback + logger.error(traceback.format_exc()) + return False + def _benchmark(self): """Implementation for benchmarking.""" import onnxruntime as ort precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'} + # Require CUDAExecutionProvider — this benchmark targets GPU inference + available = ort.get_available_providers() + if 'CUDAExecutionProvider' not in available: + logger.error(f'CUDAExecutionProvider is not available (available: {available}).') + return False + for model in self._args.pytorch_models: sess_options = ort.SessionOptions() sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level] @@ 
-177,15 +317,33 @@ def __inference(self, ort_sess): elapse_times (List[float]): latency of every iterations. """ precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32 - input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + + # Get input names from the ONNX session to determine input format + input_names = [input.name for input in ort_sess.get_inputs()] + + # Determine input format based on what the model expects + if 'pixel_values' in input_names: + # Vision model: use pixel_values (batch_size, 3, 224, 224) + pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'pixel_values': pixel_values} + elif 'input_ids' in input_names: + # NLP model: use input_ids and attention_mask + seq_len = getattr(self._args, 'seq_length', 512) + input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64) + attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64) + inputs = {'input_ids': input_ids, 'attention_mask': attention_mask} + else: + # Default for in-house torchvision models: use 'input' (batch_size, 3, 224, 224) + input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'input': input_tensor} for i in range(self._args.num_warmup): - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) elapse_times = list() for i in range(self._args.num_steps): start = time.time() - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) end = time.time() elapse_times.append((end - start) * 1000) diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 4d5a5b4b7..3d8fb80d7 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ 
-10,6 +10,8 @@ from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): @@ -71,6 +73,24 @@ def add_parser_arguments(self): help='Run at least N inference iterations.', ) + # HuggingFace model arguments + self._parser.add_argument( + '--model_source', + type=str, + choices=['in-house', 'huggingface'], + default='in-house', + required=False, + help='Source of the model: inhouse (default) or huggingface.', + ) + + self._parser.add_argument( + '--model_identifier', + type=str, + default=None, + required=False, + help='Model identifier for HuggingFace models (e.g., bert-base-uncased).', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. 
@@ -82,6 +102,11 @@ def _preprocess(self): self.__bin_path = str(Path(self._args.bin_dir) / self._bin_name) + # Handle HuggingFace models if specified + if self._args.model_source == 'huggingface': + return self._preprocess_huggingface_models() + + # Original in-house model processing exporter = torch2onnxExporter() for model in self._args.pytorch_models: if not (exporter.check_torchvision_model(model) or exporter.check_benchmark_model(model)): @@ -102,9 +127,8 @@ def _preprocess(self): # model options f'--onnx={onnx_model}', # build options - '--explicitBatch', f'--optShapes=input:{input_shape}', - '--workspace=8192', + '--memPoolSize=workspace:8192M', None if self._args.precision == 'fp32' else f'--{self._args.precision}', # inference options f'--iterations={self._args.iterations}', @@ -115,6 +139,134 @@ def _preprocess(self): return True + def _preprocess_huggingface_models(self): + """Preprocess HuggingFace models for TensorRT inference. + + Returns: + bool: True if preprocessing succeeds. + """ + import os + from transformers import AutoConfig + + if not self._args.model_identifier: + logger.error('--model_identifier is required when using --model_source huggingface') + return False + + try: + # Step 1: Pre-download memory check — download only the config (a few KB) + # and estimate whether the full model will fit in GPU memory. 
+ hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') + load_kwargs = {} + if hf_token: + load_kwargs['token'] = hf_token + + hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8' + fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token + ) + if not fits: + return False + + # Step 2: Download and load the full model + + # Get GPU rank to create unique file paths and avoid race conditions + # when multiple processes export the same model simultaneously + gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0') + proc_rank = os.getenv('PROC_RANK', gpu_rank) + + # Create model source config - load on CPU to avoid accelerate dispatching + # model across multiple GPUs which causes device mismatch during ONNX export. + # TensorRT handles precision internally via --fp16/--int8 flags, + # so the ONNX model is always exported in float32. 
+ model_config = ModelSourceConfig( + source='huggingface', + identifier=self._args.model_identifier, + hf_token=hf_token, + torch_dtype='float32', + device_map=None, + ) + + logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') + + # Load model from HuggingFace on CPU + loader = HuggingFaceModelLoader() + hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') + exporter = torch2onnxExporter() + + model_name = self._args.model_identifier.replace('/', '_') + + # Prepare output path - use proc_rank subdirectory to avoid race conditions + # when multiple processes export the same model simultaneously + output_dir = f'/tmp/tensorrt_onnx_rank_{proc_rank}' + os.makedirs(output_dir, exist_ok=True) + + onnx_path = exporter.export_huggingface_model( + model=hf_model, + model_name=model_name, + batch_size=self._args.batch_size, + seq_length=self._args.seq_length, + output_dir=output_dir, + ) + + if not onnx_path: + logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + return False + + # Determine input shape based on model type by checking ONNX file + import onnx as onnx_lib + onnx_model = onnx_lib.load(onnx_path) + + # Get the first input to determine shape and name + input_name = onnx_model.graph.input[0].name + + # Vision models typically have 4D input (batch, channels, height, width) + # NLP models typically have 2D input (batch, sequence) + if input_name == 'pixel_values' or len(onnx_model.graph.input[0].type.tensor_type.shape.dim) == 4: + # Vision model: batch x channels x height x width + input_shapes = f'{input_name}:{self._args.batch_size}x3x224x224' + else: + # NLP model: batch x sequence - need to specify all inputs with same batch and seq length + seq_len = getattr(self._args, 'seq_length', 512) + shapes_list = [] + for inp in onnx_model.graph.input: + inp_name = inp.name + num_dims = len(inp.type.tensor_type.shape.dim) + if num_dims == 2: + # Standard 2D input: batch x sequence + 
shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}') + elif num_dims == 4: + # 4D input (rare for NLP, but handle it) + shapes_list.append(f'{inp_name}:{self._args.batch_size}x1x{seq_len}x{seq_len}') + else: + # Default to 2D + shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}') + input_shapes = ','.join(shapes_list) + + # Build TensorRT command with correct input name + args = [ + self.__bin_path, + f'--onnx={onnx_path}', + f'--optShapes={input_shapes}', + '--memPoolSize=workspace:8192M', + None if self._args.precision == 'fp32' else f'--{self._args.precision}', + f'--iterations={self._args.iterations}', + '--percentile=99', + ] + self._commands.append(' '.join(filter(None, args))) + + # Store model name for result processing + self._args.pytorch_models = [self._args.model_identifier.replace('/', '_')] + + logger.info('Successfully prepared HuggingFace model for TensorRT inference') + return True + + except Exception as e: + logger.error(f'Failed to prepare HuggingFace model: {str(e)}') + import traceback + logger.error(traceback.format_exc()) + return False + def _process_raw_result(self, cmd_idx, raw_output): """Function to parse raw results and save the summarized results. diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py new file mode 100644 index 000000000..85a265cb0 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""End-to-end integration tests for HuggingFace model loading. + +These tests actually download and load models from HuggingFace Hub. +The test class is skipped according to ``@decorator.cuda_test``, and +``test_load_model_to_gpu`` is additionally skipped when +``torch.cuda.is_available()`` is false. 
+""" + +import pytest +import torch + +transformers = pytest.importorskip('transformers') + +from tests.helper import decorator +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +@decorator.cuda_test +class TestHuggingFaceE2E: + """End-to-end tests for HuggingFace model loading.""" + @pytest.fixture + def loader(self): + """Create a loader instance.""" + return HuggingFaceModelLoader(cache_dir='/tmp/hf_test_cache') + + def test_load_tiny_bert_model(self, loader): + """Test loading a tiny BERT model from HuggingFace Hub. + + Uses prajjwal1/bert-tiny which is a small public BERT model (~17MB). + """ + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'bert' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_distilgpt2_model(self, loader): + """Test loading DistilGPT2 model from HuggingFace Hub. + + Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB). 
+ """ + model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'gpt2' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_model_from_config(self, loader): + """Test loading model using ModelSourceConfig via load_model_from_config.""" + config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32') + + model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu') + + assert model is not None + assert hf_config.model_type == 'bert' + + def test_load_model_with_dtype(self, loader): + """Test loading model and converting dtype after load.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Convert to float32 after loading + model = model.float() + + # Check model parameters are float32 + param = next(model.parameters()) + assert param.dtype == torch.float32 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU') + def test_load_model_to_gpu(self, loader): + """Test loading model and moving to GPU.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Move to GPU manually + model = model.cuda() + + # Check model is on GPU + param = next(model.parameters()) + assert param.device.type == 'cuda' + + def test_architecture_detection(self, loader): + """Test that architecture is correctly detected from loaded model.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Architecture should be detected from config + assert config.model_type is not None + assert 'bert' in config.model_type.lower() diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py 
b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py new file mode 100644 index 000000000..1a1caf673 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Unit tests for HuggingFaceModelLoader.""" + +import pytest +import torch +from unittest.mock import MagicMock, patch + +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( + HuggingFaceModelLoader, + ModelNotFoundError, +) +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class TestHuggingFaceModelLoader: + """Test cases for HuggingFaceModelLoader class.""" + @pytest.fixture + def loader(self): + """Create a loader instance for testing.""" + return HuggingFaceModelLoader(cache_dir='/tmp/test_cache', token=None) + + def test_initialization(self, loader): + """Test loader initialization.""" + assert loader.cache_dir == '/tmp/test_cache' + assert loader.token is None + + def test_initialization_with_env_token(self, monkeypatch, tmp_path): + """Test loader picks up token from environment.""" + monkeypatch.setenv('HF_TOKEN', 'env_token') + monkeypatch.setenv('HF_HOME', str(tmp_path / 'hf_cache')) + loader = HuggingFaceModelLoader() + assert loader.token == 'env_token' + + def test_get_torch_dtype_valid(self, loader): + """Test torch dtype conversion.""" + assert loader._get_torch_dtype('float32') == torch.float32 + assert loader._get_torch_dtype('float16') == torch.float16 + assert loader._get_torch_dtype('fp16') == torch.float16 + assert loader._get_torch_dtype('bfloat16') == torch.bfloat16 + + def test_get_torch_dtype_invalid(self, loader): + """Test invalid dtype raises error.""" + with pytest.raises(ValueError, match='Invalid dtype'): + loader._get_torch_dtype('invalid_dtype') + + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel') + 
@patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer') + def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loader): + """Test successful model loading.""" + # Mock config + mock_cfg = MagicMock() + mock_cfg.model_type = 'bert' + mock_config.from_pretrained.return_value = mock_cfg + + # Mock model + mock_mdl = MagicMock() + mock_mdl.parameters.return_value = [torch.randn(100, 100)] + mock_mdl.to.return_value = mock_mdl + mock_model.from_pretrained.return_value = mock_mdl + + # Mock tokenizer + mock_tok = MagicMock() + mock_tokenizer.from_pretrained.return_value = mock_tok + + model, config, tokenizer = loader.load_model('test/model', device='cpu') + + assert model == mock_mdl + assert config == mock_cfg + assert tokenizer == mock_tok + + # Verify mocks were called with correct arguments + mock_config.from_pretrained.assert_called_once() + call_kwargs = mock_config.from_pretrained.call_args + assert call_kwargs[0][0] == 'test/model' + assert call_kwargs[1]['trust_remote_code'] is True + assert call_kwargs[1]['cache_dir'] == '/tmp/test_cache' + + mock_model.from_pretrained.assert_called_once() + model_call_kwargs = mock_model.from_pretrained.call_args + assert model_call_kwargs[1]['trust_remote_code'] is True + assert model_call_kwargs[1]['cache_dir'] == '/tmp/test_cache' + + mock_tokenizer.from_pretrained.assert_called_once() + + # Verify model was moved to the requested device + mock_mdl.to.assert_called_once_with('cpu') + + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig') + def test_load_model_not_found(self, mock_config, mock_model, mock_tokenizer, loader): + """Test loading non-existent model.""" + 
mock_config.from_pretrained.side_effect = OSError('404 Client Error') + + with pytest.raises(ModelNotFoundError, match='not found'): + loader.load_model('nonexistent/model') + + def test_load_model_from_config_invalid_source(self, loader): + """Test loading with invalid source in config.""" + config = ModelSourceConfig(source='in-house', identifier='bert-base') + + with pytest.raises(ValueError, match='Cannot load model'): + loader.load_model_from_config(config) + + def test_get_model_size(self, loader): + """Test model size calculation.""" + mock_model = MagicMock() + mock_model.parameters.return_value = [ + torch.randn(1000, 1000), # 1M params + torch.randn(500, 500), # 0.25M params + ] + + size = loader._get_model_size(mock_model) + assert abs(size - 1.25) < 0.01 # Should be ~1.25M diff --git a/tests/benchmarks/micro_benchmarks/test_model_source_config.py b/tests/benchmarks/micro_benchmarks/test_model_source_config.py new file mode 100644 index 000000000..9d9f7f35e --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_model_source_config.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Unit tests for ModelSourceConfig.""" + +import pytest +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class TestModelSourceConfig: + """Test cases for ModelSourceConfig class.""" + def test_default_config(self): + """Test default configuration.""" + config = ModelSourceConfig(identifier='bert-base') + assert config.source == 'in-house' + assert config.identifier == 'bert-base' + assert config.torch_dtype == 'float32' + assert config.hf_token is None + + def test_huggingface_config(self): + """Test HuggingFace configuration.""" + config = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf', torch_dtype='float16') + assert config.source == 'huggingface' + assert config.identifier == 'meta-llama/Llama-2-7b-hf' + assert config.torch_dtype == 'float16' + + def test_invalid_source(self): + """Test invalid source raises error.""" + with pytest.raises(ValueError, match='Invalid model source'): + ModelSourceConfig(source='invalid', identifier='test') + + def test_invalid_dtype(self): + """Test invalid dtype raises error.""" + with pytest.raises(ValueError, match='Invalid torch_dtype'): + ModelSourceConfig(identifier='test', torch_dtype='invalid') + + def test_missing_identifier(self): + """Test missing identifier raises error.""" + with pytest.raises(ValueError, match='identifier must be provided'): + ModelSourceConfig(identifier='') + + def test_validate_huggingface_empty(self): + """Test validation of empty HuggingFace model identifier.""" + config = ModelSourceConfig(source='huggingface', identifier=' ') + is_valid, message = config.validate() + assert not is_valid + assert 'cannot be empty' in message + + def test_validate_valid_huggingface(self): + """Test validation of valid HuggingFace model.""" + config = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf') + is_valid, message = config.validate() + assert is_valid + assert message == '' + + def 
test_validate_valid_huggingface_short_name(self): + """Test validation of valid HuggingFace model with short name (no org).""" + config = ModelSourceConfig(source='huggingface', identifier='bert-base-uncased') + is_valid, message = config.validate() + assert is_valid + assert message == '' + + def test_is_huggingface(self): + """Test is_huggingface method.""" + hf_config = ModelSourceConfig(source='huggingface', identifier='test/model') + inhouse_config = ModelSourceConfig(source='in-house', identifier='bert-base') + assert hf_config.is_huggingface() is True + assert inhouse_config.is_huggingface() is False + + def test_deprecated_use_auth_token(self): + """Test deprecated use_auth_token parameter.""" + config = ModelSourceConfig(identifier='test', use_auth_token='old_token') + assert config.hf_token == 'old_token'