diff --git a/examples/benchmarks/ort_inference_performance.py b/examples/benchmarks/ort_inference_performance.py index 18bda2043..73c258387 100644 --- a/examples/benchmarks/ort_inference_performance.py +++ b/examples/benchmarks/ort_inference_performance.py @@ -4,13 +4,28 @@ """Micro benchmark example for ONNXRuntime inference performance. Commands to run: + In-house models: python3 examples/benchmarks/ort_inference_performance.py + python3 examples/benchmarks/ort_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/ort_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run ORT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context( 'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16' ) @@ -21,3 +36,57 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='float16', batch_size=32, seq_length=512): + """Run ORT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('float32', 'float16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length}' + ) + + logger.info(f'Running ORT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context('ort-inference', platform=Platform.CUDA, parameters=parameters) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='ORT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='float16', choices=['float32', 'float16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark(args.model_identifier, args.precision, args.batch_size, args.seq_length) + else: + run_inhouse_benchmark() diff --git a/examples/benchmarks/tensorrt_inference_performance.py b/examples/benchmarks/tensorrt_inference_performance.py index cacbf1177..bb48dfb3f 100644 --- a/examples/benchmarks/tensorrt_inference_performance.py +++ b/examples/benchmarks/tensorrt_inference_performance.py @@ -4,13 +4,28 @@ """Micro benchmark example for TensorRT inference performance. 
Commands to run: + In-house models: python3 examples/benchmarks/tensorrt_inference_performance.py + python3 examples/benchmarks/tensorrt_inference_performance.py --model_source in-house + + HuggingFace models: + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier bert-base-uncased + python3 examples/benchmarks/tensorrt_inference_performance.py \ + --model_source huggingface --model_identifier microsoft/resnet-50 + +Environment variables: + HF_TOKEN: HuggingFace token for gated models (optional) """ +import argparse + from superbench.benchmarks import BenchmarkRegistry, Platform from superbench.common.utils import logger -if __name__ == '__main__': + +def run_inhouse_benchmark(): + """Run TensorRT inference with in-house torchvision models.""" context = BenchmarkRegistry.create_benchmark_context('tensorrt-inference', platform=Platform.CUDA) benchmark = BenchmarkRegistry.launch_benchmark(context) if benchmark: @@ -19,3 +34,64 @@ benchmark.name, benchmark.return_code, benchmark.result ) ) + return benchmark + + +def run_huggingface_benchmark(model_identifier, precision='fp16', batch_size=32, seq_length=512, iterations=2048): + """Run TensorRT inference with a HuggingFace model. + + Args: + model_identifier: HuggingFace model ID (e.g., 'bert-base-uncased'). + precision: Inference precision ('fp32', 'fp16', 'int8'). + batch_size: Batch size for inference. + seq_length: Sequence length for transformer models. + iterations: Number of inference iterations. 
+ """ + parameters = ( + f'--model_source huggingface ' + f'--model_identifier {model_identifier} ' + f'--precision {precision} ' + f'--batch_size {batch_size} ' + f'--seq_length {seq_length} ' + f'--iterations {iterations}' + ) + + logger.info(f'Running TensorRT inference benchmark with HuggingFace model: {model_identifier}') + + context = BenchmarkRegistry.create_benchmark_context( + 'tensorrt-inference', platform=Platform.CUDA, parameters=parameters + ) + benchmark = BenchmarkRegistry.launch_benchmark(context) + if benchmark: + logger.info( + 'benchmark: {}, return code: {}, result: {}'.format( + benchmark.name, benchmark.return_code, benchmark.result + ) + ) + return benchmark + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='TensorRT inference benchmark') + parser.add_argument( + '--model_source', + type=str, + default='in-house', + choices=['in-house', 'huggingface'], + help='Source of the model: in-house (default) or huggingface' + ) + parser.add_argument( + '--model_identifier', type=str, default='bert-base-uncased', help='HuggingFace model identifier' + ) + parser.add_argument('--precision', type=str, default='fp16', choices=['fp32', 'fp16', 'int8']) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--seq_length', type=int, default=512) + parser.add_argument('--iterations', type=int, default=2048) + args = parser.parse_args() + + if args.model_source == 'huggingface': + run_huggingface_benchmark( + args.model_identifier, args.precision, args.batch_size, args.seq_length, args.iterations + ) + else: + run_inhouse_benchmark() diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 876d2ccfe..77a9629b8 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -9,22 +9,23 @@ import torch.hub import torch.onnx import 
torchvision.models -from transformers import BertConfig, GPT2Config, LlamaConfig -from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel -from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel +import traceback -if MixtralBenchmarkModel is not None: - from transformers import MixtralConfig +from superbench.common.utils import logger class torch2onnxExporter(): """PyTorch model to ONNX exporter.""" def __init__(self): """Constructor.""" + from transformers import BertConfig, GPT2Config, LlamaConfig + from superbench.benchmarks.model_benchmarks.pytorch_bert import BertBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import GPT2BenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_lstm import LSTMBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_llama import LlamaBenchmarkModel + from superbench.benchmarks.model_benchmarks.pytorch_mixtral import MixtralBenchmarkModel + self.num_classes = 100 self.lstm_input_size = 256 self.benchmark_models = { @@ -129,6 +130,7 @@ def __init__(self): # Only include Mixtral models if MixtralBenchmarkModel is available if MixtralBenchmarkModel is not None: + from transformers import MixtralConfig self.benchmark_models.update( { 'mixtral-8x7b': @@ -270,3 +272,151 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): del dummy_input torch.cuda.empty_cache() return file_name + + def export_huggingface_model(self, model, model_name, batch_size=1, seq_length=512, output_dir=None): + """Export a HuggingFace model to ONNX format. + + Args: + model: HuggingFace model instance to export. + model_name (str): Name for the exported ONNX model file. 
+ batch_size (int): Batch size of input. Defaults to 1. + seq_length (int): Sequence length of input. Defaults to 512. + output_dir (str): Output directory path. If None, uses default path. + + Returns: + str: Exported ONNX model file path, or empty string if export fails. + """ + try: + # Use custom output directory if provided + output_path = Path(output_dir) if output_dir else self._onnx_model_path + file_name = str(output_path / (model_name + '.onnx')) + + # Put model in eval mode and move to CUDA if available + model.eval() + + # Disable cache to avoid DynamicCache issues with ONNX export + if hasattr(model.config, 'use_cache'): + model.config.use_cache = False + + if torch.cuda.is_available(): + model = model.cuda() + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + # Get model's dtype for inputs + model_dtype = next(model.parameters()).dtype + + # Detect model type and create appropriate inputs + # Vision models use pixel_values, NLP models use input_ids + # Use HuggingFace's main_input_name property for automatic detection + main_input = getattr(model, 'main_input_name', 'input_ids') + is_vision_model = main_input == 'pixel_values' + + if is_vision_model: + # Vision models: use pixel_values (batch_size, channels, height, width) + # Standard ImageNet size is 224x224, 3 channels + # Match the dtype of the model + dummy_input = torch.randn(batch_size, 3, 224, 224, dtype=model_dtype, device=device) + input_names = ['pixel_values'] + dynamic_axes = {'pixel_values': {0: 'batch_size'}, 'output': {0: 'batch_size'}} + + # Wrapper for vision models + class VisionModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, pixel_values): + outputs = self.model(pixel_values=pixel_values) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else 
outputs + + wrapped_model = VisionModelWrapper(model) + export_args = (dummy_input, ) + else: + # NLP models: use input_ids and attention_mask + dummy_input = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + attention_mask = torch.ones((batch_size, seq_length), dtype=torch.int64, device=device) + input_names = ['input_ids', 'attention_mask'] + dynamic_axes = { + 'input_ids': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'attention_mask': { + 0: 'batch_size', + 1: 'seq_length' + }, + 'output': { + 0: 'batch_size' + }, + } + + # Wrapper for NLP models + class NLPModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + + def forward(self, input_ids, attention_mask): + outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) + if hasattr(outputs, 'logits'): + return outputs.logits + elif hasattr(outputs, 'last_hidden_state'): + return outputs.last_hidden_state + else: + return outputs[0] if isinstance(outputs, (tuple, list)) else outputs + + wrapped_model = NLPModelWrapper(model) + export_args = (dummy_input, attention_mask) + + # Export to ONNX for large models (>2GB), use external data format + model_size_gb = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024**3) + use_external_data = model_size_gb > 2.0 + + if use_external_data: + logger.info(f'Model size is {model_size_gb:.2f}GB, using external data format for ONNX export') + + torch.onnx.export( + wrapped_model, + export_args, + file_name, + opset_version=14, + do_constant_folding=True, + input_names=input_names, + output_names=['output'], + dynamic_axes=dynamic_axes, + ) + + # If using external data, convert to external data format + if use_external_data: + import onnx + from onnx.external_data_helper import convert_model_to_external_data + + onnx_model = onnx.load(file_name) + external_data_path = model_name + '_data.bin' + convert_model_to_external_data( + onnx_model, + all_tensors_to_one_file=True, + 
location=external_data_path, + size_threshold=1024, + convert_attribute=False + ) + onnx.save(onnx_model, file_name) + logger.info(f'Converted ONNX model to external data format: {external_data_path}') + + # Clean up + del dummy_input + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return file_name + + except Exception as e: + logger.error(f'Failed to export HuggingFace model to ONNX: {str(e)}') + logger.error(traceback.format_exc()) + return '' diff --git a/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py new file mode 100644 index 000000000..95ade4815 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/huggingface_model_loader.py @@ -0,0 +1,421 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Hugging Face model loader for benchmarking.""" + +import os +from pathlib import Path +from typing import Optional, Tuple + +import torch +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoConfig, + AutoTokenizer, + PreTrainedModel, + PretrainedConfig, +) + +from superbench.common.utils import logger +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class ModelLoadError(Exception): + """Exception raised when model loading fails.""" + pass + + +class ModelNotFoundError(ModelLoadError): + """Exception raised when model is not found.""" + pass + + +class ModelIncompatibleError(ModelLoadError): + """Exception raised when model is incompatible with ONNX export.""" + pass + + +class HuggingFaceModelLoader: + """Loads models from Hugging Face Hub for benchmarking. + + This class handles downloading, caching, and loading models from + Hugging Face Hub with support for authentication, device mapping, + and compatibility validation. + + Attributes: + cache_dir: Directory to cache downloaded models. + token: HuggingFace authentication token for private/gated models. 
+ """ + def __init__(self, cache_dir: Optional[str] = None, token: Optional[str] = None): + """Initialize the HuggingFace model loader. + + Args: + cache_dir: Directory to cache downloaded models. If None, uses HF default. + token: HuggingFace authentication token for private/gated models. + """ + self.cache_dir = cache_dir or os.getenv('HF_HOME') or os.path.expanduser('~/.cache/huggingface') + self.token = token or os.getenv('HF_TOKEN') or os.getenv('HUGGING_FACE_HUB_TOKEN') + + # Ensure cache directory exists + Path(self.cache_dir).mkdir(parents=True, exist_ok=True) + + logger.info(f'HuggingFaceModelLoader initialized with cache_dir: {self.cache_dir}') + if self.token: + logger.info('Authentication token provided for private/gated models (token not logged)') + + def load_model( + self, + model_identifier: str, + torch_dtype: Optional[str] = None, + device: str = 'cuda', + revision: Optional[str] = None, + device_map: Optional[str] = None, + config: Optional[PretrainedConfig] = None, + **kwargs + ) -> Tuple[PreTrainedModel, PretrainedConfig, AutoTokenizer]: + """Load a model from Hugging Face Hub. + + Args: + model_identifier: HF model ID (e.g., 'meta-llama/Llama-2-7b-hf'). + torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16'). + device: Device to load model on ('cuda', 'cpu'). + revision: Specific model version/commit/tag to use. + device_map: Device mapping strategy for large models. + config: Pre-downloaded model config. If None, downloads from Hub. + **kwargs: Additional arguments passed to from_pretrained(). + + Returns: + Tuple of (model, config, tokenizer). + + Raises: + ModelNotFoundError: If model doesn't exist on HF Hub. + ModelLoadError: If model loading fails for any reason. 
+ """ + logger.info(f'Loading model: {model_identifier}') + + try: + # Convert torch_dtype string to torch dtype + dtype = self._get_torch_dtype(torch_dtype) if torch_dtype else None + + # Prepare loading kwargs + load_kwargs = {'cache_dir': self.cache_dir, 'revision': revision, **kwargs} + + # Add token if available + if self.token: + load_kwargs['token'] = self.token + + # Add dtype if specified + if dtype: + load_kwargs['torch_dtype'] = dtype + + # Load config (use pre-downloaded config if provided) + if config is None: + logger.info('Loading model configuration...') + config = AutoConfig.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + else: + logger.info('Using pre-downloaded model configuration.') + + # Load tokenizer (may fail for some models, that's ok) + tokenizer = None + try: + logger.info('Loading tokenizer...') + tokenizer = AutoTokenizer.from_pretrained(model_identifier, trust_remote_code=True, **load_kwargs) + except Exception as e: + logger.warning(f'Could not load tokenizer: {e}. 
Continuing without tokenizer.') + + # Load model + logger.info(f'Loading model weights (dtype={torch_dtype}, device={device})...') + model_kwargs = load_kwargs.copy() + model_kwargs['trust_remote_code'] = True + + # Handle device mapping for large models + if device_map: + model_kwargs['device_map'] = device_map + elif device == 'cuda' and torch.cuda.is_available(): + # Don't set device_map if device is explicitly cuda + pass + elif device != 'cpu': + model_kwargs['device_map'] = device + + # Pass pre-downloaded config to from_pretrained so any overrides take effect + if config is not None: + model_kwargs['config'] = config + + try: + model = AutoModel.from_pretrained(model_identifier, **model_kwargs) + except ValueError: + logger.info('AutoModel failed, trying AutoModelForCausalLM...') + model = AutoModelForCausalLM.from_pretrained(model_identifier, **model_kwargs) + + # Move to device if not using device_map + if not device_map and device != 'auto': + model = model.to(device) + + logger.info( + f'Successfully loaded model: {model_identifier} ' + f'({self._get_model_size(model):.2f}M parameters)' + ) + + return model, config, tokenizer + + except OSError as e: + if 'not found' in str(e).lower() or '404' in str(e): + raise ModelNotFoundError( + f"Model '{model_identifier}' not found on Hugging Face Hub. " + f'Please check the model ID at https://huggingface.co/models' + ) from e + raise ModelLoadError(f"Failed to load model '{model_identifier}': {e}") from e + except Exception as e: + raise ModelLoadError(f"Unexpected error loading model '{model_identifier}': {e}") from e + + def load_model_from_config( + self, + config: ModelSourceConfig, + device: Optional[str] = None, + config_pretrained: Optional[PretrainedConfig] = None, + ) -> Tuple[PreTrainedModel, PretrainedConfig, AutoTokenizer]: + """Load a model using ModelSourceConfig. + + Args: + config: ModelSourceConfig instance with loading parameters. + device: Device to load model on. 
If None, uses CUDA when available, else CPU. + config_pretrained: Pre-downloaded HF model config. If provided, skips redundant download. + + Returns: + Tuple of (model, config, tokenizer). + + Raises: + ValueError: If config source is not 'huggingface'. + ModelLoadError: If model loading fails. + """ + if not config.is_huggingface(): + raise ValueError(f"Cannot load model with source '{config.source}'. Use 'huggingface' source.") + + # Validate config + is_valid, error = config.validate() + if not is_valid: + raise ValueError(f'Invalid configuration: {error}') + + if device is None: + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + # Extract loading parameters + return self.load_model( + model_identifier=config.identifier, + torch_dtype=config.torch_dtype, + device=device, + revision=config.revision, + device_map=config.device_map, + config=config_pretrained, + **config.additional_kwargs + ) + + def _get_torch_dtype(self, dtype_str: str) -> torch.dtype: + """Convert dtype string to torch.dtype. + + Args: + dtype_str: String representation of dtype ('float32', 'float16', etc.). + + Returns: + Corresponding torch.dtype. + + Raises: + ValueError: If dtype string is invalid. + """ + dtype_map = { + 'float32': torch.float32, + 'float16': torch.float16, + 'bfloat16': torch.bfloat16, + 'int8': torch.int8, + 'fp32': torch.float32, + 'fp16': torch.float16, + 'bf16': torch.bfloat16, + } + + if dtype_str.lower() not in dtype_map: + raise ValueError(f"Invalid dtype '{dtype_str}'. Must be one of {list(dtype_map.keys())}") + + return dtype_map[dtype_str.lower()] + + def _get_model_size(self, model: PreTrainedModel) -> float: + """Calculate model size in millions of parameters. + + Args: + model: The model to measure. + + Returns: + Number of parameters in millions. 
+ """ + return sum(p.numel() for p in model.parameters()) / 1_000_000 + + @staticmethod + def estimate_param_count_from_config(hf_config) -> Optional[int]: + """Estimate parameter count from a HuggingFace config without instantiating the model. + + This avoids allocating tens/hundreds of GB of CPU RAM for large models (e.g. 70B). + The estimate covers embedding + transformer layers + LM head for common architectures. + + Args: + hf_config: A HuggingFace PretrainedConfig object. + + Returns: + int: Estimated number of parameters, or None if estimation is not possible. + """ + try: + vocab = getattr(hf_config, 'vocab_size', 0) + hidden = getattr(hf_config, 'hidden_size', 0) + layers = getattr(hf_config, 'num_hidden_layers', 0) + intermediate = getattr(hf_config, 'intermediate_size', hidden * 4) + num_heads = getattr(hf_config, 'num_attention_heads', 0) + num_kv_heads = getattr(hf_config, 'num_key_value_heads', num_heads) + head_dim = hidden // num_heads if num_heads > 0 else 0 + + if vocab == 0 or hidden == 0 or layers == 0: + return None + + # Embeddings: token + (optional) position + max_pos = getattr(hf_config, 'max_position_embeddings', 0) + has_pos_embed = getattr(hf_config, 'position_embedding_type', None) not in ('rotary', None) + embed_params = vocab * hidden + if has_pos_embed and max_pos > 0: + embed_params += max_pos * hidden + + # Per transformer layer: + # Self-attention: Q, K, V projections + output projection + # MLP: gate_proj + up_proj + down_proj (LLaMA-style) or fc1 + fc2 + # Layer norms: 2 * hidden + qkv_params = (num_heads * head_dim + 2 * num_kv_heads * head_dim) * hidden + attn_out = hidden * hidden + # For gated MLPs (LLaMA/Mistral), there are 3 matrices; otherwise 2 + has_gate = getattr(hf_config, 'hidden_act', 'gelu') in ('silu', 'swiglu') + mlp_params = (3 if has_gate else 2) * hidden * intermediate + norm_params = 2 * hidden + layer_params = qkv_params + attn_out + mlp_params + norm_params + + # MoE: if num_local_experts > 1, MLP is 
replicated per expert + num_experts = getattr(hf_config, 'num_local_experts', 1) + if num_experts > 1: + # Router + replicated MLP experts (attention is shared) + router_params = hidden * num_experts + layer_params = qkv_params + attn_out + norm_params + \ + num_experts * mlp_params + router_params + + total_params = embed_params + layers * layer_params + # LM head (often tied to embedding, but count it for safety) + total_params += vocab * hidden + # Final layer norm + total_params += hidden + + return total_params + except Exception as e: + logger.warning(f'Could not estimate param count from config: {e}') + return None + + @staticmethod + def estimate_memory(param_count, precision_str, mode='training'): + """Estimate GPU memory required for a model. + + For training: weights + gradients + optimizer states (Adam uses 2x) = 4x multiplier. + For inference: weights only + overhead for runtime buffers = ~1.2x multiplier. + + Args: + param_count (int): Number of model parameters. + precision_str (str): Precision string ('float32', 'float16', 'bfloat16', 'fp16', 'fp32', 'int8'). + mode (str): 'training' or 'inference'. + + Returns: + tuple: (estimated_bytes, gpu_total_bytes, fits) where fits is True if + the model is estimated to fit in available memory. + """ + precision_lower = precision_str.lower() + if precision_lower in ('float16', 'fp16', 'bfloat16', 'bf16'): + bytes_per_param = 2 + elif precision_lower in ('int8', ): + bytes_per_param = 1 + else: + bytes_per_param = 4 + + if mode == 'training': + # weights + gradients + 2x Adam optimizer states = 4x + multiplier = 4 + else: + # inference: weights + runtime overhead (~20%) + multiplier = 1.2 + + estimated_bytes = int(param_count * bytes_per_param * multiplier) + + gpu_available = torch.cuda.is_available() + if not gpu_available: + try: + import psutil + sys_mem = psutil.virtual_memory().total + except ImportError: + logger.warning('psutil not installed — cannot check system memory. 
Skipping memory check.') + return 0, 0, True + max_gpu_mem = 80 * (1024**3) # 80GB — largest common single-GPU memory + effective_mem = min(sys_mem, max_gpu_mem) + fits = (estimated_bytes / effective_mem) < 0.85 + return estimated_bytes, effective_mem, fits + + gpu_mem = torch.cuda.get_device_properties(0).total_memory + # Use 85% threshold to leave headroom for activations, framework overhead, etc. + fits = (estimated_bytes / gpu_mem) < 0.85 + return estimated_bytes, gpu_mem, fits + + @staticmethod + def check_memory_fits(model_identifier, hf_config, precision_str, mode='training', token=None): + """Check if a model fits in GPU memory before downloading weights. + + Downloads only the config (few KB) via hf_config, estimates memory, and returns + whether the model fits. Use this before calling load_model() to avoid wasting + time downloading large models that won't fit. + + Args: + model_identifier (str): HF model ID (for logging). + hf_config: A HuggingFace PretrainedConfig object. + precision_str (str): Precision string ('float32', 'float16', etc.). + mode (str): 'training' or 'inference'. + token (str, optional): HF token (unused, kept for API consistency). + + Returns: + tuple: (fits, param_count_millions, estimated_gb, available_gb) + fits is True if model is estimated to fit. + """ + param_count = HuggingFaceModelLoader.estimate_param_count_from_config(hf_config) + if param_count is None: + logger.warning( + f'Could not estimate param count from config for {model_identifier}. ' + f'Proceeding with download — memory check skipped.' 
+ ) + return True, 0, 0, 0 + + estimated_bytes, available_bytes, fits = HuggingFaceModelLoader.estimate_memory( + param_count, precision_str, mode=mode + ) + + param_millions = param_count / 1e6 + estimated_gb = estimated_bytes / 1e9 + available_gb = available_bytes / 1e9 + + if fits: + logger.info( + f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need ' + f'~{estimated_gb:.1f}GB for {mode}, fits in available memory ({available_gb:.1f}GB).' + ) + else: + mem_type = 'GPU memory' if torch.cuda.is_available() else 'system RAM' + logger.error( + f'Model {model_identifier} ({param_millions:.1f}M params) estimated to need ' + f'~{estimated_gb:.1f}GB for {mode} (weights' + f'{" + gradients + optimizer states" if mode == "training" else " + runtime overhead"}), ' + f'which exceeds available {mem_type} ({available_gb:.1f}GB). ' + f'Skipping benchmark. Use a smaller model variant or a machine with more memory.' + ) + + return fits, param_millions, estimated_gb, available_gb + + def __repr__(self) -> str: + """String representation of the loader.""" + token_status = 'authenticated' if self.token else 'no authentication' + return f"HuggingFaceModelLoader(cache_dir='{self.cache_dir}', {token_status})" diff --git a/superbench/benchmarks/micro_benchmarks/model_source_config.py b/superbench/benchmarks/micro_benchmarks/model_source_config.py new file mode 100644 index 000000000..48af35962 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/model_source_config.py @@ -0,0 +1,89 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Configuration classes for model source and loading.""" + +from dataclasses import dataclass, field +from typing import Optional, Dict, Any, Tuple + + +@dataclass +class ModelSourceConfig: + """Configuration for model source and loading parameters. + + This class encapsulates all configuration needed to load a model + from either in-house definitions or Hugging Face Hub. 
+ + Attributes: + source: Source of the model ('in-house' or 'huggingface'). + identifier: Model name (in-house) or model ID (HuggingFace). + hf_token: Optional HuggingFace authentication token for private/gated models. + torch_dtype: Data type for model weights ('float32', 'float16', 'bfloat16'). + revision: Specific model version/commit/tag to use. + cache_dir: Directory to cache downloaded models. + device_map: Device mapping strategy for model loading. + use_auth_token: Deprecated, use hf_token instead. + additional_kwargs: Additional keyword arguments for model loading. + """ + + source: str = 'in-house' + identifier: str = '' + hf_token: Optional[str] = None + torch_dtype: str = 'float32' + revision: Optional[str] = None + cache_dir: Optional[str] = None + device_map: Optional[str] = None + use_auth_token: Optional[str] = None # Deprecated + additional_kwargs: Dict[str, Any] = field(default_factory=dict) + + def __post_init__(self): + """Post-initialization validation and normalization.""" + # Handle deprecated use_auth_token + if self.use_auth_token is not None and self.hf_token is None: + self.hf_token = self.use_auth_token + + # Normalize and validate source + self.source = self.source.lower() + if self.source not in ['in-house', 'huggingface']: + raise ValueError(f"Invalid model source '{self.source}'. Must be 'in-house' or 'huggingface'.") + + # Validate torch_dtype + valid_dtypes = ['float32', 'float16', 'bfloat16', 'int8'] + if self.torch_dtype not in valid_dtypes: + raise ValueError(f"Invalid torch_dtype '{self.torch_dtype}'. Must be one of {valid_dtypes}.") + + # Validate identifier is provided + if not self.identifier: + raise ValueError('Model identifier must be provided.') + + def validate(self) -> Tuple[bool, str]: + """Validate configuration parameters. + + Returns: + Tuple of (is_valid, error_message). + If is_valid is True, error_message is empty. 
+ """ + # Check identifier is not empty for HuggingFace models + if self.source == 'huggingface': + if not self.identifier or not self.identifier.strip(): + return (False, 'HuggingFace model identifier cannot be empty') + + return (True, '') + + def is_huggingface(self) -> bool: + """Check if this configuration is for a HuggingFace model. + + Returns: + True if source is 'huggingface', False otherwise. + """ + return self.source == 'huggingface' + + def __repr__(self) -> str: + """String representation of the configuration.""" + token_status = 'set' if self.hf_token else 'not set' + return ( + f"ModelSourceConfig(source='{self.source}', " + f"identifier='{self.identifier}', " + f"torch_dtype='{self.torch_dtype}', " + f'hf_token={token_status})' + ) diff --git a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py index a472af121..8caf95df9 100644 --- a/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/ort_inference_performance.py @@ -14,6 +14,8 @@ from superbench.common.utils import logger from superbench.benchmarks import BenchmarkRegistry, Platform, Precision from superbench.benchmarks.micro_benchmarks import MicroBenchmark +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader class ORTInferenceBenchmark(MicroBenchmark): @@ -96,6 +98,32 @@ def add_parser_arguments(self): help='The number of test step for benchmarking.', ) + # HuggingFace model arguments + self._parser.add_argument( + '--model_source', + type=str, + choices=['in-house', 'huggingface'], + default='in-house', + required=False, + help='Source of the model: inhouse (default) or huggingface.', + ) + + self._parser.add_argument( + '--model_identifier', + type=str, + default=None, + required=False, + help='Model identifier 
for HuggingFace models (e.g., bert-base-uncased).', + ) + + self._parser.add_argument( + '--seq_length', + type=int, + default=512, + required=False, + help='Sequence length for transformer models.', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. @@ -113,6 +141,11 @@ def _preprocess(self): 3: ort.GraphOptimizationLevel.ORT_ENABLE_ALL, } + # Handle HuggingFace models if specified + if self._args.model_source == 'huggingface': + return self._preprocess_huggingface_models() + + # Original in-house model processing for model in self._args.pytorch_models: if hasattr(torchvision.models, model): data_type = Precision.FLOAT16.value if self._args.precision == Precision.FLOAT16 \ @@ -136,11 +169,118 @@ def _preprocess(self): return True + def _preprocess_huggingface_models(self): + """Preprocess HuggingFace models for ONNX Runtime inference. + + Returns: + bool: True if preprocessing succeeds. + """ + import os + + if not self._args.model_identifier: + logger.error('--model_identifier is required when using --model_source huggingface') + return False + + try: + logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') + + # Step 1: Pre-download memory check — download config only (few KB) + from transformers import AutoConfig + hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') + load_kwargs = {} + if hf_token: + load_kwargs['token'] = hf_token + hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + + precision_str = self._args.precision.value if self._args.precision != Precision.INT8 else 'float32' + fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token + ) + if not fits: + return False + + # Step 2: Proceed with model download and ONNX export + + # Get GPU rank to create unique file paths and avoid race conditions + # 
when multiple processes export the same model simultaneously + gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0') + proc_rank = os.getenv('PROC_RANK', gpu_rank) + + # Create model source config - load on CPU to avoid accelerate dispatching + # model across multiple GPUs which causes device mismatch during ONNX export + model_config = ModelSourceConfig( + source='huggingface', + identifier=self._args.model_identifier, + hf_token=hf_token, + torch_dtype=self._args.precision.value if self._args.precision != Precision.INT8 else 'float32', + device_map=None, + ) + + # Load model from HuggingFace on CPU + loader = HuggingFaceModelLoader() + hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') + from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter + exporter = torch2onnxExporter() + + model_name = self._args.model_identifier.replace('/', '_') + + # Prepare output path - use proc_rank subdirectory to avoid race conditions + # when multiple processes export the same model simultaneously + proc_output_path = self.__model_cache_path / f'rank_{proc_rank}' + proc_output_path.mkdir(parents=True, exist_ok=True) + + # For INT8, export as float32 first then quantize (matching in-house model behavior). + # For other precisions, include precision in the model name directly. 
+ if self._args.precision == Precision.INT8: + export_precision = Precision.FLOAT32.value + else: + export_precision = self._args.precision.value + model_name_with_precision = f'{model_name}.{export_precision}' + + # Export directly to final destination to avoid path issues with external data + onnx_path = exporter.export_huggingface_model( + model=hf_model, + model_name=model_name_with_precision, + batch_size=self._args.batch_size, + seq_length=self._args.seq_length, + output_dir=str(proc_output_path), + ) + + if not onnx_path: + logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + return False + + # Apply INT8 quantization if requested (matching in-house model behavior) + if self._args.precision == Precision.INT8: + from onnxruntime.quantization import quantize_dynamic + quantized_path = str(proc_output_path / f'{model_name}.{Precision.INT8.value}.onnx') + quantize_dynamic(onnx_path, quantized_path) + logger.info('Applied INT8 quantization to HuggingFace model') + + # Update model list and cache path for benchmarking + self._args.pytorch_models = [model_name] + self.__model_cache_path = proc_output_path + + logger.info('Successfully prepared HuggingFace model for ORT inference') + return True + + except Exception as e: + logger.error(f'Failed to prepare HuggingFace model: {str(e)}') + import traceback + logger.error(traceback.format_exc()) + return False + def _benchmark(self): """Implementation for benchmarking.""" import onnxruntime as ort precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'} + # Require CUDAExecutionProvider — this benchmark targets GPU inference + available = ort.get_available_providers() + if 'CUDAExecutionProvider' not in available: + logger.error(f'CUDAExecutionProvider is not available (available: {available}).') + return False + for model in self._args.pytorch_models: sess_options = ort.SessionOptions() sess_options.graph_optimization_level = self.__graph_opt_level[self._args.graph_opt_level] @@ 
-177,15 +317,33 @@ def __inference(self, ort_sess): elapse_times (List[float]): latency of every iterations. """ precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32 - input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + + # Get input names from the ONNX session to determine input format + input_names = [input.name for input in ort_sess.get_inputs()] + + # Determine input format based on what the model expects + if 'pixel_values' in input_names: + # Vision model: use pixel_values (batch_size, 3, 224, 224) + pixel_values = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'pixel_values': pixel_values} + elif 'input_ids' in input_names: + # NLP model: use input_ids and attention_mask + seq_len = getattr(self._args, 'seq_length', 512) + input_ids = np.random.randint(0, 30000, (self._args.batch_size, seq_len)).astype(np.int64) + attention_mask = np.ones((self._args.batch_size, seq_len), dtype=np.int64) + inputs = {'input_ids': input_ids, 'attention_mask': attention_mask} + else: + # Default for in-house torchvision models: use 'input' (batch_size, 3, 224, 224) + input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) + inputs = {'input': input_tensor} for i in range(self._args.num_warmup): - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) elapse_times = list() for i in range(self._args.num_steps): start = time.time() - ort_sess.run(None, {'input': input_tensor}) + ort_sess.run(None, inputs) end = time.time() elapse_times.append((end - start) * 1000) diff --git a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py index 4d5a5b4b7..3d8fb80d7 100644 --- a/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py +++ b/superbench/benchmarks/micro_benchmarks/tensorrt_inference_performance.py @@ 
-10,6 +10,8 @@ from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke from superbench.benchmarks.micro_benchmarks._export_torch_to_onnx import torch2onnxExporter +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader class TensorRTInferenceBenchmark(MicroBenchmarkWithInvoke): @@ -71,6 +73,24 @@ def add_parser_arguments(self): help='Run at least N inference iterations.', ) + # HuggingFace model arguments + self._parser.add_argument( + '--model_source', + type=str, + choices=['in-house', 'huggingface'], + default='in-house', + required=False, + help='Source of the model: inhouse (default) or huggingface.', + ) + + self._parser.add_argument( + '--model_identifier', + type=str, + default=None, + required=False, + help='Model identifier for HuggingFace models (e.g., bert-base-uncased).', + ) + def _preprocess(self): """Preprocess/preparation operations before the benchmarking. 
@@ -82,6 +102,11 @@ def _preprocess(self): self.__bin_path = str(Path(self._args.bin_dir) / self._bin_name) + # Handle HuggingFace models if specified + if self._args.model_source == 'huggingface': + return self._preprocess_huggingface_models() + + # Original in-house model processing exporter = torch2onnxExporter() for model in self._args.pytorch_models: if not (exporter.check_torchvision_model(model) or exporter.check_benchmark_model(model)): @@ -102,9 +127,8 @@ def _preprocess(self): # model options f'--onnx={onnx_model}', # build options - '--explicitBatch', f'--optShapes=input:{input_shape}', - '--workspace=8192', + '--memPoolSize=workspace:8192M', None if self._args.precision == 'fp32' else f'--{self._args.precision}', # inference options f'--iterations={self._args.iterations}', @@ -115,6 +139,134 @@ def _preprocess(self): return True + def _preprocess_huggingface_models(self): + """Preprocess HuggingFace models for TensorRT inference. + + Returns: + bool: True if preprocessing succeeds. + """ + import os + from transformers import AutoConfig + + if not self._args.model_identifier: + logger.error('--model_identifier is required when using --model_source huggingface') + return False + + try: + # Step 1: Pre-download memory check — download only the config (a few KB) + # and estimate whether the full model will fit in GPU memory. 
+ hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN') + load_kwargs = {} + if hf_token: + load_kwargs['token'] = hf_token + + hf_config = AutoConfig.from_pretrained(self._args.model_identifier, trust_remote_code=True, **load_kwargs) + precision_str = self._args.precision # already a string: 'fp16', 'fp32', 'int8' + fits, param_m, est_gb, avail_gb = HuggingFaceModelLoader.check_memory_fits( + self._args.model_identifier, hf_config, precision_str, mode='inference', token=hf_token + ) + if not fits: + return False + + # Step 2: Download and load the full model + + # Get GPU rank to create unique file paths and avoid race conditions + # when multiple processes export the same model simultaneously + gpu_rank = os.getenv('CUDA_VISIBLE_DEVICES', '0') + proc_rank = os.getenv('PROC_RANK', gpu_rank) + + # Create model source config - load on CPU to avoid accelerate dispatching + # model across multiple GPUs which causes device mismatch during ONNX export. + # TensorRT handles precision internally via --fp16/--int8 flags, + # so the ONNX model is always exported in float32. 
+ model_config = ModelSourceConfig( + source='huggingface', + identifier=self._args.model_identifier, + hf_token=hf_token, + torch_dtype='float32', + device_map=None, + ) + + logger.info(f'Loading HuggingFace model: {self._args.model_identifier}') + + # Load model from HuggingFace on CPU + loader = HuggingFaceModelLoader() + hf_model, _, _ = loader.load_model_from_config(model_config, device='cpu') + exporter = torch2onnxExporter() + + model_name = self._args.model_identifier.replace('/', '_') + + # Prepare output path - use proc_rank subdirectory to avoid race conditions + # when multiple processes export the same model simultaneously + output_dir = f'/tmp/tensorrt_onnx_rank_{proc_rank}' + os.makedirs(output_dir, exist_ok=True) + + onnx_path = exporter.export_huggingface_model( + model=hf_model, + model_name=model_name, + batch_size=self._args.batch_size, + seq_length=self._args.seq_length, + output_dir=output_dir, + ) + + if not onnx_path: + logger.error(f'Failed to export {self._args.model_identifier} to ONNX') + return False + + # Determine input shape based on model type by checking ONNX file + import onnx as onnx_lib + onnx_model = onnx_lib.load(onnx_path) + + # Get the first input to determine shape and name + input_name = onnx_model.graph.input[0].name + + # Vision models typically have 4D input (batch, channels, height, width) + # NLP models typically have 2D input (batch, sequence) + if input_name == 'pixel_values' or len(onnx_model.graph.input[0].type.tensor_type.shape.dim) == 4: + # Vision model: batch x channels x height x width + input_shapes = f'{input_name}:{self._args.batch_size}x3x224x224' + else: + # NLP model: batch x sequence - need to specify all inputs with same batch and seq length + seq_len = getattr(self._args, 'seq_length', 512) + shapes_list = [] + for inp in onnx_model.graph.input: + inp_name = inp.name + num_dims = len(inp.type.tensor_type.shape.dim) + if num_dims == 2: + # Standard 2D input: batch x sequence + 
shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}') + elif num_dims == 4: + # 4D input (rare for NLP, but handle it) + shapes_list.append(f'{inp_name}:{self._args.batch_size}x1x{seq_len}x{seq_len}') + else: + # Default to 2D + shapes_list.append(f'{inp_name}:{self._args.batch_size}x{seq_len}') + input_shapes = ','.join(shapes_list) + + # Build TensorRT command with correct input name + args = [ + self.__bin_path, + f'--onnx={onnx_path}', + f'--optShapes={input_shapes}', + '--memPoolSize=workspace:8192M', + None if self._args.precision == 'fp32' else f'--{self._args.precision}', + f'--iterations={self._args.iterations}', + '--percentile=99', + ] + self._commands.append(' '.join(filter(None, args))) + + # Store model name for result processing + self._args.pytorch_models = [self._args.model_identifier.replace('/', '_')] + + logger.info('Successfully prepared HuggingFace model for TensorRT inference') + return True + + except Exception as e: + logger.error(f'Failed to prepare HuggingFace model: {str(e)}') + import traceback + logger.error(traceback.format_exc()) + return False + def _process_raw_result(self, cmd_idx, raw_output): """Function to parse raw results and save the summarized results. diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py new file mode 100644 index 000000000..85a265cb0 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_e2e.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""End-to-end integration tests for HuggingFace model loading. + +These tests actually download and load models from HuggingFace Hub. +The test class is skipped according to ``@decorator.cuda_test``, and +``test_load_model_to_gpu`` is additionally skipped when +``torch.cuda.is_available()`` is false. 
+""" + +import pytest +import torch + +transformers = pytest.importorskip('transformers') + +from tests.helper import decorator +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import HuggingFaceModelLoader +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +@decorator.cuda_test +class TestHuggingFaceE2E: + """End-to-end tests for HuggingFace model loading.""" + @pytest.fixture + def loader(self): + """Create a loader instance.""" + return HuggingFaceModelLoader(cache_dir='/tmp/hf_test_cache') + + def test_load_tiny_bert_model(self, loader): + """Test loading a tiny BERT model from HuggingFace Hub. + + Uses prajjwal1/bert-tiny which is a small public BERT model (~17MB). + """ + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'bert' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_distilgpt2_model(self, loader): + """Test loading DistilGPT2 model from HuggingFace Hub. + + Uses distilbert/distilgpt2 which is a small public GPT-2 model (~82MB). 
+ """ + model, config, tokenizer = loader.load_model('distilbert/distilgpt2', device='cpu') + + assert model is not None + assert config is not None + assert config.model_type == 'gpt2' + + # Verify model can do a forward pass + dummy_input = torch.randint(0, 1000, (1, 10)) + with torch.no_grad(): + output = model(dummy_input) + assert output is not None + + def test_load_model_from_config(self, loader): + """Test loading model using ModelSourceConfig via load_model_from_config.""" + config = ModelSourceConfig(source='huggingface', identifier='prajjwal1/bert-tiny', torch_dtype='float32') + + model, hf_config, tokenizer = loader.load_model_from_config(config, device='cpu') + + assert model is not None + assert hf_config.model_type == 'bert' + + def test_load_model_with_dtype(self, loader): + """Test loading model and converting dtype after load.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Convert to float32 after loading + model = model.float() + + # Check model parameters are float32 + param = next(model.parameters()) + assert param.dtype == torch.float32 + + @pytest.mark.skipif(not torch.cuda.is_available(), reason='Requires GPU') + def test_load_model_to_gpu(self, loader): + """Test loading model and moving to GPU.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Move to GPU manually + model = model.cuda() + + # Check model is on GPU + param = next(model.parameters()) + assert param.device.type == 'cuda' + + def test_architecture_detection(self, loader): + """Test that architecture is correctly detected from loaded model.""" + model, config, tokenizer = loader.load_model('prajjwal1/bert-tiny', device='cpu') + + # Architecture should be detected from config + assert config.model_type is not None + assert 'bert' in config.model_type.lower() diff --git a/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py 
b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py new file mode 100644 index 000000000..1a1caf673 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_huggingface_loader.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Unit tests for HuggingFaceModelLoader.""" + +import pytest +import torch +from unittest.mock import MagicMock, patch + +from superbench.benchmarks.micro_benchmarks.huggingface_model_loader import ( + HuggingFaceModelLoader, + ModelNotFoundError, +) +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class TestHuggingFaceModelLoader: + """Test cases for HuggingFaceModelLoader class.""" + @pytest.fixture + def loader(self): + """Create a loader instance for testing.""" + return HuggingFaceModelLoader(cache_dir='/tmp/test_cache', token=None) + + def test_initialization(self, loader): + """Test loader initialization.""" + assert loader.cache_dir == '/tmp/test_cache' + assert loader.token is None + + def test_initialization_with_env_token(self, monkeypatch, tmp_path): + """Test loader picks up token from environment.""" + monkeypatch.setenv('HF_TOKEN', 'env_token') + monkeypatch.setenv('HF_HOME', str(tmp_path / 'hf_cache')) + loader = HuggingFaceModelLoader() + assert loader.token == 'env_token' + + def test_get_torch_dtype_valid(self, loader): + """Test torch dtype conversion.""" + assert loader._get_torch_dtype('float32') == torch.float32 + assert loader._get_torch_dtype('float16') == torch.float16 + assert loader._get_torch_dtype('fp16') == torch.float16 + assert loader._get_torch_dtype('bfloat16') == torch.bfloat16 + + def test_get_torch_dtype_invalid(self, loader): + """Test invalid dtype raises error.""" + with pytest.raises(ValueError, match='Invalid dtype'): + loader._get_torch_dtype('invalid_dtype') + + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel') + 
@patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer') + def test_load_model_success(self, mock_tokenizer, mock_config, mock_model, loader): + """Test successful model loading.""" + # Mock config + mock_cfg = MagicMock() + mock_cfg.model_type = 'bert' + mock_config.from_pretrained.return_value = mock_cfg + + # Mock model + mock_mdl = MagicMock() + mock_mdl.parameters.return_value = [torch.randn(100, 100)] + mock_mdl.to.return_value = mock_mdl + mock_model.from_pretrained.return_value = mock_mdl + + # Mock tokenizer + mock_tok = MagicMock() + mock_tokenizer.from_pretrained.return_value = mock_tok + + model, config, tokenizer = loader.load_model('test/model', device='cpu') + + assert model == mock_mdl + assert config == mock_cfg + assert tokenizer == mock_tok + + # Verify mocks were called with correct arguments + mock_config.from_pretrained.assert_called_once() + call_kwargs = mock_config.from_pretrained.call_args + assert call_kwargs[0][0] == 'test/model' + assert call_kwargs[1]['trust_remote_code'] is True + assert call_kwargs[1]['cache_dir'] == '/tmp/test_cache' + + mock_model.from_pretrained.assert_called_once() + model_call_kwargs = mock_model.from_pretrained.call_args + assert model_call_kwargs[1]['trust_remote_code'] is True + assert model_call_kwargs[1]['cache_dir'] == '/tmp/test_cache' + + mock_tokenizer.from_pretrained.assert_called_once() + + # Verify model was moved to the requested device + mock_mdl.to.assert_called_once_with('cpu') + + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoTokenizer') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoModel') + @patch('superbench.benchmarks.micro_benchmarks.huggingface_model_loader.AutoConfig') + def test_load_model_not_found(self, mock_config, mock_model, mock_tokenizer, loader): + """Test loading non-existent model.""" + 
mock_config.from_pretrained.side_effect = OSError('404 Client Error') + + with pytest.raises(ModelNotFoundError, match='not found'): + loader.load_model('nonexistent/model') + + def test_load_model_from_config_invalid_source(self, loader): + """Test loading with invalid source in config.""" + config = ModelSourceConfig(source='in-house', identifier='bert-base') + + with pytest.raises(ValueError, match='Cannot load model'): + loader.load_model_from_config(config) + + def test_get_model_size(self, loader): + """Test model size calculation.""" + mock_model = MagicMock() + mock_model.parameters.return_value = [ + torch.randn(1000, 1000), # 1M params + torch.randn(500, 500), # 0.25M params + ] + + size = loader._get_model_size(mock_model) + assert abs(size - 1.25) < 0.01 # Should be ~1.25M diff --git a/tests/benchmarks/micro_benchmarks/test_model_source_config.py b/tests/benchmarks/micro_benchmarks/test_model_source_config.py new file mode 100644 index 000000000..9d9f7f35e --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_model_source_config.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Unit tests for ModelSourceConfig.""" + +import pytest +from superbench.benchmarks.micro_benchmarks.model_source_config import ModelSourceConfig + + +class TestModelSourceConfig: + """Test cases for ModelSourceConfig class.""" + def test_default_config(self): + """Test default configuration.""" + config = ModelSourceConfig(identifier='bert-base') + assert config.source == 'in-house' + assert config.identifier == 'bert-base' + assert config.torch_dtype == 'float32' + assert config.hf_token is None + + def test_huggingface_config(self): + """Test HuggingFace configuration.""" + config = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf', torch_dtype='float16') + assert config.source == 'huggingface' + assert config.identifier == 'meta-llama/Llama-2-7b-hf' + assert config.torch_dtype == 'float16' + + def test_invalid_source(self): + """Test invalid source raises error.""" + with pytest.raises(ValueError, match='Invalid model source'): + ModelSourceConfig(source='invalid', identifier='test') + + def test_invalid_dtype(self): + """Test invalid dtype raises error.""" + with pytest.raises(ValueError, match='Invalid torch_dtype'): + ModelSourceConfig(identifier='test', torch_dtype='invalid') + + def test_missing_identifier(self): + """Test missing identifier raises error.""" + with pytest.raises(ValueError, match='identifier must be provided'): + ModelSourceConfig(identifier='') + + def test_validate_huggingface_empty(self): + """Test validation of empty HuggingFace model identifier.""" + config = ModelSourceConfig(source='huggingface', identifier=' ') + is_valid, message = config.validate() + assert not is_valid + assert 'cannot be empty' in message + + def test_validate_valid_huggingface(self): + """Test validation of valid HuggingFace model.""" + config = ModelSourceConfig(source='huggingface', identifier='meta-llama/Llama-2-7b-hf') + is_valid, message = config.validate() + assert is_valid + assert message == '' + + def 
test_validate_valid_huggingface_short_name(self): + """Test validation of valid HuggingFace model with short name (no org).""" + config = ModelSourceConfig(source='huggingface', identifier='bert-base-uncased') + is_valid, message = config.validate() + assert is_valid + assert message == '' + + def test_is_huggingface(self): + """Test is_huggingface method.""" + hf_config = ModelSourceConfig(source='huggingface', identifier='test/model') + inhouse_config = ModelSourceConfig(source='in-house', identifier='bert-base') + assert hf_config.is_huggingface() is True + assert inhouse_config.is_huggingface() is False + + def test_deprecated_use_auth_token(self): + """Test deprecated use_auth_token parameter.""" + config = ModelSourceConfig(identifier='test', use_auth_token='old_token') + assert config.hf_token == 'old_token'