
Commit 583335e

vdwarak and nazneenn authored
Added support for Vision models (#141)
Co-authored-by: nazneenn <nazneen.nighar.sultana@intel.com>
1 parent a7916e5 commit 583335e

4 files changed, with 82 additions and 20 deletions


PyTorch/vLLM_Tutorials/Deploying_vLLM/README.md

Lines changed: 32 additions & 2 deletions
@@ -18,6 +18,8 @@ This folder contains scripts and configuration files that can be used to build a
 |Qwen/Qwen2.5-32B-Instruct |1|
 |Qwen/Qwen2.5-72B-Instruct |4|
 |Qwen/Qwen2.5-7B-Instruct |1|
+|meta-llama/Llama-3.2-11B-Vision-Instruct |1|
+|meta-llama/Llama-3.2-90B-Vision-Instruct |4|
 ## Quick Start
 To run these models on your Gaudi machine:
 

@@ -53,7 +55,7 @@ docker build -f Dockerfile-1.21.1-ub24-vllm-v0.7.2+Gaudi $BUILD_ARGS -t vllm-v0.
 > You can do this by adding parameters to the docker run command.
 > Example: "-e HF_HOME=/mnt/huggingface -v /mnt/huggingface:/mnt"
 
-5) Start the vLLM server with a default context of 4K and default TP from the table above
+5) Start the vLLM server with the default context length (4K for text models, 8K for vision models) and the default TP per the table above
 ```bash
 docker run -it --rm \
 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
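For clarity, here is how the Hugging Face cache settings from the note above would slot into the run command. This is an illustrative fragment only; the image tag and the remaining flags follow the full command shown in the README, and the host path is an example, not a requirement:

```bash
# Illustrative fragment; /mnt/huggingface is an example host path.
docker run -it --rm \
  -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy \
  -e HF_HOME=/mnt/huggingface -v /mnt/huggingface:/mnt \
  ...
```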
@@ -83,7 +85,7 @@ curl -s --noproxy '*' http://${target}:8000/v1/completions -H 'Content-Type: app
 </code>
 &nbsp;
 
-8) (Optional) Run the perftest.sh command in a **separate terminal** for obtaining basic metrics like the example below for Gaudi3:
+8.1) (Optional, for text-based models) Run the perftest.sh command in a **separate terminal** to obtain basic metrics like the example below for Gaudi3:
 ```bash
 docker exec vllm-server /root/scripts/perftest.sh
 ```
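Since the vision models are exercised through the chat endpoint rather than /v1/completions (as the new benchmark script below also does), a request against them looks roughly like the sketch below. The model name comes from the table above; the prompt and image URL are placeholders:

```bash
# Hypothetical vision request via the OpenAI-compatible chat API; the image URL is a placeholder.
curl -s --noproxy '*' http://${target}:8000/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "model": "meta-llama/Llama-3.2-11B-Vision-Instruct",
        "messages": [{
          "role": "user",
          "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/cat.jpg"}}
          ]
        }],
        "max_tokens": 128
      }'
```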
@@ -142,6 +144,34 @@ P90 ITL (ms): 61.32
 > OUTPUT_TOKENS=2048
 > CONCURRENT_REQUESTS=64
 
+8.2) (Optional, for vision models) Run the perftest_vision.sh command in a **separate terminal** to obtain basic metrics like the example below for Gaudi3:
+```bash
+docker exec vllm-server /root/scripts/perftest_vision.sh
+```
+<pre>
+# meta-llama/Llama-3.2-11B-Vision-Instruct
+============ Serving Benchmark Result ============
+Successful requests: 500
+Benchmark duration (s): 121.53
+Total input tokens: 31710
+Total generated tokens: 64000
+Request throughput (req/s): 4.11
+Output token throughput (tok/s): 526.63
+Total Token throughput (tok/s): 787.56
+---------------Time to First Token----------------
+Mean TTFT (ms): 5642.06
+Median TTFT (ms): 5589.81
+P90 TTFT (ms): 8825.33
+-----Time per Output Token (excl. 1st token)------
+Mean TPOT (ms): 74.14
+Median TPOT (ms): 72.15
+P90 TPOT (ms): 101.27
+---------------Inter-token Latency----------------
+Mean ITL (ms): 73.56
+Median ITL (ms): 34.46
+P90 ITL (ms): 88.77
+==================================================
+
 9) Optionally, you can run perftest.sh with custom parameters like so:
 ```bash
 ## Usage: docker exec vllm-server /root/scripts/perftest.sh <INPUT_TOKENS> <OUTPUT_TOKENS> <CONCURRENT_REQUESTS>
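As a quick consistency check on the vision benchmark above, the reported throughputs follow from the totals: 500 requests / 121.53 s ≈ 4.11 req/s, 64000 generated tokens / 121.53 s ≈ 526.6 tok/s, and (31710 + 64000) tokens / 121.53 s ≈ 787.6 tok/s.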
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+## Edit the following variables to test for alternate performance scenarios
+DATASET=$1
+NUM_PROMPTS=$2
+CONCURRENT_REQ=$3
+DATASET=${DATASET:-"lmarena-ai/vision-arena-bench-v0.1"}
+NUM_PROMPTS=${NUM_PROMPTS:-500}
+CONCURRENT_REQ=${CONCURRENT_REQ:-64}
+
+cd /root
+python3 vllm-fork/benchmarks/benchmark_serving.py \
+    --model $MODEL \
+    --base-url http://localhost:8000 \
+    --backend openai-chat \
+    --endpoint /v1/chat/completions \
+    --dataset-name hf \
+    --dataset-path $DATASET \
+    --hf-split train \
+    --num-prompts $NUM_PROMPTS \
+    --max-concurrency $CONCURRENT_REQ \
+    --metric-percentiles 90 \
+    2>&1 | tee -a perftest_dataset${DATASET}_prompts${NUM_PROMPTS}_user${CONCURRENT_REQ}.log
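The script above takes the dataset, prompt count, and concurrency as optional positional arguments and expects MODEL to already be set in the container environment (presumably exported when the vLLM server container is started). A hypothetical invocation with explicit values:

```bash
# Hypothetical invocation; arguments are DATASET, NUM_PROMPTS and CONCURRENT_REQ, in that order.
docker exec vllm-server /root/scripts/perftest_vision.sh \
    lmarena-ai/vision-arena-bench-v0.1 500 64
```

Note that the default DATASET value contains a slash, so the log path built in the tee call resolves into a subdirectory that may not exist; the benchmark itself still runs even if tee cannot create the log file.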
Lines changed: 18 additions & 16 deletions
@@ -1,16 +1,18 @@
-MODEL,INPUT,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING
-meta-llama/Llama-3.1-8B-Instruct,,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,131072,1,TRUE,FALSE,0
-meta-llama/Llama-3.1-70B-Instruct,,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
-meta-llama/Llama-3.3-70B-Instruct,,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
-meta-llama/Llama-3.2-1B-Instruct,,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,16,2048,8,32,2,131072,1,TRUE,FALSE,0
-meta-llama/Llama-3.2-3B-Instruct,,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,28,3072,8,24,2,131072,1,TRUE,FALSE,0
-mistralai/Mixtral-8x7B-Instruct-v0.1,,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0
-mistralai/Mixtral-8x22B-Instruct-v0.1,,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,56,6144,8,48,2,65536,1,TRUE,FALSE,0
-mistralai/Mistral-7B-Instruct-v0.2,,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0
-meta-llama/Llama-3.1-405B-Instruct,,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,126,16384,8,128,2,131072,1,TRUE,FALSE,0
-Qwen/Qwen2.5-14B-Instruct,,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,48,5120,8,40,2,32768,1,TRUE,FALSE,0
-deepseek-ai/DeepSeek-R1-Distill-Llama-70B,,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
-Qwen/Qwen2.5-32B-Instruct,,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0
-Qwen/Qwen2.5-72B-Instruct,,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,32768,1,TRUE,FALSE,0
-Qwen/Qwen2.5-7B-Instruct,,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0
-Qwen/Qwen2.5-32B-Instruct,,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0
+MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING
+meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,131072,1,TRUE,FALSE,0
+meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
+meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
+meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,16,2048,8,32,2,131072,1,TRUE,FALSE,0
+meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,28,3072,8,24,2,131072,1,TRUE,FALSE,0
+mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0
+mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,56,6144,8,48,2,65536,1,TRUE,FALSE,0
+mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,32,4096,8,32,2,32768,1,TRUE,FALSE,0
+meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,126,16384,8,128,2,131072,1,TRUE,FALSE,0
+Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,48,5120,8,40,2,32768,1,TRUE,FALSE,0
+deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,131072,1,TRUE,FALSE,0
+Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0
+Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,80,8192,8,64,2,32768,1,TRUE,FALSE,0
+Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,28,3584,4,28,2,32768,1,TRUE,FALSE,0
+Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,16,64,5120,8,40,2,32768,1,TRUE,FALSE,0
+meta-llama/Llama-3.2-11B-Vision-Instruct,1,8448,128,2,21340441670,2,2,19.87483507,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,40,4096,8,32,2,131072,1,TRUE,FALSE,0
+meta-llama/Llama-3.2-90B-Vision-Instruct,4,8448,512,2,177186710646,2,2,165.0179835,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,100,8192,8,64,2,131072,1,TRUE,FALSE,0
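This CSV is the per-model parameter table consumed by the auto-calculator; the new vision rows use an 8448-token MAX_MODEL_LEN and a MAX_NUM_PREFILL_SEQS of 1. A minimal sketch of looking up one model's row, assuming a local copy of the file (the filename below is a placeholder, not the repository's actual name):

```python
import csv

def load_model_settings(path: str, model: str) -> dict:
    """Return the settings row for `model` from the per-model CSV table."""
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row["MODEL"] == model:
                # Coerce a couple of commonly used numeric fields.
                row["TENSOR_PARALLEL_SIZE"] = int(row["TENSOR_PARALLEL_SIZE"])
                row["MAX_MODEL_LEN"] = int(row["MAX_MODEL_LEN"])
                return row
    raise KeyError(f"{model} not found in {path}")

# Example: the 11B vision row added by this commit (path is a placeholder).
settings = load_model_settings("model_settings.csv", "meta-llama/Llama-3.2-11B-Vision-Instruct")
print(settings["MAX_MODEL_LEN"])  # 8448 per the table above
```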

PyTorch/vLLM_Tutorials/Deploying_vLLM/vllm_autocalc.py

Lines changed: 9 additions & 2 deletions
@@ -20,7 +20,7 @@ def vllm_auto_calc(fd):
         print(f"Clamping TENSOR_PARALLEL_SIZE to {tensor_parallel_size_new}")
         fd['TENSOR_PARALLEL_SIZE'] = tensor_parallel_size_new
 
-    fd['MAX_MODEL_LEN'] = max(1, fd['MAX_MODEL_LEN'])
+    fd['MAX_MODEL_LEN'] = max(1, fd['MAX_MODEL_LEN'])
 
     if fd['TENSOR_PARALLEL_SIZE'] > 1:
         fd['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = True
@@ -134,10 +134,11 @@ def vllm_auto_calc(fd):
                               0.5)
     fd['KV_CACHE_MEM'] = (fd['USABLE_MEM'] * fd['GPU_MEM_UTILIZATION'] *
                           (1 - fd['VLLM_GRAPH_RESERVED_MEM']))
-
+
     if fd.get('MAX_NUM_SEQS') is None:
         fd['MAX_NUM_SEQS'] = (fd['TENSOR_PARALLEL_SIZE'] * fd['KV_CACHE_MEM'] /
                               fd['KV_CACHE_PER_SEQ'])
+        print("max num seq", fd['MAX_NUM_SEQS'])
         if DTYPE == 'fp8':
             fd['MAX_NUM_SEQS'] = (max(
                 1,
@@ -153,9 +154,15 @@ def vllm_auto_calc(fd):
             raise ValueError(
                 "Not enough memory for kv cache increase TENSOR_PARALLEL_SIZE "
                 "or reduce MAX_MODEL_LEN or increase bucket step")
+
+        if fd['MODEL'] in ['meta-llama/Llama-3.2-11B-Vision-Instruct', 'meta-llama/Llama-3.2-90B-Vision-Instruct']:
+            if fd['MAX_NUM_SEQS'] > 128:
+                fd['MAX_NUM_SEQS'] = 128
+                print(f"{fd['MODEL']} currently does not support max-num-seqs > 128, hence limiting the max-num-seqs to 128")
     else:
         fd['MAX_NUM_SEQS'] = max(1, fd['MAX_NUM_SEQS'])
 
+
     fd['VLLM_DECODE_BLOCK_BUCKET_MAX'] = max(
         128, math.ceil((fd['MAX_NUM_SEQS'] * fd['MAX_MODEL_LEN']) / 128))
     fd['VLLM_PROMPT_SEQ_BUCKET_MAX'] = fd['MAX_MODEL_LEN']
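The functional change in this hunk is a cap on the auto-calculated MAX_NUM_SEQS for the two Llama 3.2 Vision models. A simplified, standalone restatement of that cap (not the repository's actual function, which applies it inside vllm_auto_calc() alongside the KV-cache sizing):

```python
# Simplified restatement of the new cap; the real code operates on the fd settings dict.
VISION_MODELS = (
    'meta-llama/Llama-3.2-11B-Vision-Instruct',
    'meta-llama/Llama-3.2-90B-Vision-Instruct',
)

def cap_max_num_seqs(model: str, max_num_seqs: float) -> int:
    """Cap max-num-seqs at 128 for the Llama 3.2 Vision models; floor at 1 otherwise."""
    if model in VISION_MODELS and max_num_seqs > 128:
        print(f"{model} currently does not support max-num-seqs > 128, limiting to 128")
        return 128
    return max(1, int(max_num_seqs))
```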
