Skip to content

Commit cd2d5be

Browse files
authored
ensure efficient kascade is only used with fp16 datatype
1 parent 96a3e49 commit cd2d5be

3 files changed

Lines changed: 7 additions & 4 deletions

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
<p align="center">
22
<picture>
3-
<img alt="Kascade" src="assets/logo_kascade.png" height="20%" width="20%">
3+
<img alt="Kascade" src="assets/logo_kascade.png" height="25%" width="25%">
44
</picture>
55
</p>
66

@@ -72,6 +72,7 @@ python scripts/eval_script.py --model_name meta-llama/Meta-Llama-3.1-8B-Instruct
7272
```
7373

7474
**NOTE:** Currently for `efficient_kascade`, which uses our efficient kernels, only `tile_size` 32 is supported.
75+
**NOTE:** Currently, `efficient_kascade` supports only fp16. By default, models are loaded in the dtype specified in their model config. To run `efficient_kascade`, go to line 17 in `src/model_utils.py` and set `torch_dtype=torch.float16` when loading the model.
7576

7677
**NOTE**: To use multiple GPUs you can run `scripts/eval_script.py` with `accelerate launch` and take advantage of DDP for faster processing of queries. If you run into errors, fallback to single gpu runs.
7778

scripts/eval_script.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from datasets import load_dataset
1313
from transformers import set_seed
1414
from transformers.utils import is_flash_attn_2_available, is_flash_attn_3_available
15+
import torch
1516

1617
def main():
1718
# Parse the arguments
@@ -93,6 +94,9 @@ def main():
9394

9495
# Loop: models -> strategies -> subsets
9596
for strategy_name in args.strategies:
97+
if strategy_name == "efficient_kascade" and model.config.dtype != torch.float16:
98+
raise ValueError("Efficient Kascade strategy requires model to be in float16 precision. Please go to line 17 in src/model_utils.py and change torch_dtype=torch.float16 when loading the model for running with efficient_kascade.")
99+
96100
set_seed(args.seed) # Ensure reproducibility per run
97101

98102
# Create strategy

src/model_utils.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def get_tokenizer_and_model(model_name, attn_implementation, device):
1414
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
1515

1616
model_kwargs = {
17-
"torch_dtype": torch.float16,
17+
"torch_dtype": "auto",
1818
"attn_implementation": attn_implementation,
1919
"cache_dir": "/dev/shm",
2020
"pretrained_model_name_or_path": model_name,
@@ -33,8 +33,6 @@ def get_tokenizer_and_model(model_name, attn_implementation, device):
3333
single_gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
3434
if model_size_gb * 2 > single_gpu_mem_gb: # float16 is 2 bytes per parameter
3535
model_kwargs["device_map"] = "auto"
36-
if model_size_gb > 8:
37-
model_kwargs["torch_dtype"] = "auto"
3836
model = AutoModelForCausalLM.from_pretrained(**model_kwargs)
3937

4038
model.eval()

0 commit comments

Comments
 (0)