Commit 5819a4a (2 parents: 7192a65 + b472089)
3 files changed
ML-Frameworks/pytorch-aarch64/examples/README.md
@@ -201,7 +201,7 @@ The script [torchchat_llm_text_gen.py](torchchat_llm_text_gen.py) demonstrates h
 To run inference using torchchat, call:
 
 ```
-LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python torchchat_llm_text_gen.py
+LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python torchchat_llm_text_gen.py --compile
 ```
 
 #### Command-Line Options
@@ -212,6 +212,9 @@ LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPE
 `--max-new-tokens`
  Description: Max new tokens to generate.
 
+`--compile`
+ Description: Whether to compile the model (default: `False`).
+
 `--model`
  Description: Model alias. (Default: `"llama2"` )
 
@@ -224,7 +227,7 @@ The script [transformers_llm_text_gen.py](transformers_llm_text_gen.py) demonstr
 
 
 
-LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python transformers_llm_text_gen.py
+LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPER=1 TORCHINDUCTOR_FREEZING=1 OMP_NUM_THREADS=16 python transformers_llm_text_gen.py --compile
 
 
 
@@ -235,6 +238,9 @@ LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4 TORCHINDUCTOR_CPP_WRAPPE
 
 
 
+`--compile`
+ Description: Whether to compile the model (default: `False`).
+
 `--model`
  Description: Local path to model repo or Hugging Face model ID. (Default: `"meta-llama/Llama-2-7b-hf"` )
 
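For context, here is a minimal sketch (not part of the commit) of launching the same example from Python with the environment the README commands set inline on the shell line. The tcmalloc path is the Ubuntu aarch64 location used above; paths and thread count are assumptions to adjust for your machine.

```
# Not from the commit: a sketch of running the example with the README's
# environment variables set programmatically. Paths and thread count are
# assumptions; adjust for your machine.
import os
import subprocess

env = dict(os.environ,
           LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc.so.4",
           TORCHINDUCTOR_CPP_WRAPPER="1",
           TORCHINDUCTOR_FREEZING="1",
           OMP_NUM_THREADS="16")

# LD_PRELOAD only takes effect at process start, so the interpreter that
# runs the example must be launched with it already in its environment.
subprocess.run(["python", "torchchat_llm_text_gen.py", "--compile"],
               env=env, check=True)
```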
ML-Frameworks/pytorch-aarch64/examples/torchchat_llm_text_gen.py
@@ -31,6 +31,8 @@ def main(args):
         "python3", torchchat_path, "generate", args.model,
         "--quantize", str(args.quant_config),
         "--prompt", prompt,
+        "--compile" if args.compile else "",
+        "--compile-prefill" if args.compile else "",
         "--max-autotune", "--max-new-tokens", str(args.max_new_tokens)
     ]
     command = [arg for arg in command if arg]
@@ -45,6 +47,8 @@ def main(args):
                         help='Path to json file for quantization config')
     parser.add_argument('--max-new-tokens', type=int,
                         default=64, help='New tokens to generate at decode.')
+    parser.add_argument('--compile', action='store_true',
+                        help='Whether to compile the model.')
     parser.add_argument('--model', type=str, default="llama2",
                         help='Torchchat supported model alias')
     parser.add_argument('--prompt', type=str, default="In a distant world where magic and technology coexist, "
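The command construction above relies on a small pattern: disabled flags become empty strings, and a final comprehension filters them out, so the list literal needs no branching. A self-contained sketch of that pattern, where the script name and flags are hypothetical stand-ins:

```
# Hypothetical stand-in names; illustrates the empty-string-then-filter
# pattern used by torchchat_llm_text_gen.py above.
def build_command(compile_model):
    command = [
        "python3", "generate.py",
        "--compile" if compile_model else "",
        "--compile-prefill" if compile_model else "",
        "--max-new-tokens", "64",
    ]
    # Drop the empty placeholders left behind by disabled flags.
    return [arg for arg in command if arg]

print(build_command(False))  # ['python3', 'generate.py', '--max-new-tokens', '64']
print(build_command(True))   # additionally includes the two compile flags
```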
ML-Frameworks/pytorch-aarch64/examples/transformers_llm_text_gen.py
@@ -130,6 +130,10 @@ def get_quantized_model(args):
     print("Quantizing model to 4 bit ..")
     quantize_model(model, "cpu", args.quant_config)
     model = model.eval()
+    if args.compile:
+        model.generation_config.cache_implementation = "static"
+        model.forward = torch.compile(
+            model.forward, backend='inductor', dynamic=True, fullgraph=True)
     return model, tokenizer, config
 
 
@@ -193,6 +197,8 @@ def main(args):
                         "gen_ai_utils/quant_configs/aarch64_cpu_channelwise.json", help='Path to json file for quantization config')
 
 
+    parser.add_argument('--compile', action='store_true',
+                        help='Whether to compile the model.')
     parser.add_argument('--model', type=Path, default=Path("meta-llama/Llama-2-7b-hf"),
                         help='Hugging Face model ID or Cloned model repository with model files')
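To show what the new compile path in get_quantized_model amounts to, here is a minimal standalone sketch. It assumes a small Llama-architecture checkpoint (TinyLlama) as a stand-in for the example's gated meta-llama/Llama-2-7b-hf default, and it skips the quantization step:

```
# Sketch only: mirrors the static-cache + torch.compile combination added
# above, on an assumed small stand-in checkpoint and without quantization.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # stand-in, not the commit's default
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).eval()

# A static KV cache keeps tensor shapes fixed across decode steps, so
# inductor can reuse one compiled graph instead of recompiling each step.
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(
    model.forward, backend='inductor', dynamic=True, fullgraph=True)

inputs = tokenizer("In a distant world where magic and technology coexist,",
                   return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

The first generate call pays the one-time compilation cost; subsequent calls with the same shapes reuse the compiled graph, which is where the speedup comes from.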