Skip to content

Commit 1897798

Browse files
authored
llama3.1 and internvl optimal configuration (#90)
1 parent b5ca01f commit 1897798

3 files changed

Lines changed: 61 additions & 32 deletions

File tree

language_models/llama3_8b/app.py

Lines changed: 50 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -6,25 +6,37 @@
66
from transformers import AutoModelForCausalLM, AutoTokenizer
77

88
# Model parameters
9-
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
9+
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
1010
MAX_LENGTH = 512
11-
TEMPERATURE = 1.0
12-
TOP_P = 0.95
13-
TOP_K = 40
14-
REPETITION_PENALTY = 1.0
15-
NO_REPEAT_NGRAM_SIZE = 0
16-
DO_SAMPLE = True
11+
TEMPERATURE = 0.7
12+
TOP_P = 0.9
13+
TOP_K = 50
14+
REPETITION_PENALTY = 1.05
15+
NO_REPEAT_NGRAM_SIZE = 2
16+
DO_SAMPLE = True
17+
NUM_BEAMS = 1
18+
EARLY_STOPPING = True
1719

18-
CACHE_PATH = "./cached_models"
20+
BEAM_VOLUME_PATH = "./cached_models"
1921

2022

2123
# This runs once when the container first starts
2224
def load_models():
23-
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=CACHE_PATH)
25+
tokenizer = AutoTokenizer.from_pretrained(
26+
MODEL_NAME,
27+
cache_dir=BEAM_VOLUME_PATH,
28+
padding_side='left'
29+
)
2430
tokenizer.pad_token = tokenizer.eos_token
2531
model = AutoModelForCausalLM.from_pretrained(
26-
MODEL_NAME, device_map="auto", torch_dtype=torch.float16, cache_dir=CACHE_PATH
32+
MODEL_NAME,
33+
device_map="auto",
34+
torch_dtype=torch.float16,
35+
cache_dir=BEAM_VOLUME_PATH,
36+
use_cache=True,
37+
low_cpu_mem_usage=True
2738
)
39+
model.eval()
2840
return model, tokenizer
2941

3042

@@ -38,22 +50,25 @@ def load_models():
3850
"huggingface_hub[hf-transfer]",
3951
]
4052
)
41-
.with_envs("HF_HUB_ENABLE_HF_TRANSFER=1")
53+
.with_envs({
54+
"HF_HUB_ENABLE_HF_TRANSFER": "1",
55+
"TOKENIZERS_PARALLELISM": "false",
56+
"CUDA_VISIBLE_DEVICES": "0",
57+
})
4258
)
4359

4460

4561
@endpoint(
4662
secrets=["HF_TOKEN"],
4763
on_start=load_models,
48-
name="meta-llama-3-8b-instruct",
64+
name="meta-llama-3.1-8b-instruct",
4965
cpu=2,
50-
memory="32Gi",
51-
gpu_count=2,
66+
memory="16Gi",
5267
gpu="A10G",
5368
volumes=[
5469
Volume(
5570
name="cached_models",
56-
mount_path=CACHE_PATH,
71+
mount_path=BEAM_VOLUME_PATH,
5772
)
5873
],
5974
image=image,
@@ -68,30 +83,43 @@ def generate_text(context, **inputs):
6883
return {"error": "Please provide messages for text generation."}
6984

7085
generate_args = {
71-
"max_length": inputs.get("max_tokens", MAX_LENGTH),
86+
"max_new_tokens": inputs.get("max_tokens", MAX_LENGTH),
7287
"temperature": inputs.get("temperature", TEMPERATURE),
7388
"top_p": inputs.get("top_p", TOP_P),
7489
"top_k": inputs.get("top_k", TOP_K),
7590
"repetition_penalty": inputs.get("repetition_penalty", REPETITION_PENALTY),
7691
"no_repeat_ngram_size": inputs.get(
7792
"no_repeat_ngram_size", NO_REPEAT_NGRAM_SIZE
7893
),
94+
"num_beams": inputs.get("num_beams", NUM_BEAMS),
95+
"early_stopping": inputs.get("early_stopping", EARLY_STOPPING),
7996
"do_sample": inputs.get("do_sample", DO_SAMPLE),
8097
"use_cache": True,
8198
"eos_token_id": tokenizer.eos_token_id,
8299
"pad_token_id": tokenizer.pad_token_id,
83100
}
84101

85-
model_inputs = tokenizer.apply_chat_template(
102+
model_inputs_str = tokenizer.apply_chat_template(
86103
messages, tokenize=False, add_generation_prompt=True
87104
)
88-
inputs = tokenizer(model_inputs, return_tensors="pt", padding=True)
89-
input_ids = inputs["input_ids"].to("cuda")
90-
attention_mask = inputs["attention_mask"].to("cuda")
105+
106+
tokenized_inputs = tokenizer(
107+
model_inputs_str,
108+
return_tensors="pt",
109+
padding=True,
110+
truncation=True,
111+
max_length=2048
112+
)
113+
input_ids = tokenized_inputs["input_ids"].to("cuda")
114+
attention_mask = tokenized_inputs["attention_mask"].to("cuda")
115+
input_ids_length = input_ids.shape[-1]
91116

92117
with torch.no_grad():
93118
outputs = model.generate(
94-
input_ids=input_ids, attention_mask=attention_mask, **generate_args
119+
input_ids=input_ids,
120+
attention_mask=attention_mask,
121+
**generate_args
95122
)
96-
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
123+
new_tokens = outputs[0][input_ids_length:]
124+
output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
97125
return {"output": output_text}

vllm/chat.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def process_user_input(
8686
self, user_input: str, img_link: Optional[str] = None, stream: bool = False
8787
) -> str:
8888
"""Process user input and return assistant's response."""
89-
if self.model == "OpenGVLab/InternVL2_5-8B" and img_link:
89+
if self.model == "OpenGVLab/InternVL3-8B-AWQ" and img_link:
9090
self.conversation_history.append(
9191
{
9292
"role": "user",
@@ -178,7 +178,7 @@ def chat() -> None:
178178

179179
# Handle image input for vision models
180180
img_link = None
181-
if model == "OpenGVLab/InternVL2_5-8B":
181+
if model == "OpenGVLab/InternVL3-8B-AWQ":
182182
img_link = Prompt.ask(
183183
"[bold yellow]Image link (press enter to skip)[/bold yellow]"
184184
)

vllm/models.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,28 @@
11
from beam.integrations import VLLM, VLLMArgs
22
from beam import Image
33

4-
INTERNVL2_5 = "OpenGVLab/InternVL2_5-8B"
4+
INTERNVL3_AWQ = "OpenGVLab/InternVL3-8B-AWQ"
55
YI_CODER_CHAT = "01-ai/Yi-Coder-9B-Chat"
66
MISTRAL_INSTRUCT = "mistralai/Mistral-7B-Instruct-v0.3"
77
DEEPSEEK_R1 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
88

99
internvl = VLLM(
10-
name=INTERNVL2_5.split("/")[-1],
11-
cpu=8,
12-
memory="32Gi",
10+
name=INTERNVL3_AWQ.split("/")[-1],
11+
cpu=4,
12+
memory="16Gi",
1313
gpu="A10G",
14-
gpu_count=2,
14+
gpu_count=1,
1515
image=(Image(python_version="python3.12")).add_python_packages(
1616
["vllm==0.6.4.post1"]
1717
),
1818
vllm_args=VLLMArgs(
19-
model=INTERNVL2_5,
20-
served_model_name=[INTERNVL2_5],
19+
model=INTERNVL3_AWQ,
20+
served_model_name=[INTERNVL3_AWQ],
2121
trust_remote_code=True,
2222
max_model_len=4096,
23-
gpu_memory_utilization=0.95,
23+
gpu_memory_utilization=0.90,
2424
limit_mm_per_prompt={"image": 2},
25+
quantization="awq",
2526
),
2627
)
2728

0 commit comments

Comments
 (0)