66 from transformers import AutoModelForCausalLM , AutoTokenizer
77
# Model parameters
# NOTE(review): the diff rendering inserted a stray space ("3.1 -8B"); the
# real Hugging Face repo id has no space — with the space, from_pretrained
# would fail with a repo-not-found error.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
MAX_LENGTH = 512            # default cap on newly generated tokens per request
TEMPERATURE = 0.7           # < 1.0 sharpens the sampling distribution
TOP_P = 0.9                 # nucleus sampling: keep smallest set with 90% mass
TOP_K = 50                  # sample only from the 50 most likely tokens
REPETITION_PENALTY = 1.05   # mild penalty against repeating earlier tokens
NO_REPEAT_NGRAM_SIZE = 2    # forbid any 2-gram from appearing twice
DO_SAMPLE = True            # sampling rather than greedy decoding
NUM_BEAMS = 1               # no beam search
# NOTE(review): early_stopping only affects beam search; with NUM_BEAMS = 1
# transformers ignores it (and may emit a warning). Kept for API compatibility.
EARLY_STOPPING = True

# Mount point of the Beam volume where model weights are cached, so container
# restarts do not re-download ~16 GB of weights.
BEAM_VOLUME_PATH = "./cached_models"
# This runs once when the container first starts
def load_models():
    """Load the tokenizer and model once at container start-up.

    Returns:
        tuple: ``(model, tokenizer)`` ready for inference. Weights are
        cached under ``BEAM_VOLUME_PATH`` so restarts don't re-download them.
    """
    # Left padding keeps the prompt flush against the generated continuation,
    # which is required for correct batched generation with decoder-only models.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_NAME,
        cache_dir=BEAM_VOLUME_PATH,
        padding_side="left",
    )
    # Llama tokenizers ship without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",          # place layers on the available GPU(s)
        torch_dtype=torch.float16,  # half precision so 8B fits on an A10G
        cache_dir=BEAM_VOLUME_PATH,
        use_cache=True,             # enable the KV cache for generation
        low_cpu_mem_usage=True,     # stream weights instead of a full CPU copy
    )
    model.eval()  # inference mode: disables dropout etc.
    return model, tokenizer
2941
3042
@@ -38,22 +50,25 @@ def load_models():
3850 "huggingface_hub[hf-transfer]" ,
3951 ]
4052 )
41- .with_envs ("HF_HUB_ENABLE_HF_TRANSFER=1" )
53+ .with_envs ({
54+ "HF_HUB_ENABLE_HF_TRANSFER" : "1" ,
55+ "TOKENIZERS_PARALLELISM" : "false" ,
56+ "CUDA_VISIBLE_DEVICES" : "0" ,
57+ })
4258)
4359
4460
4561@endpoint (
4662 secrets = ["HF_TOKEN" ],
4763 on_start = load_models ,
48- name = "meta-llama-3-8b-instruct" ,
64+ name = "meta-llama-3.1-8b-instruct" ,
4965 cpu = 2 ,
50- memory = "32Gi" ,
51- gpu_count = 2 ,
66+ memory = "16Gi" ,
5267 gpu = "A10G" ,
5368 volumes = [
5469 Volume (
5570 name = "cached_models" ,
56- mount_path = CACHE_PATH ,
71+ mount_path = BEAM_VOLUME_PATH ,
5772 )
5873 ],
5974 image = image ,
@@ -68,30 +83,43 @@ def generate_text(context, **inputs):
6883 return {"error" : "Please provide messages for text generation." }
6984
7085 generate_args = {
71- "max_length " : inputs .get ("max_tokens" , MAX_LENGTH ),
86+ "max_new_tokens" : inputs .get ("max_tokens" , MAX_LENGTH ),
7287 "temperature" : inputs .get ("temperature" , TEMPERATURE ),
7388 "top_p" : inputs .get ("top_p" , TOP_P ),
7489 "top_k" : inputs .get ("top_k" , TOP_K ),
7590 "repetition_penalty" : inputs .get ("repetition_penalty" , REPETITION_PENALTY ),
7691 "no_repeat_ngram_size" : inputs .get (
7792 "no_repeat_ngram_size" , NO_REPEAT_NGRAM_SIZE
7893 ),
94+ "num_beams" : inputs .get ("num_beams" , NUM_BEAMS ),
95+ "early_stopping" : inputs .get ("early_stopping" , EARLY_STOPPING ),
7996 "do_sample" : inputs .get ("do_sample" , DO_SAMPLE ),
8097 "use_cache" : True ,
8198 "eos_token_id" : tokenizer .eos_token_id ,
8299 "pad_token_id" : tokenizer .pad_token_id ,
83100 }
84101
85- model_inputs = tokenizer .apply_chat_template (
102+ model_inputs_str = tokenizer .apply_chat_template (
86103 messages , tokenize = False , add_generation_prompt = True
87104 )
88- inputs = tokenizer (model_inputs , return_tensors = "pt" , padding = True )
89- input_ids = inputs ["input_ids" ].to ("cuda" )
90- attention_mask = inputs ["attention_mask" ].to ("cuda" )
105+
106+ tokenized_inputs = tokenizer (
107+ model_inputs_str ,
108+ return_tensors = "pt" ,
109+ padding = True ,
110+ truncation = True ,
111+ max_length = 2048
112+ )
113+ input_ids = tokenized_inputs ["input_ids" ].to ("cuda" )
114+ attention_mask = tokenized_inputs ["attention_mask" ].to ("cuda" )
115+ input_ids_length = input_ids .shape [- 1 ]
91116
92117 with torch .no_grad ():
93118 outputs = model .generate (
94- input_ids = input_ids , attention_mask = attention_mask , ** generate_args
119+ input_ids = input_ids ,
120+ attention_mask = attention_mask ,
121+ ** generate_args
95122 )
96- output_text = tokenizer .decode (outputs [0 ], skip_special_tokens = True )
123+ new_tokens = outputs [0 ][input_ids_length :]
124+ output_text = tokenizer .decode (new_tokens , skip_special_tokens = True )
97125 return {"output" : output_text }
0 commit comments