chat_verifier_way2.py
110 lines (87 loc) · 3.81 KB
import os, json, itertools, bisect, gc
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import transformers
import torch
from accelerate import Accelerator
import accelerate
import time
import re

model = None
tokenizer = None
generator = None

# Must be set before any CUDA context is created, otherwise it has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

flag = "ExplanationVerifier"  # one of: "Qiming-Alpaca", "ExplanationGenerator", "ExplanationVerifier"
if flag == "Qiming-Alpaca":
    load_model_name = "./qiming_alpaca/"
    First_chat = "Qiming-Alpaca: I am Qiming-Alpaca, what questions do you have?"
    invitation = "Qiming-Alpaca: "
    human_invitation = "User: "
elif flag == "ExplanationVerifier":
    # load_model_name = "./qiming_llama_7B_Cardiff_Sydney_merged_verifier_way_2/"
    load_model_name = "./qiming_alpaca_7B_Cardiff_Sydney_merged_verifier_way_2/"
    First_chat = "Explanation Verifier: I am an explanation verifier expert, what questions can I help with?"
    invitation = " Output: "
    human_invitation = "User: "
def load_model(model_name, eight_bit=0, device_map="auto"):
    """Load the LLaMA checkpoint and store the tokenizer, model, and generate fn in module globals."""
    global model, tokenizer, generator
    print("Loading " + model_name + "...")
    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print('gpu_count', gpu_count)

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        # device_map=device_map,  # single-GPU setup here; device_map is currently not passed through
        torch_dtype=torch.float16,
        # max_memory={0: "14GB", 1: "14GB", 2: "14GB", 3: "14GB", 4: "14GB", 5: "14GB", 6: "14GB", 7: "14GB"},
        low_cpu_mem_usage=True,
        load_in_8bit=False,  # the eight_bit argument is currently not wired in
        cache_dir="cache",
    ).cuda()
    generator = model.generate


load_model(load_model_name)
def explanationVerifier(msg, generator_response):
    """Score a (question, explanation) pair with the verifier model and return the rating as a float."""
    # Build the verifier prompt: question + generated explanation, followed by the " Output: " cue.
    merged_response = [msg + " Explanation: " + generator_response + " </s> "]
    fulltext = "Instruction: As a question rating verifier expert, can you generate the question rating score for the given input? \n\n" + \
        "\n\n".join(merged_response) + "\n\n" + invitation

    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()

    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=1024,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means 'off'
            temperature=0.5,         # default: 1.0
            top_k=50,                # default: 50
            top_p=1.0,               # default: 1.0
            early_stopping=True,
        )
    # batch_decode returns one string per sequence; only one sequence is generated here.
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Extract the numeric rating that follows "Output: " in the decoded text.
    pattern = r"Output: (\d+\.\d+)"
    match = re.search(pattern, generated_text)
    if match is None:
        raise ValueError("No rating found in generated text: " + generated_text)
    return float(match.group(1).strip())
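

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original file): the question and explanation
    # strings below are hypothetical placeholders for illustration only, and this assumes
    # the verifier checkpoint referenced by load_model_name is available locally.
    example_question = "What are the common symptoms of seasonal influenza?"
    example_explanation = "Typical symptoms include fever, cough, sore throat, and muscle aches."
    score = explanationVerifier(example_question, example_explanation)
    print("Verifier rating:", score)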