Commit 140af99

Authored by Dando18 (Daniel Nichols) and a co-author
Updates and Optimizations to Generation Scripts (#9)
* updates and optimizations to generation scripts

Co-authored-by: Daniel Nichols <dnicho@login01.chn>
1 parent 4c9276c · commit 140af99

4 files changed: 334 additions & 54 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 # python
 *.pyc
 __pycache__
+.env
 
 # cpp
 *.out

generate.py

Lines changed: 0 additions & 54 deletions
This file was deleted.

generate/generate.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
# std imports
import argparse
import json
import time
from tqdm import tqdm
import sys

# tpl imports
import torch
from transformers import pipeline

# local imports
from utils import BalancedBracketsCriteria, PromptDataset, clean_output, get_inference_config


""" Parse command line arguments """
parser = argparse.ArgumentParser(description='Generate code')
parser.add_argument('--prompts', required=True, help='Path to the prompt JSON file')
parser.add_argument('--model', required=True, help='Path to the language model')
parser.add_argument('--output', required=True, help='Path to the output JSON file')
parser.add_argument('--max_new_tokens', type=int, default=1024, help='Maximum number of new tokens to generate (default: 1024)')
parser.add_argument('--num_samples_per_prompt', type=int, default=50, help='Number of code samples to generate (default: 50)')
parser.add_argument('--temperature', type=float, default=0.2, help='Temperature for controlling randomness (default: 0.2)')
parser.add_argument('--top_p', type=float, default=0.95, help='Top p value for nucleus sampling (default: 0.95)')
parser.add_argument('--do_sample', action='store_true', help='Enable sampling (default: False)')
parser.add_argument('--batch_size', type=int, default=16, help='Batch size for generation (default: 16)')
parser.add_argument('--prompted', action='store_true', help='Use prompted generation. See StarCoder paper (default: False)')
args = parser.parse_args()

""" Load prompts """
with open(args.prompts, 'r') as json_file:
    prompts = json.load(json_file)

""" Initialize inference config """
inference_config = get_inference_config(args.model, prompted=args.prompted)

# to use a torch.utils.data.Dataset with the HuggingFace pipeline, we need to flatten out the prompts
# and repeat them for however many samples we want to generate per prompt
prompts_repeated = [p for p in prompts for _ in range(args.num_samples_per_prompt)]

""" Initialize HuggingFace pipeline for generation """
generator = pipeline(model=args.model, torch_dtype=inference_config.get_dtype(), device_map="auto")
inference_config.init_padding(generator.tokenizer)

""" Create a prompt data set to pass to generate method """
prompt_dataset = PromptDataset([inference_config.format_prompt(p["prompt"]) for p in prompts_repeated])
generated_outputs = generator(
    prompt_dataset,
    max_new_tokens=args.max_new_tokens,
    do_sample=args.do_sample,
    temperature=args.temperature,
    top_p=args.top_p,
    pad_token_id=inference_config.get_pad_token_id(generator.tokenizer),
    eos_token_id=inference_config.get_eos_token_id(generator.tokenizer),
    batch_size=args.batch_size,
)

""" Iterate over prompts and generate code """
responses = []
cur_prompt = None
start_time = time.time()
total_tokens = 0
for idx, (prompt, output) in tqdm(enumerate(zip(prompts_repeated, generated_outputs)), total=len(prompts_repeated), desc="Generating code", file=sys.stdout):
    # start a fresh response object once per group of num_samples_per_prompt samples
    if idx % args.num_samples_per_prompt == 0:
        cur_prompt = prompt.copy()
        cur_prompt.update({"temperature": args.temperature, "top_p": args.top_p, "do_sample": args.do_sample, "max_new_tokens": args.max_new_tokens, "prompted": args.prompted})
        cur_prompt["outputs"] = []
    prompt_str = cur_prompt["prompt"]

    total_tokens += len(generator.tokenizer.encode(output[0]["generated_text"]))
    cleaned_output = clean_output(output[0]["generated_text"], prompt_str)
    cur_prompt["outputs"].append(cleaned_output)

    # the group for this prompt is complete; record it
    if idx % args.num_samples_per_prompt == args.num_samples_per_prompt - 1:
        responses.append(cur_prompt)

    if idx != 0 and idx % args.num_samples_per_prompt == 0:
        print(f"Tokens per second: {total_tokens / (time.time() - start_time):.2f}")

end_time = time.time()
tokens_per_second = total_tokens / (end_time - start_time)
print(f"Generated {len(responses)} code samples in {end_time - start_time:.2f} seconds ({tokens_per_second:.2f} tokens per second)")

""" Save responses to JSON file """
with open(args.output, 'w') as output_file:
    json.dump(responses, output_file, indent=4)
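
The script expects --prompts to point at a JSON list of objects, each carrying at least a "prompt" field; it writes the same objects back out with the sampling parameters recorded and an "outputs" list of cleaned completions attached. A minimal sketch of preparing an input file and running the script follows; the file names, the "name" field, and the flag values are illustrative assumptions, not part of this commit.

# make_prompts.py -- hypothetical driver, not in this commit
import json
import subprocess

# one entry per coding exercise; only the "prompt" key is required by generate.py
prompts = [
    {"name": "add", "prompt": "// return the sum of a and b\nint add(int a, int b) {"},
]
with open("prompts.json", "w") as f:
    json.dump(prompts, f, indent=4)

# run from inside the generate/ directory so that `from utils import ...` resolves
subprocess.run(
    ["python", "generate.py",
     "--prompts", "../prompts.json",
     "--model", "bigcode/starcoderbase",
     "--output", "../outputs.json",
     "--do_sample",
     "--num_samples_per_prompt", "4"],
    cwd="generate",
    check=True,
)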

generate/utils.py

Lines changed: 247 additions & 0 deletions
@@ -0,0 +1,247 @@
# std imports
from abc import ABC, abstractmethod

# tpl imports
import torch
from torch.utils.data import Dataset
from transformers import StoppingCriteria


class InferenceConfig(ABC):

    def __init__(self, prompted : bool = False):
        self.prompted = prompted

    @abstractmethod
    def get_dtype(self):
        pass

    @abstractmethod
    def init_padding(self, tokenizer):
        pass

    @abstractmethod
    def get_pad_token_id(self, tokenizer) -> int:
        pass

    @abstractmethod
    def get_eos_token_id(self, tokenizer) -> int:
        pass

    @abstractmethod
    def format_prompt(self, prompt : str) -> str:
        pass


class StarCoderConfig(InferenceConfig):

    def __init__(self, prompted : bool = False):
        super().__init__(prompted=prompted)

    def get_dtype(self):
        return torch.float16

    def init_padding(self, tokenizer):
        tokenizer.pad_token_id = tokenizer.eos_token_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        return None

    def format_prompt(self, prompt : str) -> str:
        if self.prompted:
            return f"<filename>solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
        return prompt.strip()


class CodeLlamaConfig(InferenceConfig):

    def __init__(self, prompted : bool = False):
        super().__init__(prompted=prompted)

    def get_dtype(self):
        return torch.float16

    def init_padding(self, tokenizer):
        tokenizer.pad_token_id = tokenizer.eos_token_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        return tokenizer.pad_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def format_prompt(self, prompt : str) -> str:
        if self.prompted:
            return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
        return prompt.strip()


class PolyCoderConfig(InferenceConfig):

    def __init__(self, prompted : bool = False):
        super().__init__(prompted=prompted)

    def get_dtype(self):
        return torch.float16

    def init_padding(self, tokenizer):
        tokenizer.pad_token_id = tokenizer.eos_token_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def format_prompt(self, prompt : str) -> str:
        if self.prompted:
            return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
        return prompt.strip()


class PhindConfig(InferenceConfig):

    def __init__(self, prompted : bool = False):
        super().__init__(prompted=prompted)

    def get_dtype(self):
        return torch.float16

    def init_padding(self, tokenizer):
        tokenizer.pad_token_id = tokenizer.eos_token_id  # for batching
        tokenizer.padding_side = "left"  # for decoder-only models

    def get_pad_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def get_eos_token_id(self, tokenizer) -> int:
        return tokenizer.eos_token_id

    def format_prompt(self, prompt : str) -> str:
        if self.prompted:
            return f"// filename: solutions/solution_1.cpp\n// here is the correct implementation of the coding exercise\n\n{prompt}"
        return prompt.strip()


def get_inference_config(model_name : str, **kwargs) -> InferenceConfig:
    if model_name == "bigcode/starcoderbase":
        return StarCoderConfig(**kwargs)
    elif model_name.startswith("codellama/CodeLlama-") and 'Instruct' not in model_name:
        return CodeLlamaConfig(**kwargs)
    elif model_name == "NinedayWang/PolyCoder-2.7B":
        return PolyCoderConfig(**kwargs)
    elif model_name == 'Phind/Phind-CodeLlama-34B-v2':
        return PhindConfig(**kwargs)
    else:
        raise ValueError(f"Unknown model name: {model_name}")


def clean_output(output : str, prompt : str) -> str:
    """ Remove `prompt` from the beginning of `output`.
        Also truncate at the end of the function definition (i.e. the matching closing brace).
    """
    # remove everything up to the end of the first instance of prompt
    prompt_loc = output.find(prompt)
    if prompt_loc == -1:
        raise ValueError(f"Prompt not found in output: {prompt}")
    output = output[prompt_loc + len(prompt):].strip()

    # temporarily add an opening brace to the beginning
    output = '{' + output

    # find the brace matching output[0]
    stack = []
    index = 0
    while index < len(output):
        token = output[index]
        if token == '{':
            stack.append(token)
        elif token == '}':
            stack.pop()
            if len(stack) == 0:
                break
        index += 1

    # truncate at the matching brace
    output = output[1:index+1]
    return output


class PromptDataset(Dataset):
    ''' PyTorch dataset that simply wraps a list of strings. They do not have to have the same length.
    '''

    def __init__(self, prompts):
        super().__init__()
        self.prompts_ = prompts

    def __len__(self):
        return len(self.prompts_)

    def __getitem__(self, idx):
        return self.prompts_[idx]


def has_balanced_brackets(text : str, left_bracket : str = '{', right_bracket : str = '}') -> bool:
    ''' Check if a string has balanced brackets.
        modified from: https://stackoverflow.com/a/38834249/3769237

        Arguments:
            text: string to check for balanced brackets in.
            left_bracket: left bracket to balance
            right_bracket: right bracket to balance

        Returns:
            true if left_bracket and right_bracket are balanced
    '''
    stack = []
    balanced = True
    index = 0
    while index < len(text) and balanced:
        token = text[index]
        if token == left_bracket:
            stack.append(token)
        elif token == right_bracket:
            if len(stack) == 0:
                balanced = False
            else:
                stack.pop()
        index += 1

    return balanced and len(stack) == 0


class BalancedBracketsCriteria(StoppingCriteria):
    ''' Extension of transformers' text-generation stopping criteria.
        Stops either when the function is complete (i.e. { and } are balanced) or when max_length is
        surpassed, whichever happens first.

        _Note:_ This is a slow stopping criterion, but it's much faster than continually running
        model inference when it does not need to be run anymore.
    '''

    def __init__(self, max_length : int, tokenizer, left_bracket : str = '{', right_bracket : str = '}'):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.left_bracket = left_bracket
        self.right_bracket = right_bracket

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        if input_ids.shape[-1] > self.max_length:
            # already too long, early stop
            return True

        # return true if {} are balanced i.e. the function is complete
        return all(
            has_balanced_brackets(
                self.tokenizer.decode(t),
                left_bracket=self.left_bracket,
                right_bracket=self.right_bracket
            ) for t in input_ids)
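
A quick sanity check of the two brace helpers (a sketch, not part of this commit; it assumes it is run from the generate/ directory so `utils` imports resolve, and the example strings are invented): clean_output drops the prompt and truncates the completion at the function's matching closing brace, and has_balanced_brackets confirms the result closes the brace opened in the prompt.

# demo of clean_output and has_balanced_brackets -- illustrative only
from utils import clean_output, has_balanced_brackets

prompt = "int add(int a, int b) {"
generated = prompt + "\n    return a + b;\n}\n\nint main() { return 0; }"

body = clean_output(generated, prompt)
print(repr(body))                            # 'return a + b;\n}' -- the trailing main() is cut off
print(has_balanced_brackets(prompt + body))  # True: the '{' opened in the prompt is closed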

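Note that BalancedBracketsCriteria is imported by generate/generate.py but not passed to the pipeline call in this commit. One plausible way to wire it in, sketched under the assumption that transformers' StoppingCriteriaList and the generate-time stopping_criteria argument are used (the model name and max length are placeholders):

# hypothetical wiring of BalancedBracketsCriteria -- not part of this commit
from transformers import AutoTokenizer, StoppingCriteriaList
from utils import BalancedBracketsCriteria

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase")
stopping_criteria = StoppingCriteriaList([
    BalancedBracketsCriteria(max_length=1024, tokenizer=tokenizer)
])

# generation would then stop as soon as every sequence in the batch has balanced braces,
# e.g. generator(prompt_dataset, ..., stopping_criteria=stopping_criteria)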