huggingface_llm_model.py
import os
import logging
import asyncio
import torch
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Tuple, Any, AsyncIterable, TextIO, Callable, Union
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)
from app import __version__ as app_version
from app.exception import ConfigurationException
from app.model_services.base import AbstractModelService
from app.trainers.huggingface_llm_trainer import HuggingFaceLlmSupervisedTrainer, HuggingFaceLlmUnsupervisedTrainer
from app.domain import ModelCard, ModelType, Annotation, Device
from app.config import Settings
from app.utils import (
    get_settings,
    non_default_device_is_available,
    unpack_model_data_package,
    ensure_tensor_contiguity,
    get_model_data_package_base_name,
)

logger = logging.getLogger("cms")


class HuggingFaceLlmModel(AbstractModelService):
    """A model service for Hugging Face generative LLMs."""

    def __init__(
        self,
        config: Settings,
        model_parent_dir: Optional[str] = None,
        enable_trainer: Optional[bool] = None,
        model_name: Optional[str] = None,
        base_model_file: Optional[str] = None,
    ) -> None:
        """
        Initialises the HuggingFace LLM model service with the specified configuration.

        Args:
            config (Settings): The configuration for the model service.
            model_parent_dir (Optional[str]): The directory where the model package is stored. Defaults to None.
            enable_trainer (Optional[bool]): The flag to enable or disable trainers. Defaults to None.
            model_name (Optional[str]): The name of the model. Defaults to None.
            base_model_file (Optional[str]): The model package file name. Defaults to None.
        """
        super().__init__(config)
        self._config = config
        self._model_parent_dir = model_parent_dir or os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "model"))
        self._model_pack_path = os.path.join(self._model_parent_dir, base_model_file or config.BASE_MODEL_FILE)
        self._enable_trainer = enable_trainer if enable_trainer is not None else config.ENABLE_TRAINING_APIS == "true"
        self._model: Optional[PreTrainedModel] = None
        self._tokenizer: Optional[PreTrainedTokenizerBase] = None
        self._whitelisted_tuis = {tui.strip() for tui in config.TYPE_UNIQUE_ID_WHITELIST.split(",")}
        self._multi_label_threshold = 0.5
        self._text_generator = ThreadPoolExecutor(max_workers=50)
        self.model_name = model_name or "HuggingFace LLM model"
        self.is_4bit_quantised = False

    @property
    def model(self) -> PreTrainedModel:
        """Getter for the HuggingFace pre-trained model."""
        return self._model

    @model.setter
    def model(self, model: PreTrainedModel) -> None:
        """Setter for the HuggingFace pre-trained model."""
        self._model = model

    @model.deleter
    def model(self) -> None:
        """Deleter for the HuggingFace pre-trained model."""
        del self._model

    @property
    def tokenizer(self) -> PreTrainedTokenizerBase:
        """Getter for the HuggingFace tokenizer."""
        return self._tokenizer

    @tokenizer.setter
    def tokenizer(self, tokenizer: PreTrainedTokenizerBase) -> None:
        """Setter for the HuggingFace tokenizer."""
        self._tokenizer = tokenizer

    @tokenizer.deleter
    def tokenizer(self) -> None:
        """Deleter for the HuggingFace tokenizer."""
        del self._tokenizer

    @property
    def api_version(self) -> str:
        """Getter for the API version of the model service."""
        # APP version is used although each model service could have its own API versioning
        return app_version

    @classmethod
    def from_model(cls, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase) -> "HuggingFaceLlmModel":
        """
        Creates a model service from a provided HuggingFace pre-trained model and its tokenizer.

        Args:
            model (PreTrainedModel): The HuggingFace pre-trained model.
            tokenizer (PreTrainedTokenizerBase): The tokenizer for the HuggingFace pre-trained model.

        Returns:
            HuggingFaceLlmModel: A HuggingFace generative model service.
        """
        model_service = cls(get_settings(), enable_trainer=False)
        model_service.model = model
        model_service.tokenizer = tokenizer
        return model_service
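
    # A minimal usage sketch for from_model (hypothetical, not part of the original
    # file): it assumes a causal LM and its tokenizer have already been loaded,
    # e.g. via transformers' from_pretrained, and wraps them in a service instance.
    #
    #     model = AutoModelForCausalLM.from_pretrained("path/to/model")
    #     tokenizer = AutoTokenizer.from_pretrained("path/to/model")
    #     service = HuggingFaceLlmModel.from_model(model, tokenizer)
    #     print(service.info())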

    @staticmethod
    def load_model(
        model_file_path: str,
        *args: Tuple,
        load_in_4bit: bool = False,
        **kwargs: Dict[str, Any],
    ) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
        """
        Loads a pre-trained model and its tokenizer from a model package file.

        Args:
            model_file_path (str): The path to the model package file.
            *args (Tuple): Additional positional arguments.
            load_in_4bit (bool): Whether to load the model in 4-bit precision. Defaults to False.
            **kwargs (Dict[str, Any]): Additional keyword arguments.

        Returns:
            Tuple[PreTrainedModel, PreTrainedTokenizerBase]: A tuple containing the HuggingFace pre-trained model and its tokenizer.

        Raises:
            ConfigurationException: If the model package is not valid or not supported.
        """
        model_path = os.path.join(os.path.dirname(model_file_path), get_model_data_package_base_name(model_file_path))
        if unpack_model_data_package(model_file_path, model_path):
            try:
                if load_in_4bit:
                    bnb_config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_quant_type="nf4",
                        bnb_4bit_compute_dtype=torch.bfloat16,
                        bnb_4bit_use_double_quant=True,
                    )
                    if get_settings().DEVICE == Device.DEFAULT.value:
                        model = AutoModelForCausalLM.from_pretrained(
                            model_path,
                            quantization_config=bnb_config,
                            device_map="auto",
                        )
                    else:
                        model = AutoModelForCausalLM.from_pretrained(model_path, quantization_config=bnb_config)
                else:
                    if get_settings().DEVICE == Device.DEFAULT.value:
                        model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
                    else:
                        model = AutoModelForCausalLM.from_pretrained(model_path)
                ensure_tensor_contiguity(model)
                tokenizer = AutoTokenizer.from_pretrained(
                    model_path,
                    model_max_length=model.config.max_position_embeddings,
                    do_lower_case=False,
                )
                logger.info("Model package loaded from %s", os.path.normpath(model_file_path))
                return model, tokenizer
            except ValueError as e:
                logger.error(e)
                raise ConfigurationException(f"Model package is not valid or not supported: {model_file_path}")
        else:
            raise ConfigurationException(f"Model package archive format is not supported: {model_file_path}")
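
    # A direct-loading sketch (hypothetical, not part of the original file). The
    # archive name "model.zip" and its location are assumptions; load_model expects
    # a packaged archive that unpack_model_data_package can handle:
    #
    #     model, tokenizer = HuggingFaceLlmModel.load_model(
    #         "model/model.zip", load_in_4bit=True
    #     )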

    def init_model(self, load_in_4bit: bool = False, *args: Any, **kwargs: Any) -> None:
        """Initialises the HuggingFace model and its tokenizer based on the configuration.

        Args:
            load_in_4bit (bool): Whether to load the model in 4-bit precision. Defaults to False.
            *args (Any): Additional positional arguments to be passed to this method.
            **kwargs (Any): Additional keyword arguments to be passed to this method.
        """
        if all([
            hasattr(self, "_model"),
            hasattr(self, "_tokenizer"),
            isinstance(self._model, PreTrainedModel),
            isinstance(self._tokenizer, PreTrainedTokenizerBase),
        ]):
            logger.warning("Model service is already initialised and can be initialised only once")
        else:
            self._model, self._tokenizer = self.load_model(self._model_pack_path, load_in_4bit=load_in_4bit)
            if non_default_device_is_available(get_settings().DEVICE):
                self._model.to(get_settings().DEVICE)
            if self._enable_trainer:
                self._supervised_trainer = HuggingFaceLlmSupervisedTrainer(self)
                self._unsupervised_trainer = HuggingFaceLlmUnsupervisedTrainer(self)
            self.is_4bit_quantised = load_in_4bit

    def info(self) -> ModelCard:
        """
        Retrieves a ModelCard containing information about the model.

        Returns:
            ModelCard: Information about the model.
        """
        return ModelCard(
            model_description=self.model_name,
            model_type=ModelType.HUGGINGFACE_LLM,
            api_version=self.api_version,
            model_card=self._model.config.to_dict(),
        )

    def annotate(self, text: str) -> List[Annotation]:
        raise NotImplementedError("Annotation is not yet implemented for HuggingFace Generative models")

    def batch_annotate(self, texts: List[str]) -> List[List[Annotation]]:
        raise NotImplementedError("Batch annotation is not yet implemented for HuggingFace Generative models")

    def generate(
        self,
        prompt: str,
        min_tokens: int = 100,
        max_tokens: int = 512,
        num_beams: int = 5,
        temperature: float = 0.7,
        top_p: float = 0.9,
        stop_sequences: Optional[List[str]] = None,
        report_tokens: Optional[Callable[..., None]] = None,
        **kwargs: Any,
    ) -> str:
        """
        Generates text based on the prompt.

        Args:
            prompt (str): The prompt for the text generation.
            min_tokens (int): The minimum number of tokens to generate. Defaults to 100.
            max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
            num_beams (int): The number of beams for beam search. Defaults to 5.
            temperature (float): The temperature for the text generation. Defaults to 0.7.
            top_p (float): The top-p value for nucleus sampling. Defaults to 0.9.
            stop_sequences (Optional[List[str]]): A list of strings that will stop the generation when encountered. Defaults to None.
            report_tokens (Optional[Callable[..., None]]): The callback function used to send token count metrics. Defaults to None.
            **kwargs (Any): Additional keyword arguments to be passed to this method.

        Returns:
            str: The string containing the generated text.
        """
        self.model.eval()
        inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
        inputs.to(self.model.device)
        generation_kwargs = dict(
            inputs=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            min_new_tokens=min_tokens,
            max_new_tokens=max_tokens,
            num_beams=num_beams,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )
        outputs = self.model.generate(**generation_kwargs)
        # tokenizer.decode() has no "skip_prompt" option (that belongs to
        # TextIteratorStreamer), so the prompt tokens are sliced off before decoding
        completion_ids = outputs[0][inputs.input_ids.shape[-1]:]
        generated_text = self.tokenizer.decode(completion_ids, skip_special_tokens=True)
        if stop_sequences:
            for stop_seq in stop_sequences:
                if stop_seq in generated_text:
                    generated_text = generated_text.split(stop_seq)[0]
                    break
        logger.debug("Response generation completed")
        if report_tokens:
            report_tokens(
                prompt_token_num=inputs.input_ids.shape[-1],
                completion_token_num=completion_ids.shape[-1],
            )
        return generated_text
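
    # A usage sketch for generate (hypothetical, not part of the original file),
    # assuming the service has been created via from_model or init_model:
    #
    #     text = service.generate(
    #         "Summarise the patient history:",
    #         max_tokens=256,
    #         stop_sequences=["\n\n"],
    #     )
    #     print(text)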

    async def generate_async(
        self,
        prompt: str,
        max_tokens: int = 512,
        temperature: float = 0.7,
        top_p: float = 0.9,
        stop_sequences: Optional[List[str]] = None,
        report_tokens: Optional[Callable[..., None]] = None,
        **kwargs: Any,
    ) -> AsyncIterable:
        """
        Asynchronously generates a text stream based on the prompt.

        Args:
            prompt (str): The prompt for the text generation.
            max_tokens (int): The maximum number of tokens to generate. Defaults to 512.
            temperature (float): The temperature for the text generation. Defaults to 0.7.
            top_p (float): The top-p value for nucleus sampling. Defaults to 0.9.
            stop_sequences (Optional[List[str]]): A list of strings that will stop the generation when encountered. Defaults to None.
            report_tokens (Optional[Callable[..., None]]): The callback function used to send token count metrics. Defaults to None.
            **kwargs (Any): Additional keyword arguments to be passed to the model loader.

        Returns:
            AsyncIterable: The stream containing the generated text.
        """
        self.model.eval()
        inputs = self.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
        inputs.to(self.model.device)
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )
        generation_kwargs = dict(
            inputs=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            streamer=streamer,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )
        try:
            # Run generation in a worker thread so the streamer can be drained here
            _ = self._text_generator.submit(self.model.generate, **generation_kwargs)
            output = ""
            for content in streamer:
                prev_output = output
                output += content
                if stop_sequences:
                    for stop_seq in stop_sequences:
                        if stop_seq in output:
                            remaining = output[len(prev_output):output.find(stop_seq)]
                            if remaining:
                                yield remaining
                            return
                yield content
                await asyncio.sleep(0.01)
            if report_tokens:
                report_tokens(
                    prompt_token_num=inputs.input_ids.shape[-1],
                    completion_token_num=self.tokenizer(
                        output,
                        add_special_tokens=False,
                        return_tensors="pt",
                    ).input_ids.shape[-1],
                )
        except Exception as e:
            logger.error("An error occurred while generating the response")
            logger.exception(e)
            return
        finally:
            logger.debug("Chat response generation completed")

    def create_embeddings(
        self,
        text: Union[str, List[str]],
        *args: Any,
        **kwargs: Any,
    ) -> Union[List[float], List[List[float]]]:
        """
        Creates embeddings for a given text or list of texts using the model's hidden states.

        Args:
            text (Union[str, List[str]]): The text(s) to be embedded.
            *args (Any): Additional positional arguments to be passed to this method.
            **kwargs (Any): Additional keyword arguments to be passed to this method.

        Returns:
            Union[List[float], List[List[float]]]: The embedding vector(s) for the text(s).

        Raises:
            NotImplementedError: If the model doesn't support embeddings.
        """
        self.model.eval()
        texts = [text] if isinstance(text, str) else text
        all_embeddings = []
        for txt in texts:
            inputs = self.tokenizer(txt, add_special_tokens=False, truncation=False, padding=False)
            input_ids = inputs["input_ids"]
            attention_mask = inputs["attention_mask"]
            # Slide a non-overlapping window over the token sequence so texts longer
            # than the model's context length can still be embedded
            window_size = max(self.model.config.max_position_embeddings - 2, 1)
            stride = window_size
            chunk_embeddings = []
            for start in range(0, len(input_ids), stride):
                end = min(start + window_size, len(input_ids))
                chunk_inputs = {
                    "input_ids": torch.tensor(
                        [input_ids[start:end]], dtype=torch.long
                    ).to(self.model.device),
                    "attention_mask": torch.tensor(
                        [attention_mask[start:end]], dtype=torch.long
                    ).to(self.model.device),
                }
                with torch.no_grad():
                    outputs = self.model(**chunk_inputs, output_hidden_states=True)
                # Mean-pool the last hidden state over the attended tokens
                last_hidden_state = outputs.hidden_states[-1]
                chunk_attention_mask = chunk_inputs["attention_mask"]
                masked_hidden_states = last_hidden_state * chunk_attention_mask.unsqueeze(-1)
                sum_hidden_states = masked_hidden_states.sum(dim=1)
                num_tokens = chunk_attention_mask.sum(dim=1, keepdim=True)
                chunk_embedding = sum_hidden_states / num_tokens
                chunk_embeddings.append(chunk_embedding)
                if end >= len(input_ids):
                    break
            # Average the chunk embeddings and L2-normalise the result
            final_embedding = torch.mean(torch.cat(chunk_embeddings, dim=0), dim=0, keepdim=True)
            l2_normalised = torch.nn.functional.normalize(final_embedding, p=2, dim=1)
            all_embeddings.append(l2_normalised.cpu().numpy().tolist()[0])
        return all_embeddings[0] if isinstance(text, str) else all_embeddings
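
    # A usage sketch for create_embeddings (hypothetical, not part of the original
    # file); the returned vectors are L2-normalised, so a dot product gives the
    # cosine similarity directly:
    #
    #     vec_a = service.create_embeddings("chest pain")
    #     vec_b = service.create_embeddings("angina")
    #     similarity = sum(a * b for a, b in zip(vec_a, vec_b))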

    def train_supervised(
        self,
        data_file: TextIO,
        epochs: int,
        log_frequency: int,
        training_id: str,
        input_file_name: str,
        raw_data_files: Optional[List[TextIO]] = None,
        description: Optional[str] = None,
        synchronised: bool = False,
        **hyperparams: Dict[str, Any],
    ) -> Tuple[bool, str, str]:
        """
        Initiates supervised training on the model.

        Args:
            data_file (TextIO): The file containing the trainer export data.
            epochs (int): The number of training epochs.
            log_frequency (int): The number of epochs after which training metrics will be logged.
            training_id (str): A unique identifier for the training process.
            input_file_name (str): The name of the input file to be logged.
            raw_data_files (Optional[List[TextIO]]): Additional raw data files to be logged. Defaults to None.
            description (Optional[str]): The description of the training or change logs. Defaults to None.
            synchronised (bool): Whether to wait for the training to complete. Defaults to False.
            **hyperparams (Dict[str, Any]): Additional hyperparameters for training.

        Returns:
            Tuple[bool, str, str]: A tuple with the first element indicating success or failure.

        Raises:
            ConfigurationException: If the supervised trainer is not enabled.
        """
        if self._supervised_trainer is None:
            raise ConfigurationException("The supervised trainer is not enabled")
        return self._supervised_trainer.train(
            data_file,
            epochs,
            log_frequency,
            training_id,
            input_file_name,
            raw_data_files,
            description,
            synchronised,
            **hyperparams,
        )

    def train_unsupervised(
        self,
        data_file: TextIO,
        epochs: int,
        log_frequency: int,
        training_id: str,
        input_file_name: str,
        raw_data_files: Optional[List[TextIO]] = None,
        description: Optional[str] = None,
        synchronised: bool = False,
        **hyperparams: Dict[str, Any],
    ) -> Tuple[bool, str, str]:
        """
        Initiates unsupervised training on the model.

        Args:
            data_file (TextIO): The file containing a JSON list of texts.
            epochs (int): The number of training epochs.
            log_frequency (int): The number of epochs after which training metrics will be logged.
            training_id (str): A unique identifier for the training process.
            input_file_name (str): The name of the input file to be logged.
            raw_data_files (Optional[List[TextIO]]): Additional raw data files to be logged. Defaults to None.
            description (Optional[str]): The description of the training or change logs. Defaults to None.
            synchronised (bool): Whether to wait for the training to complete. Defaults to False.
            **hyperparams (Dict[str, Any]): Additional hyperparameters for training.

        Returns:
            Tuple[bool, str, str]: A tuple with the first element indicating success or failure.

        Raises:
            ConfigurationException: If the unsupervised trainer is not enabled.
        """
        if self._unsupervised_trainer is None:
            raise ConfigurationException("The unsupervised trainer is not enabled")
        return self._unsupervised_trainer.train(
            data_file,
            epochs,
            log_frequency,
            training_id,
            input_file_name,
            raw_data_files,
            description,
            synchronised,
            **hyperparams,
        )
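

# A minimal end-to-end sketch (hypothetical, not part of the original file): it
# assumes the default settings point at a valid model package (BASE_MODEL_FILE
# under the service's model directory) and that the app package is importable.
# It is illustrative only, not a definitive entry point.
if __name__ == "__main__":
    service = HuggingFaceLlmModel(get_settings())
    # Unpack and load the packaged model; pass load_in_4bit=True to quantise
    service.init_model(load_in_4bit=False)
    print(service.info())
    print(service.generate("Hello, how can I help you today?", max_tokens=64))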