| from __future__ import annotations |
|
|
| import abc |
| from typing import ( |
| List, |
| Optional, |
| Any, |
| ) |
|
|
| import llama_cpp |
| from llama_cpp.llama_types import List |
|
|
|
|
class BaseLlamaTokenizer(abc.ABC):
    """Abstract interface for tokenizers that convert between bytes and token ids."""

    @abc.abstractmethod
    def tokenize(self, text: bytes, add_bos: bool = True, special: bool = True) -> List[int]:
        """Convert utf-8 encoded ``text`` into a list of token ids.

        Args:
            text: The utf-8 encoded string to tokenize.
            add_bos: Whether to add a beginning of sequence token.
            special: Whether to tokenize special tokens.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Convert a list of token ids back into utf-8 encoded bytes.

        Args:
            tokens: The list of tokens to detokenize.
            prev_tokens: The list of previous tokens. Offset mapping will be performed if provided.
            special: Whether to detokenize special tokens.
        """
        raise NotImplementedError
|
|
|
|
class LlamaTokenizer(BaseLlamaTokenizer):
    """Tokenizer backed by a llama.cpp model's built-in vocabulary."""

    def __init__(self, llama: llama_cpp.Llama):
        # Keep a reference to the underlying model, which exposes the
        # C-level tokenize/detokenize operations.
        self._model = llama._model

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize utf-8 encoded ``text`` into model token ids."""
        return self._model.tokenize(text, add_bos=add_bos, special=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Detokenize ``tokens`` into utf-8 encoded bytes.

        When ``prev_tokens`` is given, the combined sequence is decoded and
        the prefix corresponding to ``prev_tokens`` is sliced off, so that
        incremental decoding yields byte-consistent continuations.  This
        matches the contract documented on the base class and the behavior
        of ``LlamaHFTokenizer`` (previously ``prev_tokens`` was ignored).
        """
        if prev_tokens is not None:
            text = self._model.detokenize(prev_tokens + tokens, special=special)
            prev_text = self._model.detokenize(prev_tokens, special=special)
            return text[len(prev_text) :]
        return self._model.detokenize(tokens, special=special)

    def encode(
        self, text: str, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Convenience wrapper: tokenize a ``str`` (encoded as utf-8)."""
        return self.tokenize(
            text.encode("utf-8", errors="ignore"), add_bos=add_bos, special=special
        )

    def decode(self, tokens: List[int]) -> str:
        """Convenience wrapper: detokenize to a ``str`` (decoded as utf-8)."""
        return self.detokenize(tokens).decode("utf-8", errors="ignore")

    @classmethod
    def from_ggml_file(cls, path: str) -> "LlamaTokenizer":
        """Build a tokenizer from a model file, loading only the vocabulary."""
        return cls(llama_cpp.Llama(model_path=path, vocab_only=True))
|
|
|
|
class LlamaHFTokenizer(BaseLlamaTokenizer):
    """Adapter exposing a Hugging Face ``transformers`` tokenizer through the
    ``BaseLlamaTokenizer`` byte-oriented interface."""

    def __init__(self, hf_tokenizer: Any):
        self.hf_tokenizer = hf_tokenizer

    def tokenize(
        self, text: bytes, add_bos: bool = True, special: bool = True
    ) -> List[int]:
        """Tokenize utf-8 encoded ``text``; ``special`` maps to
        ``add_special_tokens`` (``add_bos`` is handled by the HF tokenizer)."""
        decoded = text.decode("utf-8", errors="ignore")
        return self.hf_tokenizer.encode(decoded, add_special_tokens=special)

    def detokenize(
        self,
        tokens: List[int],
        prev_tokens: Optional[List[int]] = None,
        special: bool = False,
    ) -> bytes:
        """Detokenize to utf-8 bytes.

        With ``prev_tokens``, decode the combined sequence and strip the
        byte prefix produced by ``prev_tokens`` alone, so incremental
        decoding stays byte-consistent across calls.
        """
        skip = not special
        if prev_tokens is None:
            return self.hf_tokenizer.decode(
                tokens, skip_special_tokens=skip
            ).encode("utf-8", errors="ignore")
        full = self.hf_tokenizer.decode(
            prev_tokens + tokens, skip_special_tokens=skip
        ).encode("utf-8", errors="ignore")
        prefix = self.hf_tokenizer.decode(
            prev_tokens, skip_special_tokens=skip
        ).encode("utf-8", errors="ignore")
        return full[len(prefix) :]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer":
        """Load a Hugging Face tokenizer by name/path; ``transformers`` is a
        soft dependency imported lazily here."""
        try:
            from transformers import AutoTokenizer
        except ImportError:
            raise ImportError(
                "The `transformers` library is required to use the `HFTokenizer`."
                "You can install it with `pip install transformers`."
            )
        hf_tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path
        )
        return cls(hf_tokenizer)
|
|