diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 46469c86200..ca06aadfddd 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1490,6 +1490,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "e4d54df1ebc1f2b91acd986c5b51aa50837d5faf7c7398e73c1f9e9ee5d19869":
             # ref: https://huggingface.co/kakaocorp/kanana-2-30b-a3b-instruct-2601
             res = "kanana2"
+        if chkhsh == "5f9861fd826d8e124b222f41f41b928e78d8f6c8fbdf25625d06cc1e8736662c":
+            # ref: https://huggingface.co/OpenLLM-France/Luciole-1B-Base
+            res = "qwen2"
 
         if res is None:
             logger.warning("\n")
@@ -1515,15 +1518,179 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     def _set_vocab_none(self) -> None:
         self.gguf_writer.add_tokenizer_model("none")
 
-    def _set_vocab_gpt2(self) -> None:
+    @staticmethod
+    def _gpt2_bytes_to_unicode() -> dict[int, str]:
+        # Returns the GPT-2 byte-to-unicode mapping: each byte (0-255) maps to a
+        # printable unicode character. Printable ASCII and Latin-1 supplement bytes
+        # map to themselves; the remaining bytes are shifted to 256+.
+        # This is the same as openai/gpt-2's bytes_to_unicode().
+        bs = list(range(ord("!"), ord("~") + 1)) + list(range(0xA1, 0xAC + 1)) + list(range(0xAE, 0xFF + 1))
+        cs = list(bs)
+        n = 0
+        for b in range(256):
+            if b not in bs:
+                bs.append(b)
+                cs.append(256 + n)
+                n += 1
+        return dict(zip(bs, (chr(c) for c in cs)))
+
+    def _set_vocab_gpt2(self, convert_metaspace_to_gpt2=False) -> list[str]:
         tokens, toktypes, tokpre = self.get_vocab_base()
+
+        if convert_metaspace_to_gpt2:
+            # The tokenizer uses raw UTF-8 with Metaspace (▁ for spaces), but
+            # the "gpt2" tokenizer model in llama.cpp expects GPT-2 byte encoding
+            # (where each byte is mapped to a printable unicode char, e.g. space -> Ġ).
+            # Convert all tokens: replace ▁ with a space, then apply GPT-2 byte encoding.
+            byte_encoder = self._gpt2_bytes_to_unicode()
+            seen: set[str] = set()
+            for i, token in enumerate(tokens):
+                if toktypes[i] in (gguf.TokenType.NORMAL, gguf.TokenType.USER_DEFINED):
+                    if token == " ":
+                        # Luciole's vocab contains a stray literal-space token; encode it
+                        # as "▁" so it cannot collide with the real space token once the
+                        # ▁ -> space replacement below has been applied.
+                        encoded = "".join(byte_encoder[b] for b in "\u2581".encode("utf-8"))
+                    else:
+                        encoded = "".join(byte_encoder[b] for b in token.replace("\u2581", " ").encode("utf-8"))
+                    assert encoded not in seen, f"Unexpected collision in GPT-2 byte encoding: {encoded!r} for '{token}'"
+                    seen.add(encoded)
+                    tokens[i] = encoded
+                else:  # gguf.TokenType.CONTROL: keep control/special tokens as-is
+                    assert token not in seen, f"Unexpected collision in GPT-2 byte encoding: {token}"
+                    seen.add(token)
+
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        if convert_metaspace_to_gpt2:
+            # Re-encode the merge rules the same way as the tokens above.
+            special_vocab.merges = [
+                " ".join(
+                    "".join(byte_encoder[b] for b in part.replace("\u2581", " ").encode("utf-8"))
+                    for part in merge.split(" ")
+                )
+                for merge in special_vocab.merges
+            ]
         special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens
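+
+    # Illustrative examples of the conversion above (a comment-only sketch, not
+    # executed anywhere), with byte_encoder = _gpt2_bytes_to_unicode():
+    #   "▁Bonjour" -> " Bonjour" -> "ĠBonjour"   (space 0x20 becomes 'Ġ', U+0120)
+    #   "café"     -> "cafÃ©"                    ('é' is the two bytes 0xC3 0xA9)
+    # Multi-byte UTF-8 stays byte-level, so the "gpt2" tokenizer model in
+    # llama.cpp can recover the original byte sequence exactly.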
+
+    def _set_vocab_bpe_as_spm(self) -> list[bytes]:
+        """Convert a HuggingFace BPE tokenizer (with Metaspace ▁) to SPM format for llama.cpp.
+
+        This reads the vocab from tokenizer.json, keeps tokens in their original
+        UTF-8 form (with ▁ preserved), assigns scores from merge ranks, and adds
+        the byte fallback tokens <0x00>-<0xFF> required by the SPM tokenizer in C++.
+        """
+        import json as _json
+        import re as _re
+
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+
+        reverse_vocab = {id_: tok for tok, id_ in tokenizer.vocab.items()}
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        # Build merge rank lookup: token_text -> rank (lower rank = merged earlier = higher priority)
+        merge_ranks: dict[str, int] = {}
+        merges_file = self.dir_model / "tokenizer.json"
+        if merges_file.is_file():
+            with open(merges_file, "r", encoding="utf-8") as f:
+                tokenizer_json = _json.load(f)
+            merges = tokenizer_json.get("model", {}).get("merges", [])
+            for rank, merge in enumerate(merges):
+                # a merge can be "token_a token_b" (str) or ["token_a", "token_b"] (list)
+                parts = merge.split(" ") if isinstance(merge, str) else merge
+                merged_token = "".join(parts)
+                if merged_token not in merge_ranks:
+                    merge_ranks[merged_token] = rank
+
+        # Prepare token arrays
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+
+        # Track which byte values are covered (for byte fallback)
+        byte_token_ids: dict[int, int] = {}
+
+        for token_id in range(vocab_size):
+            if token_id not in reverse_vocab:
+                continue
+
+            token_text = reverse_vocab[token_id]
+
+            if token_id in added_tokens_decoder:
+                info = added_tokens_decoder[token_id]
+                if info.special or self.does_token_look_special(token_text):
+                    tokens[token_id] = token_text.encode("utf-8")
+                    scores[token_id] = 0.0
+                    toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+                    continue
+
+            # Check if this is a byte fallback token (<0xHH>) or a single-byte token
+            raw_bytes = token_text.encode("utf-8")
+            byte_match = _re.fullmatch(r"<0x([0-9A-Fa-f]{2})>", token_text)
+            if byte_match:
+                byte_val = int(byte_match.group(1), 16)
+                byte_token_ids[byte_val] = token_id
+                tokens[token_id] = token_text.encode("utf-8")
+                scores[token_id] = -10000.0
+                toktypes[token_id] = SentencePieceTokenTypes.BYTE
+                continue
+            elif len(raw_bytes) == 1:
+                byte_token_ids[raw_bytes[0]] = token_id
+
+            # Assign a score based on merge rank or token_id
+            if token_text in merge_ranks:
+                # Merged tokens: earlier merges get higher (less negative) scores.
+                # Use the negative rank so that rank 0 (first merge) scores highest.
+                score = -float(merge_ranks[token_text])
+            elif len(raw_bytes) == 1:
+                # Base tokens (single bytes/chars) get a high score
+                score = 0.0
+            else:
+                # Tokens that come from neither the base vocab nor the merges:
+                # keep them strictly below every merged token
+                score = -10000.0 - float(token_id)
+
+            tokens[token_id] = raw_bytes
+            scores[token_id] = score
+            toktypes[token_id] = SentencePieceTokenTypes.NORMAL
+
+        # Add byte fallback tokens for any missing byte values:
+        # SPM in llama.cpp requires <0x00> through <0xFF> with BYTE type
+        next_pad_idx = 0
+        for byte_val in range(256):
+            if byte_val in byte_token_ids:
+                continue  # this byte is already covered by an existing token
+            hex_str = f"<0x{byte_val:02X}>"
+            # find an unused PAD slot
+            while next_pad_idx < len(tokens) and toktypes[next_pad_idx] != SentencePieceTokenTypes.UNUSED:
+                next_pad_idx += 1
+            if next_pad_idx < vocab_size:
+                tokens[next_pad_idx] = hex_str.encode("utf-8")
+                toktypes[next_pad_idx] = SentencePieceTokenTypes.BYTE
+                scores[next_pad_idx] = -10000.0
+                next_pad_idx += 1
+            else:
+                logger.warning(f"No room to add byte fallback token {hex_str}")
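+
+        # Worked example of the scoring above (illustrative only, assuming a toy
+        # tokenizer.json whose merges are ["▁B on", "▁Bon jour"]):
+        #   "▁Bon"     -> score -0.0  (merge rank 0)
+        #   "▁Bonjour" -> score -1.0  (merge rank 1)
+        #   single-byte base tokens -> 0.0, byte fallback <0xHH> -> -10000.0
+        # The SPM tokenizer in llama.cpp merges the highest-scoring pieces first,
+        # so earlier merges win and raw bytes are only used as a last resort.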
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens
 
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
@@ -9607,14 +9774,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+# Dev toggle for Luciole: export its Metaspace BPE vocab in GPT-2 byte-level
+# form instead of the default SPM-with-byte-fallback conversion.
+LUCIOLE_TO_BPE = False
+
+
 @ModelBase.register("NemotronForCausalLM")
 class NemotronModel(TextModel):
     model_arch = gguf.MODEL_ARCH.NEMOTRON
 
     def set_vocab(self):
-        self._set_vocab_sentencepiece()
-        self.gguf_writer.add_pad_token_id(0)
-        self.gguf_writer.add_unk_token_id(1)
+        if (self.dir_model / "tokenizer.model").is_file():
+            self._set_vocab_sentencepiece()
+            self.gguf_writer.add_pad_token_id(0)
+            self.gguf_writer.add_unk_token_id(1)
+        else:
+            # Luciole ships only tokenizer.json (BPE with Metaspace), no tokenizer.model
+            if LUCIOLE_TO_BPE:
+                tokens = self._set_vocab_gpt2(convert_metaspace_to_gpt2=True)
+                self.gguf_writer.add_pad_token_id(tokens.index("<pad>"))
+                self.gguf_writer.add_unk_token_id(tokens.index("<unk>"))
+            else:
+                tokens = self._set_vocab_bpe_as_spm()
+                self.gguf_writer.add_pad_token_id(tokens.index(b"<pad>"))
+                self.gguf_writer.add_unk_token_id(tokens.index(b"<unk>"))
+                self.gguf_writer.add_add_space_prefix(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -9645,6 +9825,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name.endswith("norm.weight"):
             data_torch = data_torch + 1
 
+        # for tied embeddings, duplicate token_embd as output.weight
+        if self.hparams.get("tie_word_embeddings", False) and name == "model.embed_tokens.weight":
+            yield (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)
+
         yield from super().modify_tensors(data_torch, name, bid)
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py
index 93e697607e6..374ae159c2e 100644
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -30,7 +30,7 @@ class LibLlama:
 
     DEFAULT_PATH_LLAMA_H = "./include/llama.h"
     DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
-    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
+    DEFAULT_PATH_LIBLLAMA = "./build/bin/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
 
     def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
         path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
@@ -79,6 +79,9 @@ def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
         self.model = self.lib.llama_model_load_from_file(path_model.encode(), mparams)
         if not self.model:
             raise RuntimeError("error: failed to load model '%s'" % path_model)
+        self.vocab = self.lib.llama_model_get_vocab(self.model)
+        if not self.vocab:
+            raise RuntimeError("error: failed to get vocab for model '%s'" % path_model)
         if isinstance(cparams, dict):
             cparams = libllama.context_default_params(**cparams)
         self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
@@ -99,10 +102,10 @@ def free(self):
 
     def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
         encoded_text: bytes = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+        num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
         while num < 0 and len(self.token_ids) < (16 << 20):
             self.token_ids = self.ffi.new("llama_token[]", -2 * num)
-            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
+            num = self.lib.llama_tokenize(self.vocab, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
         return list(self.token_ids[0:num])
 
     def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
@@ -110,10 +113,10 @@ def detokenize(self, ids: list[int], remove_special: bool = False, unparse_speci
             self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
         for i, id in enumerate(ids):
             self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+        num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         while num < 0 and len(self.text_buff) < (16 << 20):
             self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
+            num = self.lib.llama_detokenize(self.vocab, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
         return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
 
@@ -152,6 +155,9 @@ def encode(self, text: str) -> list[int]:
 
     def decode(self, ids: list[int]) -> str:
         return self.model.decode(ids, skip_special_tokens=False)
+
+    def convert_ids_to_tokens(self, ids: list[int]) -> list[str]:
+        return self.model.convert_ids_to_tokens(ids)
 
 
 class TokenizerLlamaCpp (Tokenizer):
@@ -204,6 +210,12 @@ def generator_custom_text() -> Iterator[str]:
         "\n =",
         "' era",
         "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
+    ]
+
+
+def generator_digit() -> Iterator[str]:
+    """Digits"""
+    yield from [
         "3",
         "33",
         "333",
@@ -213,6 +225,20 @@ def generator_custom_text() -> Iterator[str]:
         "3333333",
         "33333333",
         "333333333",
+        "333333333+333",
+    ]
+
+
+def generator_contractions() -> Iterator[str]:
+    """Contractions and apostrophes"""
+    yield from [
+        "I'll",
+        "We've they're",
+        "Bonjour quoiqu'aujourd'hui",
+        "puisqu'après",
+        "j're",
+        "“Bonjour quoiqu'aujourd'hui”",
+        "puisqu’après",
     ]
 
 
@@ -418,7 +444,7 @@ def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
         return min(len(ids1), len(ids2))
 
     def check_detokenizer(text: str, text1: str, text2: str) -> bool:
-        if text1 == text2:  # equal to TokenizerGroundtruth?
+        if text1 == text2 or text2 == text:  # equal to ground truth, or already to the source text?
             return True
         # equal to source text?
         if tokenizer1.add_bos_token and tokenizer1.bos_token and isinstance(tokenizer1.bos_token, str):  # remove BOS
@@ -436,7 +462,7 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
 
     t_start = time.perf_counter()
     encode_errors = 0
     decode_errors = 0
-    MAX_ERRORS = 10
+    MAX_ERRORS = 20
 
     logger.info("%s: %s" % (generator.__qualname__, "ini"))
    for text in generator:
@@ -455,23 +481,30 @@ def check_detokenizer(text: str, text1: str, text2: str) -> bool:
         t_encode2 += t2 - t1
         t_decode1 += t3 - t2
         t_decode2 += t4 - t3
-        if encode_errors < MAX_ERRORS and ids1 != ids2:
+        had_error = False
+        if (MAX_ERRORS is None or encode_errors < MAX_ERRORS) and ids1 != ids2:
             i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error("  Expected: " + str(ids1))
-            logger.error("    Result: " + str(ids2))
+            ids1_ctx = list(ids1)[max(0, i - 2) : i + 5 + 1]
+            ids2_ctx = list(ids2)[max(0, i - 2) : i + 5 + 1]
+            logger.error(f"     Input: {repr(text[:100])}")
+            logger.error("  Expected: " + str(ids1_ctx) + " " + str(tokenizer1.convert_ids_to_tokens(ids1_ctx)))
+            logger.error("    Result: " + str(ids2_ctx) + " " + str(tokenizer1.convert_ids_to_tokens(ids2_ctx)))
             encode_errors += 1
-            logger.error(f"  {encode_errors=}")
+            had_error = True
-        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
+        if (MAX_ERRORS is None or decode_errors < MAX_ERRORS) and not check_detokenizer(text, text1, text2):
             i = find_first_mismatch(text1, text2)
-            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
-            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
-            logger.error("  Expected: " + " ".join(hex(ord(x)) for x in text1))
-            logger.error("    Result: " + " ".join(hex(ord(x)) for x in text2))
+            text1_ctx = text1[max(0, i - 2) : i + 5 + 1]
+            text2_ctx = text2[max(0, i - 2) : i + 5 + 1]
+            logger.error(f"     Input: {repr(text[:100])}")
+            logger.error("  Expected: " + repr(text1_ctx))
+            logger.error("    Result: " + repr(text2_ctx))
             decode_errors += 1
-            logger.error(f"  {decode_errors=}")
+            had_error = True
+        if had_error:
+            logger.error("")
+        if MAX_ERRORS is not None and encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
-        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
             logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
             # raise Exception()
             break
@@ -493,74 +526,76 @@ def main(argv: list[str] | None = None):
     tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
     tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
 
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_digit())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_contractions())
+    compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
     compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
     compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
     compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
     compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
     compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
+    compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
 
     tokenizer2.model.free()
 
 
 if __name__ == "__main__":
-    # main()
-
-    if True:
-        logging.basicConfig(
-            level = logging.DEBUG,
-            format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
-            datefmt = "%Y-%m-%d %H:%M:%S",
-            filename = logger.name + ".log",
-            filemode = "a"
-        )
-        logging.basicConfig(
-            level = logging.DEBUG,
-            format = "%(levelname)s %(message)s",
-        )
-
-        path_tokenizers = Path("./models/tokenizers/")
-        path_vocab_format = "./models/ggml-vocab-%s.gguf"
-
-        tokenizers = [
-            "llama-spm",      # SPM
-            "phi-3",          # SPM
-            "gemma",          # SPM
-            "gemma-2",        # SPM
-            "baichuan",       # SPM
-            "bert-bge",       # WPM
-            "jina-v2-en",     # WPM
-            "llama-bpe",      # BPE
-            "phi-2",          # BPE
-            "deepseek-llm",   # BPE
-            "deepseek-coder", # BPE
-            "falcon",         # BPE
-            "mpt",            # BPE
-            "starcoder",      # BPE
-            "gpt-2",          # BPE
-            "stablelm2",      # BPE
-            "refact",         # BPE
-            "qwen2",          # BPE
-            "olmo",           # BPE
-            "jina-v2-es",     # BPE
-            "jina-v2-de",     # BPE
-            "smaug-bpe",      # BPE
-            "poro-chat",      # BPE
-            "jina-v2-code",   # BPE
-            "viking",         # BPE
-            "jais",           # BPE
-        ]
-
-        logger.info("=" * 50)
-        for tokenizer in tokenizers:
-            logger.info("-" * 50)
-            logger.info(f"TOKENIZER: '{tokenizer}'")
-            vocab_file = Path(path_vocab_format % tokenizer)
-            dir_tokenizer = path_tokenizers / tokenizer
-            main([str(vocab_file), str(dir_tokenizer), "--verbose"])
+    main()
+
+    # if True:
+    #     logging.basicConfig(
+    #         level = logging.DEBUG,
+    #         format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
+    #         datefmt = "%Y-%m-%d %H:%M:%S",
+    #         filename = logger.name + ".log",
+    #         filemode = "a"
+    #     )
+    #     logging.basicConfig(
+    #         level = logging.DEBUG,
+    #         format = "%(levelname)s %(message)s",
+    #     )
+
+    #     path_tokenizers = Path("./models/tokenizers/")
+    #     path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    #     tokenizers = [
+    #         "llama-spm",      # SPM
+    #         "phi-3",          # SPM
+    #         "gemma",          # SPM
+    #         "gemma-2",        # SPM
+    #         "baichuan",       # SPM
+    #         "bert-bge",       # WPM
+    #         "jina-v2-en",     # WPM
+    #         "llama-bpe",      # BPE
+    #         "phi-2",          # BPE
+    #         "deepseek-llm",   # BPE
+    #         "deepseek-coder", # BPE
+    #         "falcon",         # BPE
+    #         "mpt",            # BPE
+    #         "starcoder",      # BPE
+    #         "gpt-2",          # BPE
+    #         "stablelm2",      # BPE
+    #         "refact",         # BPE
+    #         "qwen2",          # BPE
+    #         "olmo",           # BPE
+    #         "jina-v2-es",     # BPE
+    #         "jina-v2-de",     # BPE
+    #         "smaug-bpe",      # BPE
+    #         "poro-chat",      # BPE
+    #         "jina-v2-code",   # BPE
+    #         "viking",         # BPE
+    #         "jais",           # BPE
+    #     ]
+
+    #     logger.info("=" * 50)
+    #     for tokenizer in tokenizers:
+    #         logger.info("-" * 50)
+    #         logger.info(f"TOKENIZER: '{tokenizer}'")
+    #         vocab_file = Path(path_vocab_format % tokenizer)
+    #         dir_tokenizer = path_tokenizers / tokenizer
+    #         main([str(vocab_file), str(dir_tokenizer), "--verbose"])
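
A quick way to sanity-check a conversion (a sketch, not part of the patch; the
paths are hypothetical, and the class that owns the new helper may be named
differently in convert_hf_to_gguf.py):

    # verify the two invariants the GPT-2 byte-level conversion relies on
    from convert_hf_to_gguf import TextModel  # hypothetical import target

    m = TextModel._gpt2_bytes_to_unicode()
    assert len(m) == 256 and len(set(m.values())) == 256  # bijection over all 256 bytes
    assert m[0x20] == "\u0120"  # space -> 'Ġ', as the "gpt2" tokenizer model expects

Once a GGUF has been produced, the updated harness compares it against the
HuggingFace ground truth directly:

    python3 tests/test-tokenizer-random.py ./models/ggml-vocab-luciole.gguf ./models/tokenizers/luciole --verbose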