Skip to content

Commit

Permalink
gguf : make token scores and types optional (ggerganov#3347)
Browse files: browse the repository at this point in the history
  • Loading branch information
cebtenzzre committed Sep 28, 2023
1 parent 2619109 commit ecf90b1
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 22 deletions.
6 changes: 0 additions & 6 deletions convert-falcon-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,6 @@ def parse_args() -> argparse.Namespace:
print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
Expand Down Expand Up @@ -177,12 +175,8 @@ def parse_args() -> argparse.Namespace:
text = bytearray(pad_token)

tokens.append(text)
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL) # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
Expand Down
6 changes: 0 additions & 6 deletions convert-starcoder-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,6 @@ def parse_args() -> argparse.Namespace:
print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

tokenizer_json_file = dir_model / 'tokenizer.json'
if not tokenizer_json_file.is_file():
Expand Down Expand Up @@ -161,12 +159,8 @@ def parse_args() -> argparse.Namespace:
text = bytearray(pad_token)

tokens.append(text)
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL) # dummy

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)
Expand Down
18 changes: 8 additions & 10 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1931,20 +1931,18 @@ static void llm_load_vocab(
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
}

const float * scores = nullptr;
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
if (score_idx == -1) {
throw std::runtime_error("cannot find tokenizer scores in model file\n");
if (score_idx != -1) {
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}

const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);

const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx == -1) {
throw std::runtime_error("cannot find token type list in GGUF file\n");
if (toktype_idx != -1) {
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
}

const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);

// determine vocab type
{
std::string tokenizer_name;
Expand Down Expand Up @@ -2012,8 +2010,8 @@ static void llm_load_vocab(

auto & token_data = vocab.id_to_token[i];
token_data.text = std::move(word);
token_data.score = scores[i];
token_data.type = (llama_token_type) toktypes[i];
token_data.score = scores ? scores[i] : 0.0f;
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
}

// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
Expand Down

0 comments on commit ecf90b1

Please sign in to comment.