change deprecated llama_eval to llama_decode
guinmoon committed Mar 20, 2024
1 parent 01d07c4 commit f70a553
Showing 4 changed files with 94 additions and 63 deletions.
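At a glance, the migration this commit performs: the deprecated single-call llama_eval(ctx, tokens, n_tokens, n_past) is replaced by staging tokens into a llama_batch and submitting it with llama_decode. Below is a minimal sketch of the new call shape, reusing the helper names that appear in LLaMa.swift further down (llama_batch_clear, llama_batch_add); their exact signatures are assumed from how they are used in the diff.

// Deprecated path (removed in this commit):
//   if llama_eval(context, tokens.mutPtr, Int32(tokens.count), nPast) != 0 { return false }
//
// New path: fill a llama_batch with (token, position, sequence id, want-logits) and decode it.
func decodeTokens(_ tokens: [ModelToken], nPast: Int32,
                  context: OpaquePointer, batch: inout llama_batch) -> Bool {
    llama_batch_clear(&batch)
    for (i, token) in tokens.enumerated() {
        // Positions continue from nPast; sequence id 0; request logits for each token.
        llama_batch_add(&batch, token, nPast + Int32(i), [0], true)
    }
    return llama_decode(context, batch) == 0
}
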
2 changes: 1 addition & 1 deletion Sources/llmfarm_core/AI.swift
@@ -351,7 +351,7 @@ public func get_model_context_param_by_config(_ model_config:Dictionary<String,
}

if (model_config["prompt_format"] != nil && model_config["prompt_format"]! as! String != "auto"
&& model_config["prompt_format"]! as! String != "{{prompt}}"){
&& model_config["prompt_format"]! as! String != "{prompt}"){
tmp_param.custom_prompt_format = model_config["prompt_format"]! as! String
(tmp_param.custom_prompt_format,tmp_param.system_prompt) = get_system_prompt(tmp_param.custom_prompt_format)
tmp_param.promptFormat = .Custom
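For context, a self-contained sketch of the decision this hunk adjusts: any prompt_format value other than "auto" or the bare prompt placeholder is treated as a user-supplied template. The dictionary value, the placeholder literal, and the variable names below are hypothetical stand-ins for the real model_config and tmp_param fields.

// Hypothetical stand-ins — illustration only.
let barePlaceholder = "{prompt}"   // assumption: the placeholder literal the check above compares against
let model_config: [String: Any] = [
    "prompt_format": "### Instruction:\n{prompt}\n### Response:"
]

var customFormat = "auto"
var useCustomFormat = false

if let fmt = model_config["prompt_format"] as? String,
   fmt != "auto", fmt != barePlaceholder {
    // Anything other than "auto" or the bare placeholder is a custom template.
    customFormat = fmt
    useCustomFormat = true
}
print(useCustomFormat ? "custom prompt format: \(customFormat)" : "default prompt format")
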
24 changes: 13 additions & 11 deletions Sources/llmfarm_core/LLMBase.swift
@@ -340,7 +340,7 @@ public class LLMBase {
if inputTokens.count == 0{
return "Empty input."
}
self.session_tokens.append(contentsOf: inputTokens)
// self.session_tokens.append(contentsOf: inputTokens)
let inputTokensCount = inputTokens.count
print("Input tokens: \(inputTokens)")
// Add new input tokens to past array
@@ -382,8 +382,8 @@
var outputTokens: [ModelToken] = []
var output = [String]()
// Loop until target count is reached
var outputEnabled = true
while outputEnabled {
var completion_loop = true
while completion_loop {
// Pull a generation from context
var outputToken:Int32 = -1
try ExceptionCather.catchException {
@@ -407,14 +407,15 @@
}
// Add output token to array
outputTokens.append(outputToken)
past.append([outputToken])
// Repeat tokens update
outputRepeatTokens.append(outputToken)
if outputRepeatTokens.count > params.repeat_last_n {
outputRepeatTokens.removeFirst()
}
// Check for eos - end early - check eos before bos in case they are the same
if outputToken == llm_token_eos() {
outputEnabled = false
completion_loop = false
print("[EOS]")
break
}
@@ -425,23 +426,24 @@
skipCallback = true
}
// Convert token to string and callback
self.session_tokens.append(outputToken)
// self.session_tokens.append(outputToken)
if !skipCallback, let str = llm_token_to_str(outputToken: outputToken){
output.append(str)
// Per token callback
let (output, time) = Utils.time {
return str
}
if callback(output, time) {
let (output, time) = Utils.time {
return str
}
if callback(output, time) {
// if callback(output, 0) {
// Early exit if requested by callback
print(" * exit requested by callback *")
//generating = false
outputEnabled = false //outputRemaining = 0
completion_loop = false //outputRemaining = 0
break
}
}
// Check if we need to run another response eval
if outputEnabled {
if completion_loop {
// Send generated token back into model for next generation
var eval_res:Bool? = nil
if self.nPast >= self.contextParams.context - 4{
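The hunks above rename the loop flag (outputEnabled becomes completion_loop) and keep generated tokens in the past array instead of session_tokens. A simplified, self-contained sketch of the generation loop's shape follows; the closures sampleToken, tokenToString, feedBack and the eosToken value are assumptions standing in for the class's methods.

func generate(maxRepeatWindow: Int,
              eosToken: Int32,
              sampleToken: () -> Int32,
              tokenToString: (Int32) -> String?,
              feedBack: (Int32) -> Bool,
              callback: (String, Double) -> Bool) -> String {
    var outputRepeatTokens: [Int32] = []
    var output: [String] = []
    var completion_loop = true
    while completion_loop {
        let token = sampleToken()                     // pull one token from the model
        outputRepeatTokens.append(token)              // sliding window used for repeat penalties
        if outputRepeatTokens.count > maxRepeatWindow {
            outputRepeatTokens.removeFirst()
        }
        if token == eosToken {                        // end of sequence: stop generating
            completion_loop = false
            break
        }
        if let piece = tokenToString(token) {
            output.append(piece)
            if callback(piece, 0) {                   // caller can request an early exit
                completion_loop = false
                break
            }
        }
        if completion_loop {
            _ = feedBack(token)                       // send the token back for the next step
        }
    }
    return output.joined()
}

In LLMBase the feed-back step is what ends up calling llm_eval, i.e. the llama_decode path shown in LLaMa.swift below.
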
129 changes: 79 additions & 50 deletions Sources/llmfarm_core/LLaMa.swift
@@ -12,15 +12,37 @@ var LLaMa_obj:LLaMa? = nil
public class LLaMa: LLMBase {

public var model: OpaquePointer?
public var ctx_sampling: OpaquePointer?
public var batch: llama_batch?
public var hardware_arch: String=""
public var temporary_invalid_cchars: [CChar] = []
public var progressCallback: ((Float) -> (Bool))? = nil
// public var sparams: llama_sampling_params?

// int32_t n_prev = 64; // number of previous tokens to remember
// int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
// int32_t top_k = 40; // <= 0 to use vocab size
// float top_p = 0.95f; // 1.0 = disabled
// float min_p = 0.05f; // 0.0 = disabled
// float tfs_z = 1.00f; // 1.0 = disabled
// float typical_p = 1.00f; // 1.0 = disabled
// float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
// float dynatemp_range = 0.00f; // 0.0 = disabled
// float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
// int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
// float penalty_repeat = 1.10f; // 1.0 = disabled
// float penalty_freq = 0.00f; // 0.0 = disabled
// float penalty_present = 0.00f; // 0.0 = disabled
// int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
// float mirostat_tau = 5.00f; // target entropy
// float mirostat_eta = 0.10f; // learning rate
// bool penalize_nl = true;

public override func llm_load_model(path: String = "", contextParams: ModelAndContextParams = .default, params:gpt_context_params,
model_load_progress_callback:((Float) -> (Bool))?) throws -> Bool{
var context_params = llama_context_default_params()
var model_params = llama_model_default_params()
// self.ctx_sampling = llama_sampling_init(sparams);
context_params.n_ctx = UInt32(contextParams.context)
context_params.seed = UInt32(contextParams.seed)
context_params.n_threads = UInt32(contextParams.n_threads)
@@ -150,31 +172,69 @@ public class LLaMa: LLMBase {
// }

public override func llm_eval(inputBatch:[ModelToken]) throws -> Bool{
var mutable_inputBatch = inputBatch
if llama_eval(self.context, mutable_inputBatch.mutPtr, Int32(inputBatch.count), min(self.contextParams.context, self.nPast)) != 0 {
return false
// var mutable_inputBatch = inputBatch
// if llama_eval(self.context, mutable_inputBatch.mutPtr, Int32(inputBatch.count), min(self.contextParams.context, self.nPast)) != 0 {
// return false
// }
if self.nPast==0{
// if self.nPast==0 || inputBatch.count>1{
completion_init(tokens_list:inputBatch)
}else{
llama_batch_clear(&batch!)
for i1 in 0..<inputBatch.count {
let i = Int(i1)
llama_batch_add(&batch!, inputBatch[i], Int32(i)+self.nPast, [0], true)
}
// batch!.logits[Int(batch!.n_tokens) - 1] = 1
if llama_decode(context, batch!) != 0 {
print("failed to evaluate llama!")
return false
}
}
// if self.nPast==0{
// completion_init(tokens_list:inputBatch)
// }else{
// llama_batch_clear(&batch!)
// for i1:Int32 in 0..<Int32(inputBatch.count) {
// llama_batch_add(&batch!, inputBatch[Int(i1)], self.nPast+i1, [0], false)
// }
// batch!.logits[Int(batch!.n_tokens) - 1] = 1
//// llama_batch_add(&batch!, inputBatch[0], self.nPast, [0], true)
return true
}

func completion_init(tokens_list: [ModelToken]) {
// print("attempting to complete \"\(text)\"")

// tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []

// let n_ctx = llama_n_ctx(context)
// let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
//
//// n_decode += 1
//// n_cur += 1
// print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
//
// if llama_decode(context, batch!) != 0 {
// print("failed to evaluate llama!")
// }
// if n_kv_req > n_ctx {
// print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
// }
return true

// for id in tokens_list {
// print(String(cString: token_to_piece(token: id) + [0]))
// }

llama_batch_clear(&batch!)

for i1 in 0..<tokens_list.count {
let i = Int(i1)
llama_batch_add(&batch!, tokens_list[i], Int32(i), [0], false)
}
batch!.logits[Int(batch!.n_tokens) - 1] = 1

if llama_decode(context, batch!) != 0 {
print("llama_decode() failed")
}

// n_cur = batch.n_tokens
}


func sample_wip(){
var new_token_id: llama_token = 0


}

override func llm_init_logits() throws -> Bool {
return true
}
@@ -215,38 +275,7 @@ public class LLaMa: LLMBase {
return SwiftString
}

func completion_init(tokens_list: [ModelToken]) {
// print("attempting to complete \"\(text)\"")

// tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []

// let n_ctx = llama_n_ctx(context)
// let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
//
// print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
//
// if n_kv_req > n_ctx {
// print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
// }

// for id in tokens_list {
// print(String(cString: token_to_piece(token: id) + [0]))
// }

llama_batch_clear(&batch!)

for i1:Int32 in 0..<Int32(tokens_list.count) {
llama_batch_add(&batch!, tokens_list[Int(i1)], i1, [0], false)
}
batch!.logits[Int(batch!.n_tokens) - 1] = 1 // true

if llama_decode(context, batch!) != 0 {
print("llama_decode() failed")
}

// n_cur = batch.n_tokens
}


private func token_to_piece(token: Int32) -> [CChar] {
let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
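The rewritten llm_eval above splits evaluation into two paths: on the first call (nPast == 0) the whole prompt is staged through completion_init with logits requested only for the last token, and on subsequent calls each new token is added to the llama_batch at its absolute position and submitted with llama_decode. A condensed sketch of that flow, with the state passed in as parameters and the helper signatures assumed from their use in the diff; unlike the diff, it also reports failure from the prefill path.

func evalTokens(_ inputBatch: [ModelToken], nPast: Int32,
                context: OpaquePointer, batch: inout llama_batch) -> Bool {
    llama_batch_clear(&batch)
    if nPast == 0 {
        // Prefill: stage the whole prompt; only the last token needs logits.
        for (i, token) in inputBatch.enumerated() {
            llama_batch_add(&batch, token, Int32(i), [0], false)
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1
    } else {
        // Incremental decode: each new token continues from position nPast.
        for (i, token) in inputBatch.enumerated() {
            llama_batch_add(&batch, token, Int32(i) + nPast, [0], true)
        }
    }
    if llama_decode(context, batch) != 0 {
        print("llama_decode() failed")
        return false
    }
    return true
}
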
2 changes: 1 addition & 1 deletion Sources/llmfarm_core_cpp/ggml/ggml.h
@@ -258,7 +258,7 @@
do { \
if (!(x)) { \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
char descr[500]; \
char descr[700]; \
sprintf(descr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x);\
throw_exception(descr); \
} \