diff --git a/api.go b/api.go
index 7265ac8..0d6204c 100644
--- a/api.go
+++ b/api.go
@@ -110,37 +110,29 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	// Get the next token from LLaMA
+	// Perform the LLaMA evaluation step
+	evalTokenStart := i - 1
+	evalTokenCount := 1
+	evalTokenPast := i
 	if i == int(llast_n_tokens_used_size) {
-
-		log.Println("doing llama_eval (for the first time on all supplied input)...")
-
-		evalErr := C.llama_eval(lcontext,
-			&llast_n_tokens[0], C.int(i), // tokens + n_tokens is the provided batch of new tokens to process
-			C.int(0), // n_past is the number of tokens to use from previous eval calls
-			C.int(runtime.GOMAXPROCS(0)))
-		if evalErr != 0 {
-			log.Printf("llama_eval: %d", evalErr)
-			http.Error(w, "Internal error", 500)
-			return
-		}
-
-	} else {
-
-		log.Println("doing llama_eval (incrementally on the newly generated token)...")
-
-		evalErr := C.llama_eval(lcontext,
-			&llast_n_tokens[i-1], 1, // tokens + n_tokens is the provided batch of new tokens to process
-			C.int(i), // n_past is the number of tokens to use from previous eval calls
-			C.int(runtime.GOMAXPROCS(0)))
-		if evalErr != 0 {
-			log.Printf("llama_eval: %d", evalErr)
-			http.Error(w, "Internal error", 500)
-			return
-		}
+		evalTokenStart = 0
+		evalTokenCount = i
+		evalTokenPast = 0
 	}
 
+	evalStartTime := time.Now()
+	evalErr := C.llama_eval(lcontext,
+		&llast_n_tokens[evalTokenStart], C.int(evalTokenCount), // tokens + n_tokens is the provided batch of new tokens to process
+		C.int(evalTokenPast), // n_past is the number of tokens to use from previous eval calls
+		C.int(runtime.GOMAXPROCS(0)))
+
+	log.Printf("llama_eval: Evaluated %d token(s) in %s", evalTokenCount, time.Now().Sub(evalStartTime).String())
+	if evalErr != 0 {
+		log.Printf("llama_eval: %d", evalErr)
+		http.Error(w, "Internal error", 500)
+		return
+	}
 
 	if err := r.Context().Err(); err != nil {
 		return
 	}
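
A minimal sketch of the parameter selection this patch consolidates, pulled out into a pure function so the two cases are easy to see side by side. The names evalWindow and promptLen are hypothetical (the patch works inline with i and llast_n_tokens_used_size), and the real call goes through cgo to C.llama_eval, which this sketch does not attempt to reproduce.

package main

import "fmt"

// evalWindow mirrors the assignments in the patch: it returns the index of
// the first token to evaluate, how many tokens to evaluate, and the n_past
// value to pass to llama_eval.
func evalWindow(i, promptLen int) (start, count, past int) {
	// Steady state: evaluate only the newest token incrementally.
	start, count, past = i-1, 1, i
	if i == promptLen {
		// First call: evaluate the whole supplied prompt as one batch,
		// with no tokens carried over from previous eval calls.
		start, count, past = 0, i, 0
	}
	return
}

func main() {
	fmt.Println(evalWindow(5, 5)) // first pass over a 5-token prompt: 0 5 0
	fmt.Println(evalWindow(6, 5)) // subsequent generation step: 5 1 6
}

Folding both branches into a single llama_eval call also means the timing log and the error check are written once, which is the main readability win of the diff.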