api: reduce log verbosity, log the time-per-token

This commit is contained in:
mappu 2023-04-08 16:04:43 +12:00
parent 07c5ca1015
commit 3ff357b7d4

23
api.go
View File

@ -5,6 +5,7 @@ import (
"log" "log"
"net/http" "net/http"
"runtime" "runtime"
"time"
) )
/* /*
@ -54,6 +55,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
// TODO // TODO
// Wait for a free worker // Wait for a free worker
// TODO signal the queue length to the user?
select { select {
case this.sem <- struct{}{}: case this.sem <- struct{}{}:
// OK // OK
@ -84,11 +86,9 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
llast_n_tokens := make([]C.llama_token, ParamContextSize) llast_n_tokens := make([]C.llama_token, ParamContextSize)
log.Println("tokenizing supplied prompt...")
llast_n_tokens_used_size := C.llama_tokenize(lcontext, C.CString(apiParams.Content), &llast_n_tokens[0], ParamContextSize, true) llast_n_tokens_used_size := C.llama_tokenize(lcontext, C.CString(apiParams.Content), &llast_n_tokens[0], ParamContextSize, true)
if llast_n_tokens_used_size <= 0 { if llast_n_tokens_used_size <= 0 {
log.Printf("llama_tokenize returned non-positive size (%d)", llast_n_tokens_used_size) log.Printf("llama_tokenize: got non-positive size (%d)", llast_n_tokens_used_size)
http.Error(w, "Internal error", 500) http.Error(w, "Internal error", 500)
return return
} }
@ -137,9 +137,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
return return
} }
// // Perform the LLaMA sampling step
log.Println("doing llama_sample_top_p_top_k...")
penalizeStart := 0 penalizeStart := 0
penalizeLen := i penalizeLen := i
@ -149,12 +147,8 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
} }
newTokenId := C.llama_sample_top_p_top_k(lcontext, newTokenId := C.llama_sample_top_p_top_k(lcontext,
&llast_n_tokens[penalizeStart], C.int(penalizeLen), // Penalize recent tokens
// Penalize recent tokens ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty) // Other static parameters
&llast_n_tokens[penalizeStart], C.int(penalizeLen),
// Other static parameters
ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty)
if newTokenId == C.llama_token_eos() { if newTokenId == C.llama_token_eos() {
// The model doesn't have anything to say // The model doesn't have anything to say
@ -170,11 +164,10 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
// The model did have something to say // The model did have something to say
tokenStr := C.GoString(C.llama_token_to_str(lcontext, newTokenId)) tokenStr := C.GoString(C.llama_token_to_str(lcontext, newTokenId))
// Push this new token into the lembedding_ state, or else we'll just get it over and over again // Push this new token into the llast_n_tokens state, or else we'll just get it over and over again
llast_n_tokens[i] = newTokenId llast_n_tokens[i] = newTokenId
// time.Sleep(1 * time.Second) w.Write([]byte(tokenStr))
w.Write([]byte(tokenStr)) // fmt.Sprintf(" update %d", i)))
flusher.Flush() flusher.Flush()
} }
} }