api: reduce log verbosity, log the time-per-token
parent 07c5ca1015
commit 3ff357b7d4

api.go — 23 changed lines (+8 −15)
@@ -5,6 +5,7 @@ import (
 	"log"
 	"net/http"
 	"runtime"
+	"time"
 )
 
 /*
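The new "time" import is what the time-per-token logging from the commit title relies on. The call site isn't visible in the hunks on this page, so the following is only a sketch of what such a measurement could look like; the helper name and its parameters are illustrative, not part of the commit.

	// Hypothetical helper (not in this diff): wraps one generation step and
	// logs its duration, using only the "time" and "log" imports shown above.
	func logTokenTime(i int, step func()) {
		start := time.Now()
		step()
		log.Printf("token %d took %v", i, time.Since(start))
	}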
@@ -54,6 +55,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 	// TODO
 
 	// Wait for a free worker
+	// TODO signal the queue length to the user?
 	select {
 	case this.sem <- struct{}{}:
 		// OK
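For context on the hunk above: this.sem is used as a counting semaphore, i.e. a buffered channel whose capacity equals the worker count, where sending reserves a slot and receiving releases it. A minimal sketch of the pattern, including one way the new queue-length TODO could be answered, might look like the following (only the select-on-sem idiom comes from the diff; the capacity, the default branch, and the log message are assumptions):

	// sem acts as a counting semaphore: cap(sem) concurrent workers.
	sem := make(chan struct{}, runtime.NumCPU())

	select {
	case sem <- struct{}{}:
		// A worker slot was free immediately.
	default:
		// All slots are taken; this is where the queue length could be
		// signalled back to the user before blocking.
		log.Printf("all %d workers busy, request will wait", cap(sem))
		sem <- struct{}{} // block until a slot frees up
	}
	defer func() { <-sem }() // release the slot when the handler returns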
@@ -84,11 +86,9 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 
 	llast_n_tokens := make([]C.llama_token, ParamContextSize)
 
-	log.Println("tokenizing supplied prompt...")
-
 	llast_n_tokens_used_size := C.llama_tokenize(lcontext, C.CString(apiParams.Content), &llast_n_tokens[0], ParamContextSize, true)
 	if llast_n_tokens_used_size <= 0 {
-		log.Printf("llama_tokenize returned non-positive size (%d)", llast_n_tokens_used_size)
+		log.Printf("llama_tokenize: got non-positive size (%d)", llast_n_tokens_used_size)
 		http.Error(w, "Internal error", 500)
 		return
 	}
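A side note on the llama_tokenize call kept above, unrelated to this commit's changes: C.CString copies the prompt onto the C heap, and that copy is never freed here. A possible cleanup, assuming the cgo preamble includes <stdlib.h> and the file imports "unsafe", would be:

	// Sketch only, not part of this commit: keep the C copy of the prompt in
	// a variable so it can be freed once tokenization has consumed it.
	cPrompt := C.CString(apiParams.Content)
	defer C.free(unsafe.Pointer(cPrompt))

	llast_n_tokens_used_size := C.llama_tokenize(lcontext, cPrompt, &llast_n_tokens[0], ParamContextSize, true)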
@@ -137,9 +137,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	//
-
-	log.Println("doing llama_sample_top_p_top_k...")
+	// Perform the LLaMA sampling step
 
 	penalizeStart := 0
 	penalizeLen := i
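penalizeStart and penalizeLen select the slice of llast_n_tokens that the sampling call in the next hunk penalizes for repetition; initialised like this they cover every token generated so far. The lines between this hunk and the next (not shown on this page) presumably clamp that window to a recent-token limit, roughly as sketched below; ParamRepeatLastN is an assumed name, not visible in this diff:

	// Sketch: restrict the repeat penalty to the most recent
	// ParamRepeatLastN tokens rather than the entire history.
	penalizeStart := 0
	penalizeLen := i
	if i > ParamRepeatLastN {
		penalizeStart = i - ParamRepeatLastN
		penalizeLen = ParamRepeatLastN
	}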
@@ -149,12 +147,8 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 	}
 
 	newTokenId := C.llama_sample_top_p_top_k(lcontext,
-		// Penalize recent tokens
-		&llast_n_tokens[penalizeStart], C.int(penalizeLen),
-
-		// Other static parameters
-		ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty)
-
+		&llast_n_tokens[penalizeStart], C.int(penalizeLen), // Penalize recent tokens
+		ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty) // Other static parameters
 
 	if newTokenId == C.llama_token_eos() {
 		// The model doesn't have anything to say
@@ -170,11 +164,10 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 		// The model did have something to say
 		tokenStr := C.GoString(C.llama_token_to_str(lcontext, newTokenId))
 
-		// Push this new token into the lembedding_ state, or else we'll just get it over and over again
+		// Push this new token into the llast_n_tokens state, or else we'll just get it over and over again
 		llast_n_tokens[i] = newTokenId
 
-		// time.Sleep(1 * time.Second)
-		w.Write([]byte(tokenStr)) // fmt.Sprintf(" update %d", i)))
+		w.Write([]byte(tokenStr))
 		flusher.Flush()
 	}
 }
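The w.Write/flusher.Flush pair above is what streams each token to the client as soon as it is sampled. The flusher variable is set up earlier in the handler, outside the hunks shown here; the usual net/http idiom, which this code presumably follows, is:

	// Standard streaming setup: not every ResponseWriter can flush, so check
	// for the http.Flusher interface before relying on it.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "streaming unsupported", http.StatusInternalServerError)
		return
	}

	// Later, after each generated token:
	w.Write([]byte(tokenStr))
	flusher.Flush() // push the partial response body to the client now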