api: reduce log verbosity, log the time-per-token
parent 07c5ca1015
commit 3ff357b7d4

api.go — 23 changed lines (+8 −15)
@@ -5,6 +5,7 @@ import (
 	"log"
 	"net/http"
 	"runtime"
+	"time"
 )
 
 /*
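The new "time" import is what the time-per-token logging from the commit title relies on. The call site isn't visible in the hunks on this page, so the following is only a sketch of what such a measurement could look like; the helper name and its parameters are illustrative, not part of the commit.

	// Hypothetical helper (not in this diff): wraps one generation step and
	// logs its duration, using only the "time" and "log" imports shown above.
	func logTokenTime(i int, step func()) {
		start := time.Now()
		step()
		log.Printf("token %d took %v", i, time.Since(start))
	}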
@@ -54,6 +55,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 	// TODO
 
 	// Wait for a free worker
+	// TODO signal the queue length to the user?
 	select {
 	case this.sem <- struct{}{}:
 		// OK
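For context on the hunk above: this.sem is used as a counting semaphore, i.e. a buffered channel whose capacity equals the worker count, where sending reserves a slot and receiving releases it. A minimal sketch of the pattern, including one way the new queue-length TODO could be answered, might look like the following (only the select-on-sem idiom comes from the diff; the capacity, the default branch, and the log message are assumptions):

	// sem acts as a counting semaphore: cap(sem) concurrent workers.
	sem := make(chan struct{}, runtime.NumCPU())

	select {
	case sem <- struct{}{}:
		// A worker slot was free immediately.
	default:
		// All slots are taken; this is where the queue length could be
		// signalled back to the user before blocking.
		log.Printf("all %d workers busy, request will wait", cap(sem))
		sem <- struct{}{} // block until a slot frees up
	}
	defer func() { <-sem }() // release the slot when the handler returns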
@@ -84,11 +86,9 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 
 	llast_n_tokens := make([]C.llama_token, ParamContextSize)
 
-	log.Println("tokenizing supplied prompt...")
-
 	llast_n_tokens_used_size := C.llama_tokenize(lcontext, C.CString(apiParams.Content), &llast_n_tokens[0], ParamContextSize, true)
 	if llast_n_tokens_used_size <= 0 {
-		log.Printf("llama_tokenize returned non-positive size (%d)", llast_n_tokens_used_size)
+		log.Printf("llama_tokenize: got non-positive size (%d)", llast_n_tokens_used_size)
 		http.Error(w, "Internal error", 500)
 		return
 	}
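A side note on the llama_tokenize call kept above, unrelated to this commit's changes: C.CString copies the prompt onto the C heap, and that copy is never freed here. A possible cleanup, assuming the cgo preamble includes <stdlib.h> and the file imports "unsafe", would be:

	// Sketch only, not part of this commit: keep the C copy of the prompt in
	// a variable so it can be freed once tokenization has consumed it.
	cPrompt := C.CString(apiParams.Content)
	defer C.free(unsafe.Pointer(cPrompt))

	llast_n_tokens_used_size := C.llama_tokenize(lcontext, cPrompt, &llast_n_tokens[0], ParamContextSize, true)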
@@ -137,9 +137,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 
-	//
-
-	log.Println("doing llama_sample_top_p_top_k...")
+	// Perform the LLaMA sampling step
 
 	penalizeStart := 0
 	penalizeLen := i
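penalizeStart and penalizeLen select the slice of llast_n_tokens that the sampling call in the next hunk penalizes for repetition; initialised like this they cover every token generated so far. The lines between this hunk and the next (not shown on this page) presumably clamp that window to a recent-token limit, roughly as sketched below; ParamRepeatLastN is an assumed name, not visible in this diff:

	// Sketch: restrict the repeat penalty to the most recent
	// ParamRepeatLastN tokens rather than the entire history.
	penalizeStart := 0
	penalizeLen := i
	if i > ParamRepeatLastN {
		penalizeStart = i - ParamRepeatLastN
		penalizeLen = ParamRepeatLastN
	}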
@@ -149,12 +147,8 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 	}
 
 	newTokenId := C.llama_sample_top_p_top_k(lcontext,
-		// Penalize recent tokens
-		&llast_n_tokens[penalizeStart], C.int(penalizeLen),
-
-		// Other static parameters
-		ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty)
-
+		&llast_n_tokens[penalizeStart], C.int(penalizeLen), // Penalize recent tokens
+		ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty) // Other static parameters
 
 	if newTokenId == C.llama_token_eos() {
 		// The model doesn't have anything to say
@@ -170,11 +164,10 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) {
 		// The model did have something to say
 		tokenStr := C.GoString(C.llama_token_to_str(lcontext, newTokenId))
 
-		// Push this new token into the lembedding_ state, or else we'll just get it over and over again
+		// Push this new token into the llast_n_tokens state, or else we'll just get it over and over again
 		llast_n_tokens[i] = newTokenId
 
-		// time.Sleep(1 * time.Second)
-		w.Write([]byte(tokenStr)) // fmt.Sprintf(" update %d", i)))
+		w.Write([]byte(tokenStr))
 		flusher.Flush()
 	}
 }
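The w.Write/flusher.Flush pair above is what streams each token to the client as soon as it is sampled. The flusher variable is set up earlier in the handler, outside the hunks shown here; the usual net/http idiom, which this code presumably follows, is:

	// Standard streaming setup: not every ResponseWriter can flush, so check
	// for the http.Flusher interface before relying on it.
	flusher, ok := w.(http.Flusher)
	if !ok {
		http.Error(w, "streaming unsupported", http.StatusInternalServerError)
		return
	}

	// Later, after each generated token:
	w.Write([]byte(tokenStr))
	flusher.Flush() // push the partial response body to the client now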