diff --git a/api.go b/api.go index 0d6204c..f55a602 100644 --- a/api.go +++ b/api.go @@ -5,6 +5,7 @@ import ( "log" "net/http" "runtime" + "time" ) /* @@ -54,6 +55,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) { // TODO // Wait for a free worker + // TODO signal the queue length to the user? select { case this.sem <- struct{}{}: // OK @@ -84,11 +86,9 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) { llast_n_tokens := make([]C.llama_token, ParamContextSize) - log.Println("tokenizing supplied prompt...") - llast_n_tokens_used_size := C.llama_tokenize(lcontext, C.CString(apiParams.Content), &llast_n_tokens[0], ParamContextSize, true) if llast_n_tokens_used_size <= 0 { - log.Printf("llama_tokenize returned non-positive size (%d)", llast_n_tokens_used_size) + log.Printf("llama_tokenize: got non-positive size (%d)", llast_n_tokens_used_size) http.Error(w, "Internal error", 500) return } @@ -137,9 +137,7 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) { return } - // - - log.Println("doing llama_sample_top_p_top_k...") + // Perform the LLaMA sampling step penalizeStart := 0 penalizeLen := i @@ -149,12 +147,8 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) { } newTokenId := C.llama_sample_top_p_top_k(lcontext, - - // Penalize recent tokens - &llast_n_tokens[penalizeStart], C.int(penalizeLen), - - // Other static parameters - ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty) + &llast_n_tokens[penalizeStart], C.int(penalizeLen), // Penalize recent tokens + ParamTopK, ParamTopP, ParamTemperature, ParamRepeatPenalty) // Other static parameters if newTokenId == C.llama_token_eos() { // The model doesn't have anything to say @@ -170,11 +164,10 @@ func (this *Application) POST_Chat(w http.ResponseWriter, r *http.Request) { // The model did have something to say tokenStr := C.GoString(C.llama_token_to_str(lcontext, newTokenId)) - // Push this new token into the lembedding_ state, or else we'll just get it over and over again + // Push this new token into the llast_n_tokens state, or else we'll just get it over and over again llast_n_tokens[i] = newTokenId - // time.Sleep(1 * time.Second) - w.Write([]byte(tokenStr)) // fmt.Sprintf(" update %d", i))) + w.Write([]byte(tokenStr)) flusher.Flush() } }