From 3a6fcaf8007c3b5ba80277b4d262f5cad8f42043 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Sat, 23 Sep 2023 11:34:52 +0300
Subject: [PATCH] Add defaults/warning for max-tokens and context-length,
 document env vars (#69)

---
 README.md                                | 50 ++++++++++++++++++++++++++++++
 charts/ialacol/Chart.yaml                |  4 +--
 charts/ialacol/templates/deployment.yaml |  4 +--
 const.py                                 |  2 ++
 get_config.py                            |  6 +++-
 main.py                                  |  3 +-
 6 files changed, 63 insertions(+), 6 deletions(-)
 create mode 100644 const.py

diff --git a/README.md b/README.md
index 39f5c1e..76ab3e8 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,56 @@ openai -k "sk-fake" \
      -g user "Hello world!"
 ```
 
+### Configuration
+
+All configuration is done via environment variables.
+
+| Parameter                  | Description                                                        | Default | Example                                                                       |
+| :------------------------- | :----------------------------------------------------------------- | :------ | :---------------------------------------------------------------------------- |
+| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id of the model to download                  | `None`  | `TheBloke/orca_mini_3B-GGML`                                                  |
+| `DEFAULT_MODEL_FILE`       | The file to download from the repo, optional for GPTQ models       | `None`  | `orca-mini-3b.ggmlv3.q4_0.bin`                                                |
+| `MODE_TYPE`                | Model type, overriding automatic model type detection              | `None`  | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj` |
+| `LOGGING_LEVEL`            | Logging level                                                      | `INFO`  | `DEBUG`                                                                       |
+| `TOP_K`                    | The top-k value for sampling                                       | `40`    | Integers                                                                      |
+| `TOP_P`                    | The top-p value for sampling                                       | `1.0`   | Floats                                                                        |
+| `REPETITION_PENALTY`       | The repetition penalty for sampling                                | `1.1`   | Floats                                                                        |
+| `LAST_N_TOKENS`            | The number of last tokens considered for the repetition penalty    | `64`    | Integers                                                                      |
+| `SEED`                     | The seed for sampling                                              | `-1`    | Integers                                                                      |
+| `BATCH_SIZE`               | The batch size for evaluating tokens, GGUF/GGML models only        | `8`     | Integers                                                                      |
+| `THREADS`                  | Thread count override (auto: CPU count / 2), set `1` for GPTQ      | `Auto`  | Integers                                                                      |
+| `MAX_TOKENS`               | The maximum number of tokens to generate                           | `512`   | Integers                                                                      |
+| `STOP`                     | The token that stops the generation                                | `None`  | `<\|endoftext\|>`                                                             |
+| `CONTEXT_LENGTH`           | Override the auto-detected context length                          | `512`   | Integers                                                                      |
+| `GPU_LAYERS`               | The number of layers to offload to the GPU                         | `0`     | Integers                                                                      |
+| `TRUNCATE_PROMPT_LENGTH`   | Truncate the prompt to this length, if set                         | `0`     | Integers                                                                      |
+
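+When deploying with the bundled Helm chart, the same variables are set under `deployment.env` in your values file and passed through to the container. A minimal sketch (the model repo, file and values below are illustrative examples, not defaults):
+
+```yaml
+deployment:
+  env:
+    # model to download from Hugging Face on startup
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/orca_mini_3B-GGML
+    DEFAULT_MODEL_FILE: orca-mini-3b.ggmlv3.q4_0.bin
+    LOGGING_LEVEL: DEBUG
+    # override the auto-detected context length (defaults to 512)
+    CONTEXT_LENGTH: "1024"
+    # keep all layers on the CPU
+    GPU_LAYERS: "0"
+```
+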
+Sampling parameters, including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS` and `STOP`, can be overridden per request via the request body, for example:
+
+```sh
+curl -X POST \
+     -H 'Content-Type: application/json' \
+     -d '{ "messages": [{"role": "user", "content": "Tell me a story."}], "model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "stream": false, "temperature": "2", "top_p": "1.0", "top_k": "0" }' \
+     http://localhost:8000/v1/chat/completions
+```
+
+will use `temperature=2`, `top_p=1.0` and `top_k=0` for this request.
+
+
 ### Run in Container
 
 #### Image from Github Registry
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index de634c1..1de887f 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.2
+appVersion: 0.11.3
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.2
+version: 0.11.3
diff --git a/charts/ialacol/templates/deployment.yaml b/charts/ialacol/templates/deployment.yaml
index acd1b4f..d3c4b1a 100644
--- a/charts/ialacol/templates/deployment.yaml
+++ b/charts/ialacol/templates/deployment.yaml
@@ -29,6 +29,8 @@ spec:
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
           - name: DEFAULT_MODEL_FILE
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
+          - name: MODE_TYPE
+            value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
           - name: LOGGING_LEVEL
             value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
           - name: TOP_K
@@ -55,8 +57,6 @@ spec:
             value: {{ (.Values.deployment.env).CONTEXT_LENGTH | quote }}
           - name: GPU_LAYERS
             value: {{ (.Values.deployment.env).GPU_LAYERS | quote }}
-          - name: MODE_TYPE
-            value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
           - name: TRUNCATE_PROMPT_LENGTH
             value: {{ (.Values.deployment.env).TRUNCATE_PROMPT_LENGTH | quote }}
           volumeMounts:
diff --git a/const.py b/const.py
new file mode 100644
index 0000000..de15110
--- /dev/null
+++ b/const.py
@@ -0,0 +1,2 @@
+DEFAULT_MAX_TOKENS = "512"
+DEFAULT_CONTEXT_LENGTH = "512"
diff --git a/get_config.py b/get_config.py
index 5e4c3f0..9c08c48 100644
--- a/get_config.py
+++ b/get_config.py
@@ -4,6 +4,7 @@ from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
 from log import log
+from const import DEFAULT_MAX_TOKENS, DEFAULT_CONTEXT_LENGTH
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
@@ -25,7 +26,10 @@ def get_config(
     # ggml only, follow ctransformers defaults
     BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
-    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
+    MAX_TOKENS = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
+    if MAX_TOKENS > CONTEXT_LENGTH:
+        log.warning("MAX_TOKENS is greater than CONTEXT_LENGTH, consider setting MAX_TOKENS lower than CONTEXT_LENGTH")
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
     STOP = get_env_or_none("STOP")
 
diff --git a/main.py b/main.py
index 75286c3..6d74c12 100644
--- a/main.py
+++ b/main.py
@@ -24,6 +24,7 @@ from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from log import log
 from truncate import truncate
+from const import DEFAULT_CONTEXT_LENGTH
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -107,7 +108,7 @@ async def startup_event():
             set_downloading_model(False)
 
     # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
     # the layers to offloading to the GPU
     GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
 
-- 
GitLab