Unverified commit 3a6fcaf8 authored by Henry Chen, committed by GitHub

Add defaults/warning for max-tokens and context-length, document env vars (#69)

parent ade70c45
@@ -80,6 +80,41 @@ openai -k "sk-fake" \
-g user "Hello world!"
```
### Configuration
All configuration is done via environment variables.
| Parameter | Description | Default | Example |
| :-------- | :---------- | :------ | :------ |
| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id to download the model from | `None` | `TheBloke/orca_mini_3B-GGML` |
| `DEFAULT_MODEL_FILE` | The file name to download from the repo; optional for GPTQ models | `None` | `orca-mini-3b.ggmlv3.q4_0.bin` |
| `MODE_TYPE` | Model type to override the automatic model type detection | `None` | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj` |
| `LOGGING_LEVEL` | Logging level | `INFO` | `DEBUG` |
| `TOP_K` | Top-k for sampling | `40` | Integers |
| `TOP_P` | Top-p for sampling | `1.0` | Floats |
| `REPETITION_PENALTY` | Repetition penalty for sampling | `1.1` | Floats |
| `LAST_N_TOKENS` | The number of last tokens considered for the repetition penalty | `64` | Integers |
| `SEED` | The seed for sampling | `-1` | Integers |
| `BATCH_SIZE` | The batch size for evaluating tokens; GGUF/GGML models only | `8` | Integers |
| `THREADS` | Number of threads, overriding the auto-detected value (CPU count / 2); set to `1` for GPTQ models | `Auto` | Integers |
| `MAX_TOKENS` | The maximum number of tokens to generate | `512` | Integers |
| `STOP` | The token that stops generation | `None` | `<|endoftext|>` |
| `CONTEXT_LENGTH` | Override the auto-detected context length | `512` | Integers |
| `GPU_LAYERS` | The number of layers to offload to the GPU | `0` | Integers |
| `TRUNCATE_PROMPT_LENGTH` | Truncate the prompt to this length if set (`0` disables truncation) | `0` | Integers |
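For example, a local run might export a few of these before starting the server (a minimal sketch; the `uvicorn` entrypoint is an assumption, not something documented on this page):
```sh
# Minimal sketch: pick a model and keep MAX_TOKENS below CONTEXT_LENGTH.
export DEFAULT_MODEL_HG_REPO_ID="TheBloke/orca_mini_3B-GGML"
export DEFAULT_MODEL_FILE="orca-mini-3b.ggmlv3.q4_0.bin"
export MAX_TOKENS="256"
export CONTEXT_LENGTH="1024"
# Entrypoint assumed; adjust to however you launch the server.
uvicorn main:app --host 0.0.0.0 --port 8000
```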
Sampling parameters, including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS`, and `STOP`, can be overridden per request via the request body. For example:
```sh
curl -X POST \
-H 'Content-Type: application/json' \
-d '{ "messages": [{"role": "user", "content": "Tell me a story."}], "model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "stream": false, "temperature": "2", "top_p": "1.0", "top_k": "0" }' \
http://localhost:8000/v1/chat/completions
```
will use `temperature=2`, `top_p=1`, and `top_k=0` for this request.
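The same per-request override works from the openai Python client (a hedged sketch assuming the pre-1.0 `openai` package style; passing `top_k` through as an extra parameter is an assumption about the backend):
```python
# Sketch: per-request sampling overrides against a local ialacol server.
import openai

openai.api_key = "sk-fake"                    # any value; the key is not checked locally
openai.api_base = "http://localhost:8000/v1"  # point the client at ialacol

response = openai.ChatCompletion.create(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",
    messages=[{"role": "user", "content": "Tell me a story."}],
    stream=False,
    temperature=2,
    top_p=1.0,
    top_k=0,  # non-standard OpenAI parameter, serialized into the request body
)
print(response["choices"][0]["message"]["content"])
```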
### Run in Container
#### Image from Github Registry
......
apiVersion: v2
-appVersion: 0.11.2
+appVersion: 0.11.3
description: A Helm chart for ialacol
name: ialacol
type: application
-version: 0.11.2
+version: 0.11.3
@@ -29,6 +29,8 @@ spec:
value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
- name: DEFAULT_MODEL_FILE
value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
+- name: MODE_TYPE
+  value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
- name: LOGGING_LEVEL
value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
- name: TOP_K
@@ -55,8 +57,6 @@ spec:
value: {{ (.Values.deployment.env).CONTEXT_LENGTH | quote }}
- name: GPU_LAYERS
value: {{ (.Values.deployment.env).GPU_LAYERS | quote }}
-- name: MODE_TYPE
-  value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
- name: TRUNCATE_PROMPT_LENGTH
value: {{ (.Values.deployment.env).TRUNCATE_PROMPT_LENGTH | quote }}
volumeMounts:
......
+DEFAULT_MAX_TOKENS = "512"
+DEFAULT_CONTEXT_LENGTH = "512"
\ No newline at end of file
@@ -4,6 +4,7 @@ from request_body import ChatCompletionRequestBody, CompletionRequestBody
from get_env import get_env, get_env_or_none
from get_default_thread import get_default_thread
from log import log
+from const import DEFAULT_MAX_TOKENS, DEFAULT_CONTEXT_LENGTH
THREADS = int(get_env("THREADS", str(get_default_thread())))
@@ -25,7 +26,10 @@ def get_config(
# ggml only, follow ctransformers defaults
BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
# OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
MAX_TOKENS = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS))
CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
if (MAX_TOKENS > CONTEXT_LENGTH):
log.warning("MAX_TOKENS is greater than CONTEXT_LENGTH, setting MAX_TOKENS < CONTEXT_LENGTH")
# OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
STOP = get_env_or_none("STOP")
......
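For context, this hunk leans on the project's `get_env`/`get_env_or_none` helpers from `get_env.py`; a hypothetical sketch of their assumed contract (signatures are guesses, not copied from the repo):
```python
# Hypothetical stand-ins for the helpers in get_env.py; the project's real
# implementations may differ, this only illustrates the assumed contract.
import os
from typing import Optional

def get_env(key: str, default: str) -> str:
    """Return the environment variable's value, falling back to default."""
    return os.environ.get(key, default)

def get_env_or_none(key: str) -> Optional[str]:
    """Return the environment variable's value, or None if it is unset."""
    return os.environ.get(key)
```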
@@ -24,6 +24,7 @@ from model_generate import chat_model_generate, model_generate
from get_env import get_env
from log import log
from truncate import truncate
+from const import DEFAULT_CONTEXT_LENGTH
DEFAULT_MODEL_HG_REPO_ID = get_env(
"DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -107,7 +108,7 @@ async def startup_event():
set_downloading_model(False)
# ggml only, follow ctransformers defaults
CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
# the layers to offloading to the GPU
GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
......