diff --git a/README.md b/README.md
index 39f5c1e0d33c0dd454f6f593934a09c25a5cd35a..76ab3e8e9903a008b87bc414c8545928ebae0ba7 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,41 @@ openai -k "sk-fake" \
   -g user "Hello world!"
 ```
 
+### Configuration
+
+All configuration is done via environment variables.
+
+| Parameter                  | Description                                                                                     | Default | Example                                                                       |
+| :------------------------- | :---------------------------------------------------------------------------------------------- | :------ | :----------------------------------------------------------------------------- |
+| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id to download the model from                                              | `None`  | `TheBloke/orca_mini_3B-GGML`                                                     |
+| `DEFAULT_MODEL_FILE`       | The file name to download from the repo; optional for GPTQ models                                | `None`  | `orca-mini-3b.ggmlv3.q4_0.bin`                                                   |
+| `MODE_TYPE`                | Model type to override the automatic model type detection                                        | `None`  | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj`    |
+| `LOGGING_LEVEL`            | Logging level                                                                                     | `INFO`  | `DEBUG`                                                                          |
+| `TOP_K`                    | top-k for sampling                                                                                | `40`    | Integers                                                                         |
+| `TOP_P`                    | top-p for sampling                                                                                | `1.0`   | Floats                                                                           |
+| `REPETITION_PENALTY`       | Repetition penalty for sampling                                                                   | `1.1`   | Floats                                                                           |
+| `LAST_N_TOKENS`            | The number of last tokens to consider for the repetition penalty                                  | `64`    | Integers                                                                         |
+| `SEED`                     | The seed for sampling                                                                             | `-1`    | Integers                                                                         |
+| `BATCH_SIZE`               | The batch size for evaluating tokens; GGUF/GGML models only                                       | `8`     | Integers                                                                         |
+| `THREADS`                  | Number of threads, overriding the auto-detected value (CPU count / 2); set `1` for GPTQ models    | `Auto`  | Integers                                                                         |
+| `MAX_TOKENS`               | The maximum number of tokens to generate                                                          | `512`   | Integers                                                                         |
+| `STOP`                     | The token that stops the generation                                                               | `None`  | `<|endoftext|>`                                                                  |
+| `CONTEXT_LENGTH`           | Override the auto-detected context length                                                         | `512`   | Integers                                                                         |
+| `GPU_LAYERS`               | The number of layers to offload to the GPU                                                        | `0`     | Integers                                                                         |
+| `TRUNCATE_PROMPT_LENGTH`   | Truncate the prompt to this length if set                                                         | `0`     | Integers                                                                         |
+
+Sampling parameters, including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS` and `STOP`, can be overridden per request via the request body, for example:
+
+```sh
+curl -X POST \
+  -H 'Content-Type: application/json' \
+  -d '{ "messages": [{"role": "user", "content": "Tell me a story."}], "model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "stream": false, "temperature": "2", "top_p": "1.0", "top_k": "0" }' \
+  http://localhost:8000/v1/chat/completions
+```
+
+This request will use `temperature=2`, `top_p=1` and `top_k=0`.
+
+
 ### Run in Container
 
 #### Image from Github Registry
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index de634c1a33fa9346a68c671f393750d42bc88ff5..1de887fe595db0c388c84bd320b2c2d3909fb6d9 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.2
+appVersion: 0.11.3
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.2
+version: 0.11.3
diff --git a/charts/ialacol/templates/deployment.yaml b/charts/ialacol/templates/deployment.yaml
index acd1b4f5c08767c8837107ded11e890f665aa6c4..d3c4b1a2feabbb1a06a531922787ce48a1d62693 100644
--- a/charts/ialacol/templates/deployment.yaml
+++ b/charts/ialacol/templates/deployment.yaml
@@ -29,6 +29,8 @@ spec:
              value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
            - name: DEFAULT_MODEL_FILE
              value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
+           - name: MODE_TYPE
+             value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
            - name: LOGGING_LEVEL
              value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
            - name: TOP_K
@@ -55,8 +57,6 @@ spec:
              value: {{ (.Values.deployment.env).CONTEXT_LENGTH | quote }}
            - name: GPU_LAYERS
              value: {{ (.Values.deployment.env).GPU_LAYERS | quote }}
-           - name: MODE_TYPE
-             value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
            - name: TRUNCATE_PROMPT_LENGTH
              value: {{ (.Values.deployment.env).TRUNCATE_PROMPT_LENGTH | quote }}
          volumeMounts:
diff --git a/const.py b/const.py
new file mode 100644
index 0000000000000000000000000000000000000000..de151100508b084e5cba8d65fe2e8fe11025c7ef
--- /dev/null
+++ b/const.py
@@ -0,0 +1,2 @@
+DEFAULT_MAX_TOKENS = "512"
+DEFAULT_CONTEXT_LENGTH = "512"
\ No newline at end of file
diff --git a/get_config.py b/get_config.py
index 5e4c3f0d149f14c4dcd5f577c69a40d343b63e2f..9c08c482702e7201b06ce4e06efdfe09dc7e45ad 100644
--- a/get_config.py
+++ b/get_config.py
@@ -4,6 +4,7 @@ from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
 from log import log
+from const import DEFAULT_MAX_TOKENS, DEFAULT_CONTEXT_LENGTH
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
@@ -25,7 +26,10 @@ def get_config(
     # ggml only, follow ctransformers defaults
     BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
-    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
+    MAX_TOKENS = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
+    if MAX_TOKENS > CONTEXT_LENGTH:
+        log.warning("MAX_TOKENS is greater than CONTEXT_LENGTH; set MAX_TOKENS lower than CONTEXT_LENGTH")
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
     STOP = get_env_or_none("STOP")
 
diff --git a/main.py b/main.py
index 75286c36bb4e05f5e5070c319c6d4fe998440167..6d74c12d401a0006873d9c95c3d857f9d6737f4a 100644
--- a/main.py
+++ b/main.py
@@ -24,6 +24,7 @@ from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from log import log
 from truncate import truncate
+from const import DEFAULT_CONTEXT_LENGTH
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
 )
@@ -107,7 +108,7 @@ async def startup_event():
     set_downloading_model(False)
 
     # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
     # the layers to offloading to the GPU
     GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
 
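For reference, a minimal sketch (not part of the diff above) of how the new `const.py` defaults are intended to interact with the `MAX_TOKENS`/`CONTEXT_LENGTH` check added in `get_config.py`. The standalone `get_env` helper and the `resolve_token_limits` wrapper below are simplified stand-ins introduced only for illustration; `DEFAULT_MAX_TOKENS`, `DEFAULT_CONTEXT_LENGTH`, and the warning itself come from the change.

```python
# Illustrative sketch of the token-limit resolution added in this change.
# get_env here is a simplified stand-in for the repo's helper, and
# resolve_token_limits is a hypothetical wrapper name, not the repo's API.
import logging
import os

log = logging.getLogger("ialacol")

# Mirrors const.py introduced by this change.
DEFAULT_MAX_TOKENS = "512"
DEFAULT_CONTEXT_LENGTH = "512"


def get_env(key: str, default: str) -> str:
    """Return the environment variable if set and non-empty, else the default."""
    value = os.environ.get(key)
    return value if value else default


def resolve_token_limits() -> tuple[int, int]:
    """Resolve MAX_TOKENS and CONTEXT_LENGTH, warning on an inconsistent pair."""
    max_tokens = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS))
    context_length = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
    if max_tokens > context_length:
        log.warning(
            "MAX_TOKENS (%d) is greater than CONTEXT_LENGTH (%d); "
            "set MAX_TOKENS lower than CONTEXT_LENGTH",
            max_tokens,
            context_length,
        )
    return max_tokens, context_length
```

With both defaults at `512`, the warning only fires when `MAX_TOKENS` is explicitly raised above the configured context window; the values themselves are left unchanged, matching the behavior of the diff.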