diff --git a/README.md b/README.md
index 39f5c1e0d33c0dd454f6f593934a09c25a5cd35a..76ab3e8e9903a008b87bc414c8545928ebae0ba7 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,59 @@ openai -k "sk-fake" \
      -g user "Hello world!"
 ```
 
+### Configuration
+
+All configuration is done via environment variables.
+
+| Parameter                    | Description                                                          | Default | Example                                                                      |
+| :----------------------------| :------------------------------------------------------------------- | :------ | :--------------------------------------------------------------------------- |
+| `DEFAULT_MODEL_HG_REPO_ID`   | The Hugging Face repo id to download the model                       | `None`  | `TheBloke/orca_mini_3B-GGML`                                                 |
+| `DEFAULT_MODEL_FILE`         | The file name to download from the repo, optional for GPTQ models    | `None`  | `orca-mini-3b.ggmlv3.q4_0.bin`                                               |
+| `MODE_TYPE`                  | Model type, overriding automatic model type detection                | `None`  | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj` |
+| `LOGGING_LEVEL`              | Logging level                                                        | `INFO`  | `DEBUG`                                                                      |
+| `TOP_K`                      | Top-k for sampling                                                    | `40`    | Integers                                                                     |
+| `TOP_P`                      | Top-p for sampling                                                    | `1.0`   | Floats                                                                       |
+| `REPETITION_PENALTY`         | Repetition penalty for sampling                                       | `1.1`   | Floats                                                                       |
+| `LAST_N_TOKENS`              | The number of last tokens to consider for the repetition penalty     | `64`    | Integers                                                                     |
+| `SEED`                       | The seed for sampling                                                 | `-1`    | Integers                                                                     |
+| `BATCH_SIZE`                 | The batch size for evaluating tokens, only for GGUF/GGML models      | `8`     | Integers                                                                     |
+| `THREADS`                    | Thread count, overriding the auto-detected default (CPU count / 2); set `1` for GPTQ models | `Auto`  | Integers                                     |
+| `MAX_TOKENS`                 | The maximum number of tokens to generate                              | `512`   | Integers                                                                     |
+| `STOP`                       | The token that stops text generation                                  | `None`  | `<\|endoftext\|>`                                                            |
+| `CONTEXT_LENGTH`             | Override the auto-detected context length                             | `512`   | Integers                                                                     |
+| `GPU_LAYERS`                 | The number of layers to offload to the GPU                            | `0`     | Integers                                                                     |
+| `TRUNCATE_PROMPT_LENGTH`     | Truncate the prompt to this length if set                             | `0`     | Integers                                                                     |
+
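+For example, here is a minimal sketch of setting a few of these variables in a shell before starting the server (the model values below are taken from the examples in this table; adjust them for your own model):
+
+```sh
+export DEFAULT_MODEL_HG_REPO_ID="TheBloke/orca_mini_3B-GGML"
+export DEFAULT_MODEL_FILE="orca-mini-3b.ggmlv3.q4_0.bin"
+export LOGGING_LEVEL="DEBUG"
+export TOP_K="40"
+export MAX_TOKENS="512"
+```
+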
+Sampling parameters, including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS`, and `STOP`, can be overridden per request via the request body. For example:
+
+```sh
+curl -X POST \
+     -H 'Content-Type: application/json' \
+     -d '{ "messages": [{"role": "user", "content": "Tell me a story."}], "model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "stream": false, "temperature": 2, "top_p": 1.0, "top_k": 0 }' \
+     http://localhost:8000/v1/chat/completions
+```
+
+This request will use `temperature=2`, `top_p=1.0`, and `top_k=0`.
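+
+The same per-request overrides apply to the `/v1/completions` endpoint, for example (a minimal sketch, assuming the endpoint accepts the same sampling fields; the `stop` value here is only illustrative):
+
+```sh
+curl -X POST \
+     -H 'Content-Type: application/json' \
+     -d '{ "prompt": "Tell me a story.", "model": "llama-2-7b-chat.ggmlv3.q4_0.bin", "stream": false, "max_tokens": 256, "stop": "</s>" }' \
+     http://localhost:8000/v1/completions
+```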
+
 ### Run in Container
 
 #### Image from Github Registry
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index de634c1a33fa9346a68c671f393750d42bc88ff5..1de887fe595db0c388c84bd320b2c2d3909fb6d9 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.2
+appVersion: 0.11.3
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.2
+version: 0.11.3
diff --git a/charts/ialacol/templates/deployment.yaml b/charts/ialacol/templates/deployment.yaml
index acd1b4f5c08767c8837107ded11e890f665aa6c4..d3c4b1a2feabbb1a06a531922787ce48a1d62693 100644
--- a/charts/ialacol/templates/deployment.yaml
+++ b/charts/ialacol/templates/deployment.yaml
@@ -29,6 +29,8 @@ spec:
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
           - name: DEFAULT_MODEL_FILE
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
+          - name: MODE_TYPE
+            value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
           - name: LOGGING_LEVEL
             value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
           - name: TOP_K
@@ -55,8 +57,6 @@ spec:
             value: {{ (.Values.deployment.env).CONTEXT_LENGTH | quote }}
           - name: GPU_LAYERS
             value: {{ (.Values.deployment.env).GPU_LAYERS | quote }}
-          - name: MODE_TYPE
-            value: {{ (.Values.deployment.env).MODE_TYPE | quote }}
           - name: TRUNCATE_PROMPT_LENGTH
             value: {{ (.Values.deployment.env).TRUNCATE_PROMPT_LENGTH | quote }}
           volumeMounts:
diff --git a/const.py b/const.py
new file mode 100644
index 0000000000000000000000000000000000000000..de151100508b084e5cba8d65fe2e8fe11025c7ef
--- /dev/null
+++ b/const.py
@@ -0,0 +1,2 @@
+DEFAULT_MAX_TOKENS = "512"
+DEFAULT_CONTEXT_LENGTH = "512"
\ No newline at end of file
diff --git a/get_config.py b/get_config.py
index 5e4c3f0d149f14c4dcd5f577c69a40d343b63e2f..9c08c482702e7201b06ce4e06efdfe09dc7e45ad 100644
--- a/get_config.py
+++ b/get_config.py
@@ -4,6 +4,7 @@ from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
 from log import log
+from const import DEFAULT_MAX_TOKENS, DEFAULT_CONTEXT_LENGTH
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
@@ -25,7 +26,10 @@ def get_config(
     # ggml only, follow ctransformers defaults
     BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
-    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
+    MAX_TOKENS = int(get_env("MAX_TOKENS", DEFAULT_MAX_TOKENS))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
+    if MAX_TOKENS > CONTEXT_LENGTH:
+        log.warning("MAX_TOKENS is greater than CONTEXT_LENGTH; consider setting MAX_TOKENS below CONTEXT_LENGTH")
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
     STOP = get_env_or_none("STOP")
 
diff --git a/main.py b/main.py
index 75286c36bb4e05f5e5070c319c6d4fe998440167..6d74c12d401a0006873d9c95c3d857f9d6737f4a 100644
--- a/main.py
+++ b/main.py
@@ -24,6 +24,7 @@ from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from log import log
 from truncate import truncate
+from const import DEFAULT_CONTEXT_LENGTH
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -107,7 +108,7 @@ async def startup_event():
             set_downloading_model(False)
 
     # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", DEFAULT_CONTEXT_LENGTH))
     # the layers to offloading to the GPU
     GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))