diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index cc579dc94aaa5d2adc9036c091e4e3308ceedcc5..83908a7fafe48626ca7f485402f909161c6c18ad 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.10.2
+appVersion: 0.10.3
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.2
+version: 0.10.3
diff --git a/examples/values/codellama-13b-cuda12.yaml b/examples/values/codellama-13b-cuda12.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85d86f555afb1b0485d548bd48abd73f71ec76ec
--- /dev/null
+++ b/examples/values/codellama-13b-cuda12.yaml
@@ -0,0 +1,29 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/CodeLlama-13B-GGML
+    DEFAULT_MODEL_FILE: codellama-13b.ggmlv3.Q4_0.bin
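+    # Layers to offload to the GPU; 40 is a suggested starting point for this 13B model, adjust to the available GPU memory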
+    GPU_LAYERS: 40
+    TOP_K: 40
+    TOP_P: 0.1
+    TEMPERATURE: 0.1
+    THREADS: 1
+    MAX_TOKENS: 1024
+    REPETITION_PENALTY: 1.8
+    LAST_N_TOKENS: 128
+resources:
+  {}
+model:
+  persistence:
+    size: 10Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_auto_config.py b/get_auto_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4390b04fa92bfaab876f641597b6510ea76a4327
--- /dev/null
+++ b/get_auto_config.py
@@ -0,0 +1,107 @@
+import logging
+from ctransformers import Config, AutoConfig
+
+from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env, get_env_or_none
+from get_default_thread import get_default_thread
+from get_model_type import get_model_type
+
+LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
+
+log = logging.getLogger("uvicorn")
+try:
+    log.setLevel(LOGGING_LEVEL)
+except ValueError:
+    log.setLevel("INFO")
+
+THREADS = int(get_env("THREADS", str(get_default_thread())))
+
+
+def get_auto_config(
+    body: CompletionRequestBody | ChatCompletionRequestBody,
+) -> AutoConfig:
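+    """Build a ctransformers AutoConfig from the request body, falling back to environment-variable defaults."""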
+    # ggml only, follow ctransformers defaults
+    TOP_K = int(get_env("TOP_K", "40"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
+    TOP_P = float(get_env("TOP_P", "1.0"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-temperature
+    TEMPERATURE = float(get_env("TEMPERATURE", "1"))
+    # ggml only, follow ctransformers defaults
+    REPETITION_PENALTY = float(get_env("REPETITION_PENALTY", "1.1"))
+    # ggml only, follow ctransformers defaults
+    LAST_N_TOKENS = int(get_env("LAST_N_TOKENS", "64"))
+    # ggml only, follow ctransformers defaults
+    SEED = int(get_env("SEED", "-1"))
+    # ggml only, follow ctransformers defaults
+    BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
+    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
+    STOP = get_env_or_none("STOP")
+    # ggml only, follow ctransformers defaults
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    # the number of layers to offload to the GPU
+    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
+
+    log.debug("TOP_K: %s", TOP_K)
+    log.debug("TOP_P: %s", TOP_P)
+    log.debug("TEMPERATURE: %s", TEMPERATURE)
+    log.debug("REPETITION_PENALTY: %s", REPETITION_PENALTY)
+    log.debug("LAST_N_TOKENS: %s", LAST_N_TOKENS)
+    log.debug("SEED: %s", SEED)
+    log.debug("BATCH_SIZE: %s", BATCH_SIZE)
+    log.debug("THREADS: %s", THREADS)
+    log.debug("MAX_TOKENS: %s", MAX_TOKENS)
+    log.debug("STOP: %s", STOP)
+    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
+
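+    # Per-request values take precedence; fall back to the env-var defaults above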
+    top_k = body.top_k if body.top_k else TOP_K
+    top_p = body.top_p if body.top_p else TOP_P
+    temperature = body.temperature if body.temperature else TEMPERATURE
+    repetition_penalty = body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS
+    seed = body.seed if body.seed else SEED
+    batch_size = body.batch_size if body.batch_size else BATCH_SIZE
+    threads = body.threads if body.threads else THREADS
+    max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS
+    stop = body.stop if body.stop else STOP
+
+    log.info("top_k: %s", top_k)
+    log.info("top_p: %s", top_p)
+    log.info("temperature: %s", temperature)
+    log.info("repetition_penalty: %s", repetition_penalty)
+    log.info("last_n_tokens: %s", last_n_tokens)
+    log.info("seed: %s", seed)
+    log.info("batch_size: %s", batch_size)
+    log.info("threads: %s", threads)
+    log.info("max_new_tokens: %s", max_new_tokens)
+    log.info("stop: %s", stop)
+
+    log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.info("GPU_LAYERS: %s", GPU_LAYERS)
+
+    config = Config(
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
+        context_length=CONTEXT_LENGTH,
+        gpu_layers=GPU_LAYERS,
+    )
+
+    model_type = get_model_type(body)
+
+    log.info("model_type: %s", model_type)
+
+    auto_config = AutoConfig(
+        config=config,
+        model_type=model_type,
+    )
+    return auto_config
diff --git a/get_config.py b/get_config.py
deleted file mode 100644
index 0c24e55f57600276f8c4ce02070be255ece36f45..0000000000000000000000000000000000000000
--- a/get_config.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import logging
-from ctransformers import Config
-
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env, get_env_or_none
-from get_default_thread import get_default_thread
-
-LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
-
-log = logging.getLogger("uvicorn")
-try:
-    log.setLevel(LOGGING_LEVEL)
-except ValueError:
-    log.setLevel("INFO")
-
-THREADS = int(get_env("THREADS", str(get_default_thread())))
-
-
-def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
-    # ggml only, follow ctransformers defaults
-    TOP_K = int(get_env("TOP_K", "40"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
-    TOP_P = float(get_env("TOP_P", "1.0"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-temperature
-    TEMPERATURE = float(get_env("TEMPERATURE", "1"))
-    # ggml only, follow ctransformers defaults
-    REPETITION_PENALTY = float(get_env("REPETITION_PENALTY", "1.1"))
-    # ggml only, follow ctransformers defaults
-    LAST_N_TOKENS = int(get_env("LAST_N_TOKENS", "64"))
-    # ggml only, follow ctransformers defaults
-    SEED = int(get_env("SEED", "-1"))
-    # ggml only, follow ctransformers defaults
-    BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
-    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
-    STOP = get_env_or_none("STOP")
-    # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
-    # the layers to offloading to the GPU
-    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
-
-    log.info("TOP_K: %s", TOP_K)
-    log.info("TOP_P: %s", TOP_P)
-    log.info("TEMPERATURE: %s", TEMPERATURE)
-    log.info("REPETITION_PENALTY: %s", REPETITION_PENALTY)
-    log.info("LAST_N_TOKENS: %s", LAST_N_TOKENS)
-    log.info("SEED: %s", SEED)
-    log.info("BATCH_SIZE: %s", BATCH_SIZE)
-    log.info("THREADS: %s", THREADS)
-    log.info("MAX_TOKENS: %s", MAX_TOKENS)
-    log.info("STOP: %s", STOP)
-    log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
-    log.info("GPU_LAYERS: %s", GPU_LAYERS)
-    
-    config = Config(
-        top_k=body.top_k if body.top_k else TOP_K,
-        top_p=body.top_p if body.top_p else TOP_P,
-        temperature=body.temperature if body.temperature else TEMPERATURE,
-        repetition_penalty=body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY,
-        last_n_tokens=body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS,
-        seed=body.seed if body.seed else SEED,
-        batch_size=body.batch_size if body.batch_size else BATCH_SIZE,
-        threads=body.threads if body.threads else THREADS,
-        max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
-        stop=body.stop if body.stop else STOP,
-        context_length=CONTEXT_LENGTH,
-        gpu_layers=GPU_LAYERS,
-    )
-    return config
diff --git a/get_llm.py b/get_llm.py
index c92614c34057a54e41edd19308fe58c88981c17e..657f98d6bec249fbea4d97f87b4165a975226e73 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -1,9 +1,9 @@
 import os
 
-from ctransformers import LLM
+from ctransformers import LLM, AutoModelForCausalLM
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env
-from get_config import get_config
+from get_auto_config import get_auto_config
+
 
 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -17,42 +17,12 @@ async def get_llm(
         _type_: _description_
     """
 
-    ctransformer_model_type = "llama"
-    # These are also in "starcoder" format
-    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
-    # https://huggingface.co/TheBloke/minotaur-15B-GGML
-    if (
-        "star" in body.model
-        or "starchat" in body.model
-        or "WizardCoder" in body.model
-        or "minotaur-15" in body.model
-    ):
-        ctransformer_model_type = "gpt_bigcode"
-    if "llama" in body.model:
-        ctransformer_model_type = "llama"
-    if "mpt" in body.model:
-        ctransformer_model_type = "mpt"
-    if "replit" in body.model:
-        ctransformer_model_type = "replit"
-    if "falcon" in body.model:
-        ctransformer_model_type = "falcon"
-    if "dolly" in body.model:
-        ctransformer_model_type = "dolly-v2"
-    if "stablelm" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
-    if "stablecode" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/EleutherAI/pythia-70m
-    if "pythia" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    config = get_config(body)
-    MODE_TYPE = get_env("MODE_TYPE", "")
-    if len(MODE_TYPE) > 0:
-        ctransformer_model_type = MODE_TYPE
-
-    return LLM(
-        model_path=f"{os.getcwd()}/models/{body.model}",
-        model_type=ctransformer_model_type,
-        config=config,
+    auto_config = get_auto_config(body)
+
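+    # Load the locally downloaded model from models/; local_files_only avoids hitting the Hugging Face Hub.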
+    llm = AutoModelForCausalLM.from_pretrained(
+        model_path_or_repo_id=f"{os.getcwd()}/models/{body.model}",
+        local_files_only=True,
+        config=auto_config,
     )
+
+    return llm
diff --git a/get_model_type.py b/get_model_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f1bc9c8b2dc33746e66693c35cadfbe2b13fcfe
--- /dev/null
+++ b/get_model_type.py
@@ -0,0 +1,41 @@
+from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env
+
+
+def get_model_type(
+    body: ChatCompletionRequestBody | CompletionRequestBody,
+) -> str:
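+    """Infer the ctransformers model type from the model name; the MODE_TYPE env var overrides the heuristic."""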
+    ctransformer_model_type = "llama"
+    # These are also in "starcoder" format
+    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
+    # https://huggingface.co/TheBloke/minotaur-15B-GGML
+    if (
+        "star" in body.model
+        or "starchat" in body.model
+        or "WizardCoder" in body.model
+        or "minotaur-15" in body.model
+    ):
+        ctransformer_model_type = "gpt_bigcode"
+    if "llama" in body.model:
+        ctransformer_model_type = "llama"
+    if "mpt" in body.model:
+        ctransformer_model_type = "mpt"
+    if "replit" in body.model:
+        ctransformer_model_type = "replit"
+    if "falcon" in body.model:
+        ctransformer_model_type = "falcon"
+    if "dolly" in body.model:
+        ctransformer_model_type = "dolly-v2"
+    if "stablelm" in body.model:
+        ctransformer_model_type = "gpt_neox"
+    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
+    if "stablecode" in body.model:
+        ctransformer_model_type = "gpt_neox"
+    # matching https://huggingface.co/EleutherAI/pythia-70m
+    if "pythia" in body.model:
+        ctransformer_model_type = "gpt_neox"
+
+    MODE_TYPE = get_env("MODE_TYPE", "")
+    if len(MODE_TYPE) > 0:
+        ctransformer_model_type = MODE_TYPE
+    return ctransformer_model_type
diff --git a/main.py b/main.py
index e3e927ea2f8a2ad08102073ea081b7d140d52f9a..79cc9cf6dc2caee1e3cea5cfaa3fc1a606981dba 100644
--- a/main.py
+++ b/main.py
@@ -22,7 +22,6 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from get_llm import get_llm
-from get_config import get_config
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -77,7 +76,7 @@ async def startup_event():
                 "Downloading model... %s/%s to %s/models",
                 DEFAULT_MODEL_HG_REPO_ID,
                 DEFAULT_MODEL_FILE,
-                os.getcwd()
+                os.getcwd(),
             )
             try:
                 hf_hub_download(
@@ -141,7 +140,6 @@ async def completions(
             "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte."
         )
     prompt = body.prompt
-    config = get_config(body)
 
     model_name = body.model
     if body.stream is True:
@@ -151,7 +149,6 @@ async def completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -160,7 +157,6 @@ async def completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
 
@@ -254,7 +250,6 @@ async def chat_completions(
     )
 
     prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
@@ -263,7 +258,6 @@ async def chat_completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -272,6 +266,5 @@ async def chat_completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
diff --git a/model_generate.py b/model_generate.py
index f6412c37e7d12d7926fc2652a6ece0f3e9e16cb1..abe76b3a851a5f07c586a4812ad7a1245e234c32 100644
--- a/model_generate.py
+++ b/model_generate.py
@@ -1,13 +1,12 @@
 from time import time
 from logging import Logger
-from ctransformers import LLM, Config
+from ctransformers import LLM
 
 
 def model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -15,40 +14,11 @@ def model_generate(
     """
     created = time()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
     )
     http_response = {
         "id": "id",
@@ -73,7 +43,6 @@ def chat_model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -81,37 +50,11 @@ def chat_model_generate(
     """
     created = time()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
     )
     http_response = {
         "id": "id",
diff --git a/requirements.txt b/requirements.txt
index 1c96282fffda11266fa46a13c75d68a5d5fb4199..4838f45c591cdfd2b9f72d3814e3da030d841ce4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.22
+ctransformers==0.2.24
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
diff --git a/streamers.py b/streamers.py
index 1b4e3499437f8cb5a9fd7257c8886b36352a766e..7eb9ba157e781c2916b812cff0a7b4b5a1855ca5 100644
--- a/streamers.py
+++ b/streamers.py
@@ -2,14 +2,13 @@ import json
 from logging import Logger
 from os import times
 
-from ctransformers import LLM, Config
+from ctransformers import LLM
 
 
 def completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -17,43 +16,11 @@ def completions_streamer(
     """
     created = times()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
     log.debug("prompt: %s", prompt)
 
     log.debug("Streaming from ctransformer instance!")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
-        max_new_tokens=max_new_tokens,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(
@@ -96,7 +63,6 @@ def chat_completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -104,38 +70,11 @@ def chat_completions_streamer(
     """
     created = times()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
     log.debug("prompt: %s", prompt)
 
     log.debug("Streaming from ctransformer instance")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
     ):