Commit c5b44504 authored by Henry Chen, committed by GitHub

Add example `codellama.yaml`, ctransformer to 0.2.24, refactor `get_config` (#59)

parent 14c2199e

Chart.yaml

 apiVersion: v2
-appVersion: 0.10.1
+appVersion: 0.10.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.1
+version: 0.10.2

codellama.yaml (new example values file)

replicas: 1
deployment:
  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
  env:
    DEFAULT_MODEL_HG_REPO_ID: TheBloke/CodeLlama-13B-GGML
    DEFAULT_MODEL_FILE: codellama-13b.ggmlv3.Q4_0.bin
    GPU_LAYERS: 40
    TOP_K: 40
    TOP_P: 0.1
    TEMPERATURE: 0.1
    THREADS: 1
    MAX_TOKENS: 1024
    REPETITION_PENALTY: 1.8
    LAST_N_TOKENS: 128
resources:
  {}
model:
  persistence:
    size: 10Gi
    accessModes:
      - ReadWriteOnce
    storageClassName: ~
service:
  type: ClusterIP
  port: 8000
  annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
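
For context, here is a minimal client sketch (not part of the commit) for exercising a release deployed with the values above. It assumes the service has been port-forwarded to localhost:8000 (matching service.port), that the server exposes the usual OpenAI-compatible /v1/chat/completions route, and that the response mirrors the OpenAI schema; the release name in the comment is a placeholder.

```python
# Hypothetical call against an ialacol release deployed with codellama.yaml.
# Forward the service first, e.g.:
#   kubectl port-forward svc/<release-name> 8000:8000
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        # must match DEFAULT_MODEL_FILE from the values file above
        "model": "codellama-13b.ggmlv3.Q4_0.bin",
        "messages": [{"role": "user", "content": "Write a Python hello world."}],
    },
    timeout=600,
)
print(response.json()["choices"][0]["message"]["content"])
```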

get_auto_config.py (renamed from get_config.py)

 import logging
-from ctransformers import Config
+from ctransformers import Config, AutoConfig
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
+from get_model_type import get_model_type

 LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
@@ -16,7 +17,9 @@ except ValueError:
 THREADS = int(get_env("THREADS", str(get_default_thread())))

-def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
+def get_auto_config(
+    body: CompletionRequestBody | ChatCompletionRequestBody,
+) -> AutoConfig:
     # ggml only, follow ctransformers defaults
     TOP_K = int(get_env("TOP_K", "40"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
@@ -40,31 +43,65 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
     # the layers to offloading to the GPU
     GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
-    log.info("TOP_K: %s", TOP_K)
-    log.info("TOP_P: %s", TOP_P)
-    log.info("TEMPERATURE: %s", TEMPERATURE)
-    log.info("REPETITION_PENALTY: %s", REPETITION_PENALTY)
-    log.info("LAST_N_TOKENS: %s", LAST_N_TOKENS)
-    log.info("SEED: %s", SEED)
-    log.info("BATCH_SIZE: %s", BATCH_SIZE)
-    log.info("THREADS: %s", THREADS)
-    log.info("MAX_TOKENS: %s", MAX_TOKENS)
-    log.info("STOP: %s", STOP)
+    log.debug("TOP_K: %s", TOP_K)
+    log.debug("TOP_P: %s", TOP_P)
+    log.debug("TEMPERATURE: %s", TEMPERATURE)
+    log.debug("REPETITION_PENALTY: %s", REPETITION_PENALTY)
+    log.debug("LAST_N_TOKENS: %s", LAST_N_TOKENS)
+    log.debug("SEED: %s", SEED)
+    log.debug("BATCH_SIZE: %s", BATCH_SIZE)
+    log.debug("THREADS: %s", THREADS)
+    log.debug("MAX_TOKENS: %s", MAX_TOKENS)
+    log.debug("STOP: %s", STOP)
+    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
+    top_k = body.top_k if body.top_k else TOP_K
+    top_p = body.top_p if body.top_p else TOP_P
+    temperature = body.temperature if body.temperature else TEMPERATURE
+    repetition_penalty = body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS
+    seed = body.seed if body.seed else SEED
+    batch_size = body.batch_size if body.batch_size else BATCH_SIZE
+    threads = body.threads if body.threads else THREADS
+    max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS
+    stop = body.stop if body.stop else STOP
+    log.info("top_k: %s", top_k)
+    log.info("top_p: %s", top_p)
+    log.info("temperature: %s", temperature)
+    log.info("repetition_penalty: %s", repetition_penalty)
+    log.info("last_n_tokens: %s", last_n_tokens)
+    log.info("seed: %s", seed)
+    log.info("batch_size: %s", batch_size)
+    log.info("threads: %s", threads)
+    log.info("max_new_tokens: %s", max_new_tokens)
+    log.info("stop: %s", stop)
     log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
     log.info("GPU_LAYERS: %s", GPU_LAYERS)
     config = Config(
-        top_k=body.top_k if body.top_k else TOP_K,
-        top_p=body.top_p if body.top_p else TOP_P,
-        temperature=body.temperature if body.temperature else TEMPERATURE,
-        repetition_penalty=body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY,
-        last_n_tokens=body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS,
-        seed=body.seed if body.seed else SEED,
-        batch_size=body.batch_size if body.batch_size else BATCH_SIZE,
-        threads=body.threads if body.threads else THREADS,
-        max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
-        stop=body.stop if body.stop else STOP,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    return config
+    model_type = get_model_type(body)
+    log.info("model_type: %s", model_type)
+    auto_config = AutoConfig(
+        config=config,
+        model_type=model_type,
+    )
+    return auto_config
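
The net effect of the refactor above is that every sampling parameter is resolved once — a request-body value takes precedence over the environment default — before being frozen into the Config/AutoConfig pair. A standalone illustration of that precedence rule (not code from the commit; the function and names are made up):

```python
import os

def resolve(body_value, env_name: str, default: str) -> float:
    """Request-body value wins; otherwise fall back to the env-var default."""
    env_default = float(os.environ.get(env_name, default))
    # Mirrors the `body.top_p if body.top_p else TOP_P` pattern above:
    # any falsy body value (None, 0) falls back to the default.
    return body_value if body_value else env_default

os.environ["TOP_P"] = "0.1"            # e.g. set via the chart's env block
print(resolve(None, "TOP_P", "1.0"))   # 0.1  -> env default applies
print(resolve(0.95, "TOP_P", "1.0"))   # 0.95 -> request body overrides
```

Note that the truthiness check means an explicit 0 (or 0.0) in the request body falls back to the default rather than being honoured as a value.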

get_llm.py

 import os
-from ctransformers import LLM
+from ctransformers import LLM, AutoModelForCausalLM
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env
-from get_config import get_config
+from get_auto_config import get_auto_config

 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -17,42 +17,12 @@ async def get_llm(
         _type_: _description_
     """
-    ctransformer_model_type = "llama"
-    # These are also in "starcoder" format
-    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
-    # https://huggingface.co/TheBloke/minotaur-15B-GGML
-    if (
-        "star" in body.model
-        or "starchat" in body.model
-        or "WizardCoder" in body.model
-        or "minotaur-15" in body.model
-    ):
-        ctransformer_model_type = "gpt_bigcode"
-    if "llama" in body.model:
-        ctransformer_model_type = "llama"
-    if "mpt" in body.model:
-        ctransformer_model_type = "mpt"
-    if "replit" in body.model:
-        ctransformer_model_type = "replit"
-    if "falcon" in body.model:
-        ctransformer_model_type = "falcon"
-    if "dolly" in body.model:
-        ctransformer_model_type = "dolly-v2"
-    if "stablelm" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
-    if "stablecode" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/EleutherAI/pythia-70m
-    if "pythia" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    config = get_config(body)
-    MODE_TYPE = get_env("MODE_TYPE", "")
-    if len(MODE_TYPE) > 0:
-        ctransformer_model_type = MODE_TYPE
-    return LLM(
-        model_path=f"{os.getcwd()}/models/{body.model}",
-        model_type=ctransformer_model_type,
-        config=config,
-    )
+    auto_config = get_auto_config(body)
+    llm = AutoModelForCausalLM.from_pretrained(
+        model_path_or_repo_id=f"{os.getcwd()}/models/{body.model}",
+        local_files_only=True,
+        config=auto_config,
+    )
+    return llm
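
With get_llm reduced to the call above, loading a model amounts to wrapping the resolved Config in an AutoConfig that carries the ctransformers model type and handing it to AutoModelForCausalLM.from_pretrained. A hedged, self-contained sketch of that path (the parameter values and model path are placeholders, not taken from the commit):

```python
from ctransformers import AutoConfig, AutoModelForCausalLM, Config

# Sampling/runtime settings are baked into the config at load time.
config = Config(temperature=0.1, top_p=0.1, max_new_tokens=1024, gpu_layers=0)
auto_config = AutoConfig(config=config, model_type="llama")  # type as get_model_type would pick

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="./models/codellama-13b.ggmlv3.Q4_0.bin",  # placeholder local path
    local_files_only=True,
    config=auto_config,
)
print(llm("def hello():"))
```

Because the settings now travel with the loaded model object, model_generate and the streamers no longer need a separate config argument — which is exactly what the remaining hunks in this commit remove.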

get_model_type.py (new file)

from request_body import ChatCompletionRequestBody, CompletionRequestBody
from get_env import get_env


def get_model_type(
    body: ChatCompletionRequestBody | CompletionRequestBody,
) -> str:
    ctransformer_model_type = "llama"
    # These are also in "starcoder" format
    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
    # https://huggingface.co/TheBloke/minotaur-15B-GGML
    if (
        "star" in body.model
        or "starchat" in body.model
        or "WizardCoder" in body.model
        or "minotaur-15" in body.model
    ):
        ctransformer_model_type = "gpt_bigcode"
    if "llama" in body.model:
        ctransformer_model_type = "llama"
    if "mpt" in body.model:
        ctransformer_model_type = "mpt"
    if "replit" in body.model:
        ctransformer_model_type = "replit"
    if "falcon" in body.model:
        ctransformer_model_type = "falcon"
    if "dolly" in body.model:
        ctransformer_model_type = "dolly-v2"
    if "stablelm" in body.model:
        ctransformer_model_type = "gpt_neox"
    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
    if "stablecode" in body.model:
        ctransformer_model_type = "gpt_neox"
    # matching https://huggingface.co/EleutherAI/pythia-70m
    if "pythia" in body.model:
        ctransformer_model_type = "gpt_neox"
    MODE_TYPE = get_env("MODE_TYPE", "")
    if len(MODE_TYPE) > 0:
        ctransformer_model_type = MODE_TYPE
    return ctransformer_model_type
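
For a quick sense of what the lookup above produces, a stripped-down stand-in (illustration only; the real get_model_type takes a request body and also honours the MODE_TYPE environment-variable override, and its file names here are only examples):

```python
def model_type_for(model_name: str) -> str:
    # Simplified early-return version of the substring checks above.
    if any(s in model_name for s in ("star", "starchat", "WizardCoder", "minotaur-15")):
        return "gpt_bigcode"
    if "llama" in model_name:
        return "llama"
    if "falcon" in model_name:
        return "falcon"
    return "llama"  # same default as get_model_type

print(model_type_for("codellama-13b.ggmlv3.Q4_0.bin"))       # llama
print(model_type_for("WizardCoder-15B-1.0.ggmlv3.q4_0.bin"))  # gpt_bigcode
```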

main.py

@@ -22,7 +22,6 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from get_llm import get_llm
-from get_config import get_config

 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -77,7 +76,7 @@ async def startup_event():
         "Downloading model... %s/%s to %s/models",
         DEFAULT_MODEL_HG_REPO_ID,
         DEFAULT_MODEL_FILE,
-        os.getcwd()
+        os.getcwd(),
     )
     try:
         hf_hub_download(
@@ -141,7 +140,6 @@ async def completions(
             "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte."
         )
     prompt = body.prompt
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
@@ -151,7 +149,6 @@ async def completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -160,7 +157,6 @@ async def completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
@@ -254,7 +250,6 @@ async def chat_completions(
     )
     prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
@@ -263,7 +258,6 @@ async def chat_completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -272,6 +266,5 @@ async def chat_completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
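
Since the streamer functions now rely entirely on the config baked in at load time, a streaming request differs on the client side only by setting "stream": true. A sketch of consuming the server-sent events (not from the commit; it assumes the OpenAI-style /v1/completions route on localhost:8000 and that each chunk mirrors the OpenAI completions schema, choices[0].text):

```python
import json
import requests

with requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "codellama-13b.ggmlv3.Q4_0.bin",
        "prompt": "def fibonacci(n):",
        "stream": True,
    },
    stream=True,
    timeout=600,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        if line.strip() == b"data: [DONE]":  # OpenAI convention, if the server sends it
            break
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["choices"][0]["text"], end="", flush=True)
```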

model_generate.py

 from time import time
 from logging import Logger
-from ctransformers import LLM, Config
+from ctransformers import LLM

 def model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -15,40 +14,11 @@ def model_generate(
     """
     created = time()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm( # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
     )
     http_response = {
         "id": "id",
@@ -73,7 +43,6 @@ def chat_model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -81,37 +50,11 @@ def chat_model_generate(
     """
     created = time()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm( # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
     )
     http_response = {
         "id": "id",

requirements.txt

@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.22
+ctransformers==0.2.24
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0

streamers.py

@@ -2,14 +2,13 @@ import json
 from logging import Logger
 from os import times
-from ctransformers import LLM, Config
+from ctransformers import LLM

 def completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -17,43 +16,11 @@ def completions_streamer(
     """
     created = times()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance!")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
-        max_new_tokens=max_new_tokens,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(
@@ -96,7 +63,6 @@ def chat_completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -104,38 +70,11 @@ def chat_completions_streamer(
     """
     created = times()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
     ):