Commit c5b44504 authored by Henry Chen, committed by GitHub

Add example `codellama.yaml`, ctransformer to 0.2.24, refactor `get_config` (#59)

parent 14c2199e

Chart.yaml

 apiVersion: v2
-appVersion: 0.10.1
+appVersion: 0.10.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.1
+version: 0.10.2

codellama.yaml (new example values file)

replicas: 1
deployment:
  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
  env:
    DEFAULT_MODEL_HG_REPO_ID: TheBloke/CodeLlama-13B-GGML
    DEFAULT_MODEL_FILE: codellama-13b.ggmlv3.Q4_0.bin
    GPU_LAYERS: 40
    TOP_K: 40
    TOP_P: 0.1
    TEMPERATURE: 0.1
    THREADS: 1
    MAX_TOKENS: 1024
    REPETITION_PENALTY: 1.8
    LAST_N_TOKENS: 128
resources:
  {}
model:
  persistence:
    size: 10Gi
    accessModes:
      - ReadWriteOnce
    storageClassName: ~
service:
  type: ClusterIP
  port: 8000
  annotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
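
For context, here is a minimal client sketch (not part of the commit) for exercising a release deployed with the values above. It assumes the service has been port-forwarded to localhost:8000 (matching service.port), that the server exposes the usual OpenAI-compatible /v1/chat/completions route, and that the response mirrors the OpenAI schema; the release name in the comment is a placeholder.

```python
# Hypothetical call against an ialacol release deployed with codellama.yaml.
# Forward the service first, e.g.:
#   kubectl port-forward svc/<release-name> 8000:8000
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        # must match DEFAULT_MODEL_FILE from the values file above
        "model": "codellama-13b.ggmlv3.Q4_0.bin",
        "messages": [{"role": "user", "content": "Write a Python hello world."}],
    },
    timeout=600,
)
print(response.json()["choices"][0]["message"]["content"])
```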

get_auto_config.py (renamed from get_config.py)

 import logging
-from ctransformers import Config
+from ctransformers import Config, AutoConfig
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
+from get_model_type import get_model_type

 LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
@@ -16,7 +17,9 @@ except ValueError:
 THREADS = int(get_env("THREADS", str(get_default_thread())))

-def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
+def get_auto_config(
+    body: CompletionRequestBody | ChatCompletionRequestBody,
+) -> AutoConfig:
     # ggml only, follow ctransformers defaults
     TOP_K = int(get_env("TOP_K", "40"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
@@ -40,31 +43,65 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
     # the layers to offloading to the GPU
     GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
-    log.info("TOP_K: %s", TOP_K)
-    log.info("TOP_P: %s", TOP_P)
-    log.info("TEMPERATURE: %s", TEMPERATURE)
-    log.info("REPETITION_PENALTY: %s", REPETITION_PENALTY)
-    log.info("LAST_N_TOKENS: %s", LAST_N_TOKENS)
-    log.info("SEED: %s", SEED)
-    log.info("BATCH_SIZE: %s", BATCH_SIZE)
-    log.info("THREADS: %s", THREADS)
-    log.info("MAX_TOKENS: %s", MAX_TOKENS)
-    log.info("STOP: %s", STOP)
+    log.debug("TOP_K: %s", TOP_K)
+    log.debug("TOP_P: %s", TOP_P)
+    log.debug("TEMPERATURE: %s", TEMPERATURE)
+    log.debug("REPETITION_PENALTY: %s", REPETITION_PENALTY)
+    log.debug("LAST_N_TOKENS: %s", LAST_N_TOKENS)
+    log.debug("SEED: %s", SEED)
+    log.debug("BATCH_SIZE: %s", BATCH_SIZE)
+    log.debug("THREADS: %s", THREADS)
+    log.debug("MAX_TOKENS: %s", MAX_TOKENS)
+    log.debug("STOP: %s", STOP)
+    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
+    top_k = body.top_k if body.top_k else TOP_K
+    top_p = body.top_p if body.top_p else TOP_P
+    temperature = body.temperature if body.temperature else TEMPERATURE
+    repetition_penalty = body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS
+    seed = body.seed if body.seed else SEED
+    batch_size = body.batch_size if body.batch_size else BATCH_SIZE
+    threads = body.threads if body.threads else THREADS
+    max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS
+    stop = body.stop if body.stop else STOP
+    log.info("top_k: %s", top_k)
+    log.info("top_p: %s", top_p)
+    log.info("temperature: %s", temperature)
+    log.info("repetition_penalty: %s", repetition_penalty)
+    log.info("last_n_tokens: %s", last_n_tokens)
+    log.info("seed: %s", seed)
+    log.info("batch_size: %s", batch_size)
+    log.info("threads: %s", threads)
+    log.info("max_new_tokens: %s", max_new_tokens)
+    log.info("stop: %s", stop)
     log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
     log.info("GPU_LAYERS: %s", GPU_LAYERS)
     config = Config(
-        top_k=body.top_k if body.top_k else TOP_K,
-        top_p=body.top_p if body.top_p else TOP_P,
-        temperature=body.temperature if body.temperature else TEMPERATURE,
-        repetition_penalty=body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY,
-        last_n_tokens=body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS,
-        seed=body.seed if body.seed else SEED,
-        batch_size=body.batch_size if body.batch_size else BATCH_SIZE,
-        threads=body.threads if body.threads else THREADS,
-        max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
-        stop=body.stop if body.stop else STOP,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    return config
+    model_type = get_model_type(body)
+    log.info("model_type: %s", model_type)
+    auto_config = AutoConfig(
+        config=config,
+        model_type=model_type,
+    )
+    return auto_config
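
The net effect of the refactor above is that every sampling parameter is resolved once — a request-body value takes precedence over the environment default — before being frozen into the Config/AutoConfig pair. A standalone illustration of that precedence rule (not code from the commit; the function and names are made up):

```python
import os

def resolve(body_value, env_name: str, default: str) -> float:
    """Request-body value wins; otherwise fall back to the env-var default."""
    env_default = float(os.environ.get(env_name, default))
    # Mirrors the `body.top_p if body.top_p else TOP_P` pattern above:
    # any falsy body value (None, 0) falls back to the default.
    return body_value if body_value else env_default

os.environ["TOP_P"] = "0.1"            # e.g. set via the chart's env block
print(resolve(None, "TOP_P", "1.0"))   # 0.1  -> env default applies
print(resolve(0.95, "TOP_P", "1.0"))   # 0.95 -> request body overrides
```

Note that the truthiness check means an explicit 0 (or 0.0) in the request body falls back to the default rather than being honoured as a value.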

get_llm.py

 import os
-from ctransformers import LLM
+from ctransformers import LLM, AutoModelForCausalLM
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env
-from get_config import get_config
+from get_auto_config import get_auto_config

 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -17,42 +17,12 @@ async def get_llm(
         _type_: _description_
     """
-    ctransformer_model_type = "llama"
-    # These are also in "starcoder" format
-    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
-    # https://huggingface.co/TheBloke/minotaur-15B-GGML
-    if (
-        "star" in body.model
-        or "starchat" in body.model
-        or "WizardCoder" in body.model
-        or "minotaur-15" in body.model
-    ):
-        ctransformer_model_type = "gpt_bigcode"
-    if "llama" in body.model:
-        ctransformer_model_type = "llama"
-    if "mpt" in body.model:
-        ctransformer_model_type = "mpt"
-    if "replit" in body.model:
-        ctransformer_model_type = "replit"
-    if "falcon" in body.model:
-        ctransformer_model_type = "falcon"
-    if "dolly" in body.model:
-        ctransformer_model_type = "dolly-v2"
-    if "stablelm" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
-    if "stablecode" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/EleutherAI/pythia-70m
-    if "pythia" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    config = get_config(body)
-    MODE_TYPE = get_env("MODE_TYPE", "")
-    if len(MODE_TYPE) > 0:
-        ctransformer_model_type = MODE_TYPE
-    return LLM(
-        model_path=f"{os.getcwd()}/models/{body.model}",
-        model_type=ctransformer_model_type,
-        config=config,
-    )
+    auto_config = get_auto_config(body)
+    llm = AutoModelForCausalLM.from_pretrained(
+        model_path_or_repo_id=f"{os.getcwd()}/models/{body.model}",
+        local_files_only=True,
+        config=auto_config,
+    )
+    return llm
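
With get_llm reduced to the call above, loading a model amounts to wrapping the resolved Config in an AutoConfig that carries the ctransformers model type and handing it to AutoModelForCausalLM.from_pretrained. A hedged, self-contained sketch of that path (the parameter values and model path are placeholders, not taken from the commit):

```python
from ctransformers import AutoConfig, AutoModelForCausalLM, Config

# Sampling/runtime settings are baked into the config at load time.
config = Config(temperature=0.1, top_p=0.1, max_new_tokens=1024, gpu_layers=0)
auto_config = AutoConfig(config=config, model_type="llama")  # type as get_model_type would pick

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="./models/codellama-13b.ggmlv3.Q4_0.bin",  # placeholder local path
    local_files_only=True,
    config=auto_config,
)
print(llm("def hello():"))
```

Because the settings now travel with the loaded model object, model_generate and the streamers no longer need a separate config argument — which is exactly what the remaining hunks in this commit remove.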

get_model_type.py (new file)

from request_body import ChatCompletionRequestBody, CompletionRequestBody
from get_env import get_env


def get_model_type(
    body: ChatCompletionRequestBody | CompletionRequestBody,
) -> str:
    ctransformer_model_type = "llama"
    # These are also in "starcoder" format
    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
    # https://huggingface.co/TheBloke/minotaur-15B-GGML
    if (
        "star" in body.model
        or "starchat" in body.model
        or "WizardCoder" in body.model
        or "minotaur-15" in body.model
    ):
        ctransformer_model_type = "gpt_bigcode"
    if "llama" in body.model:
        ctransformer_model_type = "llama"
    if "mpt" in body.model:
        ctransformer_model_type = "mpt"
    if "replit" in body.model:
        ctransformer_model_type = "replit"
    if "falcon" in body.model:
        ctransformer_model_type = "falcon"
    if "dolly" in body.model:
        ctransformer_model_type = "dolly-v2"
    if "stablelm" in body.model:
        ctransformer_model_type = "gpt_neox"
    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
    if "stablecode" in body.model:
        ctransformer_model_type = "gpt_neox"
    # matching https://huggingface.co/EleutherAI/pythia-70m
    if "pythia" in body.model:
        ctransformer_model_type = "gpt_neox"
    MODE_TYPE = get_env("MODE_TYPE", "")
    if len(MODE_TYPE) > 0:
        ctransformer_model_type = MODE_TYPE
    return ctransformer_model_type
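
For a quick sense of what the lookup above produces, a stripped-down stand-in (illustration only; the real get_model_type takes a request body and also honours the MODE_TYPE environment-variable override, and its file names here are only examples):

```python
def model_type_for(model_name: str) -> str:
    # Simplified early-return version of the substring checks above.
    if any(s in model_name for s in ("star", "starchat", "WizardCoder", "minotaur-15")):
        return "gpt_bigcode"
    if "llama" in model_name:
        return "llama"
    if "falcon" in model_name:
        return "falcon"
    return "llama"  # same default as get_model_type

print(model_type_for("codellama-13b.ggmlv3.Q4_0.bin"))       # llama
print(model_type_for("WizardCoder-15B-1.0.ggmlv3.q4_0.bin"))  # gpt_bigcode
```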

main.py

@@ -22,7 +22,6 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from get_llm import get_llm
-from get_config import get_config

 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -77,7 +76,7 @@ async def startup_event():
         "Downloading model... %s/%s to %s/models",
         DEFAULT_MODEL_HG_REPO_ID,
         DEFAULT_MODEL_FILE,
-        os.getcwd()
+        os.getcwd(),
     )
     try:
         hf_hub_download(
@@ -141,7 +140,6 @@ async def completions(
             "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte."
         )
     prompt = body.prompt
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
@@ -151,7 +149,6 @@ async def completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -160,7 +157,6 @@ async def completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
@@ -254,7 +250,6 @@ async def chat_completions(
     )
     prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
@@ -263,7 +258,6 @@ async def chat_completions(
                 prompt,
                 model_name,
                 llm,
-                config,
                 log,
             ),
             media_type="text/event-stream",
@@ -272,6 +266,5 @@ async def chat_completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
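
Since the streamer functions now rely entirely on the config baked in at load time, a streaming request differs on the client side only by setting "stream": true. A sketch of consuming the server-sent events (not from the commit; it assumes the OpenAI-style /v1/completions route on localhost:8000 and that each chunk mirrors the OpenAI completions schema, choices[0].text):

```python
import json
import requests

with requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "codellama-13b.ggmlv3.Q4_0.bin",
        "prompt": "def fibonacci(n):",
        "stream": True,
    },
    stream=True,
    timeout=600,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        if line.strip() == b"data: [DONE]":  # OpenAI convention, if the server sends it
            break
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["choices"][0]["text"], end="", flush=True)
```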

model_generate.py

 from time import time
 from logging import Logger
-from ctransformers import LLM, Config
+from ctransformers import LLM

 def model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -15,40 +14,11 @@ def model_generate(
     """
     created = time()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm( # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
     )
     http_response = {
         "id": "id",
@@ -73,7 +43,6 @@ def chat_model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -81,37 +50,11 @@ def chat_model_generate(
     """
     created = time()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm( # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
     )
     http_response = {
         "id": "id",

requirements.txt

@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.22
+ctransformers==0.2.24
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0

streamers.py

@@ -2,14 +2,13 @@ import json
 from logging import Logger
 from os import times
-from ctransformers import LLM, Config
+from ctransformers import LLM

 def completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -17,43 +16,11 @@ def completions_streamer(
     """
     created = times()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance!")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
-        max_new_tokens=max_new_tokens,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(
@@ -96,7 +63,6 @@ def chat_completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -104,38 +70,11 @@ def chat_completions_streamer(
     """
     created = times()
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
     ):