diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index cc579dc94aaa5d2adc9036c091e4e3308ceedcc5..83908a7fafe48626ca7f485402f909161c6c18ad 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.10.1
+appVersion: 0.10.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.1
+version: 0.10.2
diff --git a/examples/values/codellama-13b-cuda12.yaml b/examples/values/codellama-13b-cuda12.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85d86f555afb1b0485d548bd48abd73f71ec76ec
--- /dev/null
+++ b/examples/values/codellama-13b-cuda12.yaml
@@ -0,0 +1,29 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/CodeLlama-13B-GGML
+    DEFAULT_MODEL_FILE: codellama-13b.ggmlv3.Q4_0.bin
+    GPU_LAYERS: 40
+    TOP_K: 40
+    TOP_P: 0.1
+    TEMPERATURE: 0.1
+    THREADS: 1
+    MAX_TOKENS: 1024
+    REPETITION_PENALTY: 1.8
+    LAST_N_TOKENS: 128
+resources:
+  {}
+model:
+  persistence:
+    size: 10Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_auto_config.py b/get_auto_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4390b04fa92bfaab876f641597b6510ea76a4327
--- /dev/null
+++ b/get_auto_config.py
@@ -0,0 +1,107 @@
+import logging
+from ctransformers import Config, AutoConfig
+
+from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env, get_env_or_none
+from get_default_thread import get_default_thread
+from get_model_type import get_model_type
+
+LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
+
+log = logging.getLogger("uvicorn")
+try:
+    log.setLevel(LOGGING_LEVEL)
+except ValueError:
+    log.setLevel("INFO")
+
+THREADS = int(get_env("THREADS", str(get_default_thread())))
+
+
+def get_auto_config(
+    body: CompletionRequestBody | ChatCompletionRequestBody,
+) -> AutoConfig:
+    # ggml only, follow ctransformers defaults
+    TOP_K = int(get_env("TOP_K", "40"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
+    TOP_P = float(get_env("TOP_P", "1.0"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-temperature
+    TEMPERATURE = float(get_env("TEMPERATURE", "1"))
+    # ggml only, follow ctransformers defaults
+    REPETITION_PENALTY = float(get_env("REPETITION_PENALTY", "1.1"))
+    # ggml only, follow ctransformers defaults
+    LAST_N_TOKENS = int(get_env("LAST_N_TOKENS", "64"))
+    # ggml only, follow ctransformers defaults
+    SEED = int(get_env("SEED", "-1"))
+    # ggml only, follow ctransformers defaults
+    BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
+    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
+    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
+    STOP = get_env_or_none("STOP")
+    # ggml only, follow ctransformers defaults
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    # the layers to offloading to the GPU
+    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
+
+    log.debug("TOP_K: %s", TOP_K)
+    log.debug("TOP_P: %s", TOP_P)
+    log.debug("TEMPERATURE: %s", TEMPERATURE)
+    log.debug("REPETITION_PENALTY: %s", REPETITION_PENALTY)
+    log.debug("LAST_N_TOKENS: %s", LAST_N_TOKENS)
+    log.debug("SEED: %s", SEED)
+    log.debug("BATCH_SIZE: %s", BATCH_SIZE)
+    log.debug("THREADS: %s", THREADS)
+    log.debug("MAX_TOKENS: %s", MAX_TOKENS)
+    log.debug("STOP: %s", STOP)
+    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
+
+    top_k = body.top_k if body.top_k else TOP_K
+    top_p = body.top_p if body.top_p else TOP_P
+    temperature = body.temperature if body.temperature else TEMPERATURE
+    repetition_penalty = body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS
+    seed = body.seed if body.seed else SEED
+    batch_size = body.batch_size if body.batch_size else BATCH_SIZE
+    threads = body.threads if body.threads else THREADS
+    max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS
+    stop = body.stop if body.stop else STOP
+
+    log.info("top_k: %s", top_k)
+    log.info("top_p: %s", top_p)
+    log.info("temperature: %s", temperature)
+    log.info("repetition_penalty: %s", repetition_penalty)
+    log.info("last_n_tokens: %s", last_n_tokens)
+    log.info("seed: %s", seed)
+    log.info("batch_size: %s", batch_size)
+    log.info("threads: %s", threads)
+    log.info("max_new_tokens: %s", max_new_tokens)
+    log.info("stop: %s", stop)
+
+    log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.info("GPU_LAYERS: %s", GPU_LAYERS)
+
+    config = Config(
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
+        context_length=CONTEXT_LENGTH,
+        gpu_layers=GPU_LAYERS,
+    )
+
+    model_type = get_model_type(body)
+
+    log.info("model_type: %s", model_type)
+
+    auto_config = AutoConfig(
+        config=config,
+        model_type=model_type,
+    )
+    return auto_config
diff --git a/get_config.py b/get_config.py
deleted file mode 100644
index 0c24e55f57600276f8c4ce02070be255ece36f45..0000000000000000000000000000000000000000
--- a/get_config.py
+++ /dev/null
@@ -1,70 +0,0 @@
-import logging
-from ctransformers import Config
-
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env, get_env_or_none
-from get_default_thread import get_default_thread
-
-LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
-
-log = logging.getLogger("uvicorn")
-try:
-    log.setLevel(LOGGING_LEVEL)
-except ValueError:
-    log.setLevel("INFO")
-
-THREADS = int(get_env("THREADS", str(get_default_thread())))
-
-
-def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Config:
-    # ggml only, follow ctransformers defaults
-    TOP_K = int(get_env("TOP_K", "40"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
-    TOP_P = float(get_env("TOP_P", "1.0"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-temperature
-    TEMPERATURE = float(get_env("TEMPERATURE", "1"))
-    # ggml only, follow ctransformers defaults
-    REPETITION_PENALTY = float(get_env("REPETITION_PENALTY", "1.1"))
-    # ggml only, follow ctransformers defaults
-    LAST_N_TOKENS = int(get_env("LAST_N_TOKENS", "64"))
-    # ggml only, follow ctransformers defaults
-    SEED = int(get_env("SEED", "-1"))
-    # ggml only, follow ctransformers defaults
-    BATCH_SIZE = int(get_env("BATCH_SIZE", "8"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-max_tokens
-    MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
-    # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
-    STOP = get_env_or_none("STOP")
-    # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
-    # the layers to offloading to the GPU
-    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
-
-    log.info("TOP_K: %s", TOP_K)
-    log.info("TOP_P: %s", TOP_P)
-    log.info("TEMPERATURE: %s", TEMPERATURE)
-    log.info("REPETITION_PENALTY: %s", REPETITION_PENALTY)
-    log.info("LAST_N_TOKENS: %s", LAST_N_TOKENS)
-    log.info("SEED: %s", SEED)
-    log.info("BATCH_SIZE: %s", BATCH_SIZE)
-    log.info("THREADS: %s", THREADS)
-    log.info("MAX_TOKENS: %s", MAX_TOKENS)
-    log.info("STOP: %s", STOP)
-    log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
-    log.info("GPU_LAYERS: %s", GPU_LAYERS)
-
-    config = Config(
-        top_k=body.top_k if body.top_k else TOP_K,
-        top_p=body.top_p if body.top_p else TOP_P,
-        temperature=body.temperature if body.temperature else TEMPERATURE,
-        repetition_penalty=body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY,
-        last_n_tokens=body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS,
-        seed=body.seed if body.seed else SEED,
-        batch_size=body.batch_size if body.batch_size else BATCH_SIZE,
-        threads=body.threads if body.threads else THREADS,
-        max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
-        stop=body.stop if body.stop else STOP,
-        context_length=CONTEXT_LENGTH,
-        gpu_layers=GPU_LAYERS,
-    )
-    return config
diff --git a/get_llm.py b/get_llm.py
index c92614c34057a54e41edd19308fe58c88981c17e..657f98d6bec249fbea4d97f87b4165a975226e73 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -1,9 +1,9 @@
 import os
-from ctransformers import LLM
+from ctransformers import LLM, AutoModelForCausalLM
 
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_env import get_env
-from get_config import get_config
+from get_auto_config import get_auto_config
+
 
 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -17,42 +17,12 @@ async def get_llm(
         _type_: _description_
     """
-    ctransformer_model_type = "llama"
-    # These are also in "starcoder" format
-    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
-    # https://huggingface.co/TheBloke/minotaur-15B-GGML
-    if (
-        "star" in body.model
-        or "starchat" in body.model
-        or "WizardCoder" in body.model
-        or "minotaur-15" in body.model
-    ):
-        ctransformer_model_type = "gpt_bigcode"
-    if "llama" in body.model:
-        ctransformer_model_type = "llama"
-    if "mpt" in body.model:
-        ctransformer_model_type = "mpt"
-    if "replit" in body.model:
-        ctransformer_model_type = "replit"
-    if "falcon" in body.model:
-        ctransformer_model_type = "falcon"
-    if "dolly" in body.model:
-        ctransformer_model_type = "dolly-v2"
-    if "stablelm" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
-    if "stablecode" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    # matching https://huggingface.co/EleutherAI/pythia-70m
-    if "pythia" in body.model:
-        ctransformer_model_type = "gpt_neox"
-    config = get_config(body)
-    MODE_TYPE = get_env("MODE_TYPE", "")
-    if len(MODE_TYPE) > 0:
-        ctransformer_model_type = MODE_TYPE
-
-    return LLM(
-        model_path=f"{os.getcwd()}/models/{body.model}",
-        model_type=ctransformer_model_type,
-        config=config,
+    auto_config = get_auto_config(body)
+
+    llm = AutoModelForCausalLM.from_pretrained(
+        model_path_or_repo_id=f"{os.getcwd()}/models/{body.model}",
+        local_files_only=True,
+        config=auto_config,
     )
+
+    return llm
 
diff --git a/get_model_type.py b/get_model_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f1bc9c8b2dc33746e66693c35cadfbe2b13fcfe
--- /dev/null
+++ b/get_model_type.py
@@ -0,0 +1,41 @@
+from request_body import ChatCompletionRequestBody, CompletionRequestBody
+from get_env import get_env
+
+
+def get_model_type(
+    body: ChatCompletionRequestBody | CompletionRequestBody,
+) -> str:
+    ctransformer_model_type = "llama"
+    # These are also in "starcoder" format
+    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
+    # https://huggingface.co/TheBloke/minotaur-15B-GGML
+    if (
+        "star" in body.model
+        or "starchat" in body.model
+        or "WizardCoder" in body.model
+        or "minotaur-15" in body.model
+    ):
+        ctransformer_model_type = "gpt_bigcode"
+    if "llama" in body.model:
+        ctransformer_model_type = "llama"
+    if "mpt" in body.model:
+        ctransformer_model_type = "mpt"
+    if "replit" in body.model:
+        ctransformer_model_type = "replit"
+    if "falcon" in body.model:
+        ctransformer_model_type = "falcon"
+    if "dolly" in body.model:
+        ctransformer_model_type = "dolly-v2"
+    if "stablelm" in body.model:
+        ctransformer_model_type = "gpt_neox"
+    # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
+    if "stablecode" in body.model:
+        ctransformer_model_type = "gpt_neox"
+    # matching https://huggingface.co/EleutherAI/pythia-70m
+    if "pythia" in body.model:
+        ctransformer_model_type = "gpt_neox"
+
+    MODE_TYPE = get_env("MODE_TYPE", "")
+    if len(MODE_TYPE) > 0:
+        ctransformer_model_type = MODE_TYPE
+    return ctransformer_model_type
diff --git a/main.py b/main.py
index e3e927ea2f8a2ad08102073ea081b7d140d52f9a..79cc9cf6dc2caee1e3cea5cfaa3fc1a606981dba 100644
--- a/main.py
+++ b/main.py
@@ -22,7 +22,6 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from get_llm import get_llm
-from get_config import get_config
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
 )
@@ -77,7 +76,7 @@ async def startup_event():
         "Downloading model... %s/%s to %s/models",
         DEFAULT_MODEL_HG_REPO_ID,
         DEFAULT_MODEL_FILE,
-        os.getcwd()
+        os.getcwd(),
     )
     try:
         hf_hub_download(
@@ -141,7 +140,6 @@ async def completions(
             "n, logit_bias, user, presence_penalty and frequency_penalty are not supporte."
         )
     prompt = body.prompt
-    config = get_config(body)
     model_name = body.model
 
     if body.stream is True:
@@ -151,7 +149,6 @@ async def completions(
             prompt,
             model_name,
             llm,
-            config,
             log,
         ),
         media_type="text/event-stream",
@@ -160,7 +157,6 @@ async def completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
 
@@ -254,7 +250,6 @@ async def chat_completions(
     )
 
     prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
-    config = get_config(body)
     model_name = body.model
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
@@ -263,7 +258,6 @@ async def chat_completions(
             prompt,
             model_name,
             llm,
-            config,
             log,
         ),
         media_type="text/event-stream",
@@ -272,6 +266,5 @@ async def chat_completions(
         prompt,
         model_name,
         llm,
-        config,
         log,
     )
diff --git a/model_generate.py b/model_generate.py
index f6412c37e7d12d7926fc2652a6ece0f3e9e16cb1..abe76b3a851a5f07c586a4812ad7a1245e234c32 100644
--- a/model_generate.py
+++ b/model_generate.py
@@ -1,13 +1,12 @@
 from time import time
 from logging import Logger
-from ctransformers import LLM, Config
+from ctransformers import LLM
 
 
 def model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -15,40 +14,11 @@ def model_generate(
     """
     created = time()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
-        max_new_tokens=max_new_tokens,
     )
     http_response = {
         "id": "id",
@@ -73,7 +43,6 @@ def chat_model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -81,37 +50,11 @@ def chat_model_generate(
     """
     created = time()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
-    gpu_layers = config.gpu_layers
-    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        batch_size=batch_size,
-        threads=threads,
     )
     http_response = {
         "id": "id",
diff --git a/requirements.txt b/requirements.txt
index 1c96282fffda11266fa46a13c75d68a5d5fb4199..4838f45c591cdfd2b9f72d3814e3da030d841ce4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.22
+ctransformers==0.2.24
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
diff --git a/streamers.py b/streamers.py
index 1b4e3499437f8cb5a9fd7257c8886b36352a766e..7eb9ba157e781c2916b812cff0a7b4b5a1855ca5 100644
--- a/streamers.py
+++ b/streamers.py
@@ -2,14 +2,13 @@ import json
 from logging import Logger
 from os import times
 
-from ctransformers import LLM, Config
+from ctransformers import LLM
 
 
 def completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -17,43 +16,11 @@ def completions_streamer(
     """
     created = times()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    max_new_tokens = config.max_new_tokens
-    log.debug("max_new_tokens: %s", max_new_tokens)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("thread: %s", threads)
     log.debug("prompt: %s", prompt)
 
     log.debug("Streaming from ctransformer instance!")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
-        stream=True,
-        reset=True,
-        max_new_tokens=max_new_tokens,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(
@@ -96,7 +63,6 @@ def chat_completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    config: Config,
     log: Logger,
 ):
     """_summary_
@@ -104,38 +70,11 @@ def chat_completions_streamer(
     """
     created = times()
 
-    top_k = config.top_k
-    log.debug("top_k: %s", top_k)
-    top_p = config.top_p
-    log.debug("top_p: %s", top_p)
-    temperature = config.temperature
-    log.debug("temperature: %s", temperature)
-    repetition_penalty = config.repetition_penalty
-    log.debug("repetition_penalty: %s", repetition_penalty)
-    last_n_tokens = config.last_n_tokens
-    log.debug("last_n_tokens: %s", last_n_tokens)
-    seed = config.seed
-    log.debug("seed: %s", seed)
-    stop = config.stop
-    log.debug("stop: %s", stop)
-    batch_size = config.batch_size
-    log.debug("batch_size: %s", batch_size)
-    threads = config.threads
-    log.debug("threads: %s", threads)
     log.debug("prompt: %s", prompt)
 
     log.debug("Streaming from ctransformer instance")
     for token in llm(
         prompt,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        repetition_penalty=repetition_penalty,
-        last_n_tokens=last_n_tokens,
-        seed=seed,
-        stop=stop,
-        batch_size=batch_size,
-        threads=threads,
         stream=True,
         reset=True,
     ):
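Note: the sketch below is not part of the patch. It is a minimal illustration, under stated assumptions, of the flow this change introduces: sampling settings are gathered once into a ctransformers `Config`, wrapped in an `AutoConfig` together with the detected model type, and attached to the model via `AutoModelForCausalLM.from_pretrained`, so request handlers no longer pass per-call sampling kwargs. The model path below is a placeholder; adjust it to wherever your GGML file lives.

```python
# Sketch only: mirrors the new get_auto_config/get_llm flow from this diff.
from ctransformers import AutoConfig, AutoModelForCausalLM, Config

# Generation settings are resolved once (env vars / request body in ialacol)
# instead of being passed on every llm() call.
config = Config(top_k=40, top_p=1.0, temperature=1.0, gpu_layers=0)
auto_config = AutoConfig(config=config, model_type="llama")

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="./models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    local_files_only=True,
    config=auto_config,
)

# Streaming now only needs stream/reset flags; everything else comes from the config.
for token in llm("def fib(n):", stream=True, reset=True):
    print(token, end="", flush=True)
```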