diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 2728cc05a0a0ef5131979b8ef7f6325230bad592..1c19c57f51a05dde1fa72dfde54e0fe7e83e8566 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.10.3
+appVersion: 0.10.4
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.3
+version: 0.10.4
diff --git a/charts/ialacol/templates/deployment.yaml b/charts/ialacol/templates/deployment.yaml
index 96c57f2a9cfaf2d52a2f87092ff065196fabdccd..f17c2cf967700d59d9eff6d3ac93ed425c3bcd8a 100644
--- a/charts/ialacol/templates/deployment.yaml
+++ b/charts/ialacol/templates/deployment.yaml
@@ -29,8 +29,6 @@ spec:
               value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
             - name: DEFAULT_MODEL_FILE
               value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
-            - name: DOWNLOAD_DEFAULT_MODEL
-              value: {{ (.Values.deployment.env).DOWNLOAD_DEFAULT_MODEL | quote }}
            - name: LOGGING_LEVEL
              value: {{ (.Values.deployment.env).LOGGING_LEVEL | quote }}
            - name: TOP_K
diff --git a/get_auto_config.py b/get_config.py
similarity index 64%
rename from get_auto_config.py
rename to get_config.py
index 4390b04fa92bfaab876f641597b6510ea76a4327..94a51ff2877d1e6e61664258e46399b6297bd684 100644
--- a/get_auto_config.py
+++ b/get_config.py
@@ -1,25 +1,16 @@
-import logging
-from ctransformers import Config, AutoConfig
+from ctransformers import Config
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env, get_env_or_none
 from get_default_thread import get_default_thread
-from get_model_type import get_model_type
-
-LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
-
-log = logging.getLogger("uvicorn")
-try:
-    log.setLevel(LOGGING_LEVEL)
-except ValueError:
-    log.setLevel("INFO")
+from log import log
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
 
-def get_auto_config(
+def get_config(
     body: CompletionRequestBody | ChatCompletionRequestBody,
-) -> AutoConfig:
+) -> Config:
     # ggml only, follow ctransformers defaults
     TOP_K = int(get_env("TOP_K", "40"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-top_p
@@ -38,10 +29,6 @@ def get_auto_config(
     MAX_TOKENS = int(get_env("MAX_TOKENS", "9999999"))
     # OpenAI API defaults https://platform.openai.com/docs/api-reference/chat/create#chat/create-stop
     STOP = get_env_or_none("STOP")
-    # ggml only, follow ctransformers defaults
-    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
-    # the layers to offloading to the GPU
-    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
 
     log.debug("TOP_K: %s", TOP_K)
     log.debug("TOP_P: %s", TOP_P)
@@ -53,13 +40,13 @@ def get_auto_config(
     log.debug("THREADS: %s", THREADS)
     log.debug("MAX_TOKENS: %s", MAX_TOKENS)
     log.debug("STOP: %s", STOP)
-    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
-    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
 
     top_k = body.top_k if body.top_k else TOP_K
     top_p = body.top_p if body.top_p else TOP_P
     temperature = body.temperature if body.temperature else TEMPERATURE
-    repetition_penalty = body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    repetition_penalty = (
+        body.repetition_penalty if body.repetition_penalty else REPETITION_PENALTY
+    )
     last_n_tokens = body.last_n_tokens if body.last_n_tokens else LAST_N_TOKENS
     seed = body.seed if body.seed else SEED
     batch_size = body.batch_size if body.batch_size else BATCH_SIZE
@@ -67,20 +54,6 @@ def get_auto_config(
     max_new_tokens = body.max_tokens if body.max_tokens else MAX_TOKENS
     stop = body.stop if body.stop else STOP
 
-    log.info("top_k: %s", top_k)
-    log.info("top_p: %s", top_p)
-    log.info("temperature: %s", temperature)
-    log.info("repetition_penalty: %s", repetition_penalty)
-    log.info("last_n_tokens: %s", last_n_tokens)
-    log.info("seed: %s", seed)
-    log.info("batch_size: %s", batch_size)
-    log.info("threads: %s", threads)
-    log.info("max_new_tokens: %s", max_new_tokens)
-    log.info("stop: %s", stop)
-
-    log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
-    log.info("GPU_LAYERS: %s", GPU_LAYERS)
-
     config = Config(
         top_k=top_k,
         top_p=top_p,
@@ -92,16 +65,6 @@ def get_auto_config(
         threads=threads,
         max_new_tokens=max_new_tokens,
         stop=stop,
-        context_length=CONTEXT_LENGTH,
-        gpu_layers=GPU_LAYERS,
     )
 
-    model_type = get_model_type(body)
-
-    log.info("model_type: %s", model_type)
-
-    auto_config = AutoConfig(
-        config=config,
-        model_type=model_type,
-    )
-    return auto_config
+    return config
diff --git a/get_llm.py b/get_llm.py
deleted file mode 100644
index 657f98d6bec249fbea4d97f87b4165a975226e73..0000000000000000000000000000000000000000
--- a/get_llm.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-
-from ctransformers import LLM, AutoModelForCausalLM
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
-from get_auto_config import get_auto_config
-
-
-async def get_llm(
-    body: ChatCompletionRequestBody | CompletionRequestBody,
-) -> LLM:
-    """_summary_
-
-    Args:
-        body (ChatCompletionRequestBody): _description_
-
-    Returns:
-        _type_: _description_
-    """
-
-    auto_config = get_auto_config(body)
-
-    llm = AutoModelForCausalLM.from_pretrained(
-        model_path_or_repo_id=f"{os.getcwd()}/models/{body.model}",
-        local_files_only=True,
-        config=auto_config,
-    )
-
-    return llm
diff --git a/get_model_type.py b/get_model_type.py
index 2f1bc9c8b2dc33746e66693c35cadfbe2b13fcfe..b71ede8e30891ea7e0c2f7f247744dad44818216 100644
--- a/get_model_type.py
+++ b/get_model_type.py
@@ -3,36 +3,36 @@ from get_env import get_env
 
 
 def get_model_type(
-    body: ChatCompletionRequestBody | CompletionRequestBody,
+    filename: str,
 ) -> str:
     ctransformer_model_type = "llama"
     # These are also in "starcoder" format
     # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
     # https://huggingface.co/TheBloke/minotaur-15B-GGML
     if (
-        "star" in body.model
-        or "starchat" in body.model
-        or "WizardCoder" in body.model
-        or "minotaur-15" in body.model
+        "star" in filename
+        or "starchat" in filename
+        or "WizardCoder" in filename
+        or "minotaur-15" in filename
     ):
         ctransformer_model_type = "gpt_bigcode"
-    if "llama" in body.model:
+    if "llama" in filename:
         ctransformer_model_type = "llama"
-    if "mpt" in body.model:
+    if "mpt" in filename:
         ctransformer_model_type = "mpt"
-    if "replit" in body.model:
+    if "replit" in filename:
         ctransformer_model_type = "replit"
-    if "falcon" in body.model:
+    if "falcon" in filename:
         ctransformer_model_type = "falcon"
-    if "dolly" in body.model:
+    if "dolly" in filename:
         ctransformer_model_type = "dolly-v2"
-    if "stablelm" in body.model:
+    if "stablelm" in filename:
         ctransformer_model_type = "gpt_neox"
     # matching https://huggingface.co/stabilityai/stablecode-completion-alpha-3b
-    if "stablecode" in body.model:
+    if "stablecode" in filename:
         ctransformer_model_type = "gpt_neox"
     # matching https://huggingface.co/EleutherAI/pythia-70m
-    if "pythia" in body.model:
+    if "pythia" in filename:
         ctransformer_model_type = "gpt_neox"
 
     MODE_TYPE = get_env("MODE_TYPE", "")
diff --git a/log.py b/log.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f5930b47747b7715da78a0a3897b065cb91782c
--- /dev/null
+++ b/log.py
@@ -0,0 +1,12 @@
+import logging
+
+from get_env import get_env
+
+
+LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
+
+log = logging.getLogger("uvicorn")
+try:
+    log.setLevel(LOGGING_LEVEL)
+except ValueError:
+    log.setLevel("INFO")
diff --git a/main.py b/main.py
index 79cc9cf6dc2caee1e3cea5cfaa3fc1a606981dba..9ccbee9db0537b066e813f28dea0cfb61c6cc404 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,6 @@
 This module contains the main FastAPI application.
 """
 import os
-import logging
 
 from typing import (
     Awaitable,
@@ -11,33 +10,30 @@ from typing import (
     Union,
     Annotated,
 )
-from fastapi import FastAPI, Depends, HTTPException, Body
+from fastapi import FastAPI, Depends, HTTPException, Body, Request
 from fastapi.responses import StreamingResponse
-from ctransformers import LLM
+from ctransformers import LLM, Config
 from huggingface_hub import hf_hub_download
+from get_config import get_config
+from get_model_type import get_model_type
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from response_body import ChatCompletionResponseBody, CompletionResponseBody
 from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
-from get_llm import get_llm
+from log import log
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
 )
 DEFAULT_MODEL_FILE = get_env("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin")
-DOWNLOAD_DEFAULT_MODEL = get_env("DOWNLOAD_DEFAULT_MODEL", "True") == "True"
-LOGGING_LEVEL = get_env("LOGGING_LEVEL", "INFO")
-
-log = logging.getLogger("uvicorn")
 
 log.info("DEFAULT_MODEL_HG_REPO_ID: %s", DEFAULT_MODEL_HG_REPO_ID)
 log.info("DEFAULT_MODEL_FILE: %s", DEFAULT_MODEL_FILE)
-log.info("DOWNLOAD_DEFAULT_MODEL: %s", DOWNLOAD_DEFAULT_MODEL)
-log.info("LOGGING_LEVEL: %s", LOGGING_LEVEL)
 
 DOWNLOADING_MODEL = False
+LOADING_MODEL = False
 
 
 def set_downloading_model(boolean: bool):
@@ -47,7 +43,16 @@ def set_downloading_model(boolean: bool):
         boolean (bool): the boolean value to set DOWNLOADING_MODEL to
     """
     globals()["DOWNLOADING_MODEL"] = boolean
-    log.info("DOWNLOADING_MODEL set to %s", globals()["DOWNLOADING_MODEL"])
+    log.debug("DOWNLOADING_MODEL set to %s", globals()["DOWNLOADING_MODEL"])
+
+def set_loading_model(boolean: bool):
+    """_summary_
+
+    Args:
+        boolean (bool): the boolean value to set LOADING_MODEL to
+    """
+    globals()["LOADING_MODEL"] = boolean
+    log.debug("LOADING_MODEL set to %s", globals()["LOADING_MODEL"])
 
 
 Sender = Callable[[Union[str, bytes]], Awaitable[None]]
@@ -63,32 +68,50 @@ async def startup_event():
     Starts up the server, setting log level, downloading the default model if necessary.
     """
     log.info("Starting up...")
-    try:
-        log.setLevel(LOGGING_LEVEL)
-        log.info("Log level set to %s", LOGGING_LEVEL)
-    except ValueError:
-        log.setLevel("INFO")
-        log.info("Unknown Log level %s, fallback to INFO", LOGGING_LEVEL)
-    if DOWNLOAD_DEFAULT_MODEL is True:
-        if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
-            set_downloading_model(True)
-            log.info(
-                "Downloading model... %s/%s to %s/models",
-                DEFAULT_MODEL_HG_REPO_ID,
-                DEFAULT_MODEL_FILE,
-                os.getcwd(),
+    if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+        set_downloading_model(True)
+        log.info(
+            "Downloading model... %s/%s to %s/models",
+            DEFAULT_MODEL_HG_REPO_ID,
+            DEFAULT_MODEL_FILE,
+            os.getcwd(),
+        )
+        try:
+            hf_hub_download(
+                repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                cache_dir="models/.cache",
+                local_dir="models",
+                filename=DEFAULT_MODEL_FILE,
+                resume_download=True,
             )
-            try:
-                hf_hub_download(
-                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
-                    cache_dir="models/.cache",
-                    local_dir="models",
-                    filename=DEFAULT_MODEL_FILE,
-                )
-            except Exception as exception:
-                log.error("Error downloading model: %s", exception)
-            finally:
-                set_downloading_model(False)
+        except Exception as exception:
+            log.error("Error downloading model: %s", exception)
+        finally:
+            set_downloading_model(False)
+
+    # ggml only, follow ctransformers defaults
+    CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    # the layers to offloading to the GPU
+    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
+
+    log.debug("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.debug("GPU_LAYERS: %s", GPU_LAYERS)
+
+    config = Config(
+        context_length=CONTEXT_LENGTH,
+        gpu_layers=GPU_LAYERS,
+    )
+    model_type = get_model_type(DEFAULT_MODEL_FILE)
+    log.info("Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s", model_type, DEFAULT_MODEL_FILE)
+    set_loading_model(True)
+    llm = LLM(
+        model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+        config=config,
+        model_type=model_type,
+    )
+    log.info("llm singleton created.")
+    app.state.llm = llm
+    set_loading_model(False)
@@ -100,6 +123,8 @@ async def models():
     """
     if DOWNLOADING_MODEL is True:
         raise HTTPException(status_code=503, detail="Downloading model")
+    if LOADING_MODEL is True:
+        raise HTTPException(status_code=503, detail="Loading model in memory")
     return {
         "data": [
             {
@@ -116,7 +141,8 @@ async def models():
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
-    llm: Annotated[LLM, Depends(get_llm)],
+    config: Annotated[Config, Depends(get_config)],
+    request: Request,
 ):
     """_summary_
     Compatible with https://platform.openai.com/docs/api-reference/completions
@@ -142,29 +168,21 @@ async def completions(
 
     prompt = body.prompt
     model_name = body.model
+    llm = request.app.state.llm
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
         return StreamingResponse(
-            completions_streamer(
-                prompt,
-                model_name,
-                llm,
-                log,
-            ),
+            completions_streamer(prompt, model_name, llm, config),
             media_type="text/event-stream",
         )
-    return model_generate(
-        prompt,
-        model_name,
-        llm,
-        log,
-    )
+    return model_generate(prompt, model_name, llm, config)
 
 
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponseBody)
 async def chat_completions(
     body: Annotated[ChatCompletionRequestBody, Body()],
-    llm: Annotated[LLM, Depends(get_llm)],
+    config: Annotated[Config, Depends(get_config)],
+    request: Request,
 ):
     """_summary_
     Compatible with https://platform.openai.com/docs/api-reference/chat
@@ -251,20 +269,11 @@ async def chat_completions(
 
     prompt = f"{system_message_content}{assistant_message_content} {default_user_start}{user_message_content}{default_user_end} {default_assistant_start}"
     model_name = body.model
+    llm = request.app.state.llm
     if body.stream is True:
         log.debug("Streaming response from %s", model_name)
         return StreamingResponse(
-            chat_completions_streamer(
-                prompt,
-                model_name,
-                llm,
-                log,
-            ),
+            chat_completions_streamer(prompt, model_name, llm, config),
             media_type="text/event-stream",
         )
-    return chat_model_generate(
-        prompt,
-        model_name,
-        llm,
-        log,
-    )
+    return chat_model_generate(prompt, model_name, llm, config)
diff --git a/model_generate.py b/model_generate.py
index abe76b3a851a5f07c586a4812ad7a1245e234c32..76466848edbf087223e5360a9c6eb2fb36391aa3 100644
--- a/model_generate.py
+++ b/model_generate.py
@@ -1,24 +1,57 @@
 from time import time
-from logging import Logger
 
-from ctransformers import LLM
+from ctransformers import LLM, Config
+
+from log import log
 
 
 def model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    log: Logger,
+    config: Config,
 ):
     """_summary_
     returns the response body for /chat/completions
     """
     created = time()
+    top_k = config.top_k
+    log.debug("top_k: %s", top_k)
+    top_p = config.top_p
+    log.debug("top_p: %s", top_p)
+    temperature = config.temperature
+    log.debug("temperature: %s", temperature)
+    repetition_penalty = config.repetition_penalty
+    log.debug("repetition_penalty: %s", repetition_penalty)
+    last_n_tokens = config.last_n_tokens
+    log.debug("last_n_tokens: %s", last_n_tokens)
+    seed = config.seed
+    log.debug("seed: %s", seed)
+    batch_size = config.batch_size
+    log.debug("batch_size: %s", batch_size)
+    threads = config.threads
+    log.debug("threads: %s", threads)
+    max_new_tokens = config.max_new_tokens
+    log.debug("max_new_tokens: %s", max_new_tokens)
+    stop = config.stop
+    log.debug("stop: %s", stop)
 
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
+        stream=False,
+        reset=True,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
     )
     http_response = {
         "id": "id",
@@ -43,18 +76,51 @@ def chat_model_generate(
     prompt: str,
     model_name: str,
     llm: LLM,
-    log: Logger,
+    config: Config,
 ):
     """_summary_
     returns the response body for /chat/completions
     """
     created = time()
+    top_k = config.top_k
+    log.debug("top_k: %s", top_k)
+    top_p = config.top_p
+    log.debug("top_p: %s", top_p)
+    temperature = config.temperature
+    log.debug("temperature: %s", temperature)
+    repetition_penalty = config.repetition_penalty
+    log.debug("repetition_penalty: %s", repetition_penalty)
+    last_n_tokens = config.last_n_tokens
+    log.debug("last_n_tokens: %s", last_n_tokens)
+    seed = config.seed
+    log.debug("seed: %s", seed)
+    batch_size = config.batch_size
+    log.debug("batch_size: %s", batch_size)
+    threads = config.threads
+    log.debug("threads: %s", threads)
+    max_new_tokens = config.max_new_tokens
+    log.debug("max_new_tokens: %s", max_new_tokens)
+    stop = config.stop
+    log.debug("stop: %s", stop)
+    log.debug("prompt: %s", prompt)
 
     log.debug("prompt: %s", prompt)
     log.debug("Getting from ctransformer instance")
     result: str = llm(  # pyright: ignore [reportGeneralTypeIssues]
         prompt=prompt,
+        stream=False,
+        reset=True,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
     )
     http_response = {
         "id": "id",
diff --git a/streamers.py b/streamers.py
index 7eb9ba157e781c2916b812cff0a7b4b5a1855ca5..d5a05afef2e90745bfc4e65397846f408a86afa6 100644
--- a/streamers.py
+++ b/streamers.py
@@ -1,26 +1,58 @@
 import json
-from logging import Logger
 from os import times
+from ctransformers import LLM, Config
 
-from ctransformers import LLM
+from log import log
 
 
 def completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    log: Logger,
+    config: Config,
 ):
     """_summary_
     returns a generator that yields a stream of responses
     """
     created = times()
+    top_k = config.top_k
+    log.debug("top_k: %s", top_k)
+    top_p = config.top_p
+    log.debug("top_p: %s", top_p)
+    temperature = config.temperature
+    log.debug("temperature: %s", temperature)
+    repetition_penalty = config.repetition_penalty
+    log.debug("repetition_penalty: %s", repetition_penalty)
+    last_n_tokens = config.last_n_tokens
+    log.debug("last_n_tokens: %s", last_n_tokens)
+    seed = config.seed
+    log.debug("seed: %s", seed)
+    batch_size = config.batch_size
+    log.debug("batch_size: %s", batch_size)
+    threads = config.threads
+    log.debug("threads: %s", threads)
+    max_new_tokens = config.max_new_tokens
+    log.debug("max_new_tokens: %s", max_new_tokens)
+    stop = config.stop
+    log.debug("stop: %s", stop)
 
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance!")
     for token in llm(
         prompt,
+        stream=True,
+        reset=True,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(
@@ -63,13 +95,33 @@ def chat_completions_streamer(
     prompt: str,
     model_name: str,
     llm: LLM,
-    log: Logger,
+    config: Config,
 ):
     """_summary_
     returns a generator that yields a stream of responses
     """
     created = times()
+    top_k = config.top_k
+    log.debug("top_k: %s", top_k)
+    top_p = config.top_p
+    log.debug("top_p: %s", top_p)
+    temperature = config.temperature
+    log.debug("temperature: %s", temperature)
+    repetition_penalty = config.repetition_penalty
+    log.debug("repetition_penalty: %s", repetition_penalty)
+    last_n_tokens = config.last_n_tokens
+    log.debug("last_n_tokens: %s", last_n_tokens)
+    seed = config.seed
+    log.debug("seed: %s", seed)
+    batch_size = config.batch_size
+    log.debug("batch_size: %s", batch_size)
+    threads = config.threads
+    log.debug("threads: %s", threads)
+    max_new_tokens = config.max_new_tokens
+    log.debug("max_new_tokens: %s", max_new_tokens)
+    stop = config.stop
+    log.debug("stop: %s", stop)
 
     log.debug("prompt: %s", prompt)
     log.debug("Streaming from ctransformer instance")
@@ -77,6 +129,16 @@ def chat_completions_streamer(
         prompt,
         stream=True,
         reset=True,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        last_n_tokens=last_n_tokens,
+        seed=seed,
+        batch_size=batch_size,
+        threads=threads,
+        max_new_tokens=max_new_tokens,
+        stop=stop,
     ):
         log.debug("Streaming token %s", token)
         data = json.dumps(