diff --git a/Dockerfile.gptq b/Dockerfile.gptq
index 964a3b7dc57d9600640a60d5566a4367e099cc73..d62efa6bae1621abe9faf6e81b2df0a65cee6096 100644
--- a/Dockerfile.gptq
+++ b/Dockerfile.gptq
@@ -1,9 +1,15 @@
 # syntax=docker/dockerfile:1
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+RUN apt-get update \
+  && apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
+  && rm -rf /var/lib/apt/lists/* \
+  && apt-get purge -y --auto-remove g++ python3-dev python3-pip
 WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
+# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
+RUN pip3 install numpy
 # https://github.com/marella/ctransformers#gptq
 RUN pip3 install ctransformers[gptq]
 COPY . .
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index c404da0c256faaa59daf2bc053307842e1efb083..de634c1a33fa9346a68c671f393750d42bc88ff5 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.1
+version: 0.11.2
diff --git a/get_model_type.py b/get_model_type.py
index b71ede8e30891ea7e0c2f7f247744dad44818216..f064e45709d06837dc51a561d6e9b2daaa12283d 100644
--- a/get_model_type.py
+++ b/get_model_type.py
@@ -1,4 +1,3 @@
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env
 
 
@@ -6,6 +5,7 @@ def get_model_type(
     filename: str,
 ) -> str:
     ctransformer_model_type = "llama"
+    filename = filename.lower()
     # These are also in "starcoder" format
     # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
     # https://huggingface.co/TheBloke/minotaur-15B-GGML
@@ -34,6 +34,15 @@ def get_model_type(
     # matching https://huggingface.co/EleutherAI/pythia-70m
     if "pythia" in filename:
         ctransformer_model_type = "gpt_neox"
+    # codegen family are in gptj, codegen2 isn't but not supported by ggml/ctransformer yet
+    # https://huggingface.co/Salesforce/codegen-2B-multi
+    # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+    if "codegen" in filename:
+        ctransformer_model_type = "gptj"
+
+    DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+    if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+        ctransformer_model_type = "gptq"
 
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
diff --git a/main.py b/main.py
index a0daab5fce08260c6d790f8f8f78db1f500d297b..75286c36bb4e05f5e5070c319c6d4fe998440167 100644
--- a/main.py
+++ b/main.py
@@ -12,8 +12,8 @@ from typing import (
 )
 from fastapi import FastAPI, Depends, HTTPException, Body, Request
 from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
 
 from get_config import get_config
 from get_model_type import get_model_type
@@ -70,22 +70,37 @@ async def startup_event():
     Starts up the server, setting log level, downloading the default model if necessary.
     """
     log.info("Starting up...")
-    if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+    model_type = get_model_type(DEFAULT_MODEL_FILE)
+    if DEFAULT_MODEL_HG_REPO_ID:
         set_downloading_model(True)
-        log.info(
-            "Downloading model... %s/%s to %s/models",
-            DEFAULT_MODEL_HG_REPO_ID,
-            DEFAULT_MODEL_FILE,
-            os.getcwd(),
-        )
+
         try:
-            hf_hub_download(
-                repo_id=DEFAULT_MODEL_HG_REPO_ID,
-                cache_dir="models/.cache",
-                local_dir="models",
-                filename=DEFAULT_MODEL_FILE,
-                resume_download=True,
-            )
+            if model_type == "gptq":
+                log.info(
+                    "Downloading repo %s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    os.getcwd(),
+                )
+                snapshot_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    resume_download=True,
+                )
+            elif DEFAULT_MODEL_FILE:
+                log.info(
+                    "Downloading model... %s/%s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    DEFAULT_MODEL_FILE,
+                    os.getcwd(),
+                )
+                hf_hub_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    filename=DEFAULT_MODEL_FILE,
+                    resume_download=True,
+                )
         except Exception as exception:
             log.error("Error downloading model: %s", exception)
         finally:
@@ -103,20 +118,29 @@ async def startup_event():
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    model_type = get_model_type(DEFAULT_MODEL_FILE)
+
     log.info(
-        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        "Creating llm singleton with model_type: %s",
         model_type,
-        DEFAULT_MODEL_FILE,
     )
     set_loading_model(True)
-    llm = LLM(
-        model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-        config=config,
-        model_type=model_type,
-    )
+    if model_type == "gptq":
+        log.debug("Creating llm/gptq instance...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            model_path_or_repo_id=f"{os.getcwd()}/models",
+            model_type="gptq",
+            local_files_only=True,
+        )
+        app.state.llm = llm
+    else:
+        log.debug("Creating llm/ggml instance...")
+        llm = LLM(
+            model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+            config=config,
+            model_type=model_type,
+        )
+        app.state.llm = llm
     log.info("llm singleton created.")
-    app.state.llm = llm
     set_loading_model(False)
@@ -143,6 +167,7 @@ async def models():
         "object": "list",
     }
 
+
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
@@ -182,6 +207,7 @@ async def completions(
     )
     return model_generate(prompt, model_name, llm, config)
 
+
 @app.post("/v1/engines/{engine}/completions")
 async def engine_completions(
     # Can't use body as FastAPI require corrent context-type header
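
Note on the get_model_type.py change: filenames are lowercased before matching, "codegen" now maps to the gptj loader, and "gptq" in either the filename or DEFAULT_MODEL_HG_REPO_ID selects the gptq backend. A quick spot-check could look like the sketch below; it assumes the unchanged branches of get_model_type() behave as in the repository and that no MODE_TYPE or DEFAULT_MODEL_HG_REPO_ID override is set in the environment, and the filenames are illustrative rather than taken from this PR.

# spot_check_model_type.py -- hypothetical, not part of this change
from get_model_type import get_model_type

# codegen family now resolves to the gptj loader
assert get_model_type("codegen-2B-multi-ggml-q4_0.bin") == "gptj"
# matching is case-insensitive because of filename.lower()
assert get_model_type("Pythia-70M-q4_0.bin") == "gpt_neox"
# "gptq" in the filename (or in DEFAULT_MODEL_HG_REPO_ID) takes precedence
assert get_model_type("llama-2-7b-GPTQ-4bit.safetensors") == "gptq"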
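
Note on the main.py change: when model_type is "gptq", the whole Hugging Face repo is snapshot-downloaded (GPTQ models ship as a directory of safetensors plus config files rather than a single GGML file) and loaded through ctransformers' AutoModelForCausalLM, which is also why Dockerfile.gptq now builds on a CUDA base image. Below is a minimal standalone sketch of that path, assuming ctransformers[gptq] is installed and a CUDA GPU is available; the repo id and the generation kwargs are placeholders, not part of this PR.

# gptq_smoke_test.py -- hypothetical sketch of the new startup path
import os

from ctransformers import AutoModelForCausalLM
from huggingface_hub import snapshot_download

REPO_ID = "TheBloke/Llama-2-7B-GPTQ"  # placeholder; any repo id containing "gptq"

# Download the whole repo instead of a single file (hf_hub_download).
snapshot_download(
    repo_id=REPO_ID,
    cache_dir="models/.cache",
    local_dir="models",
    resume_download=True,
)

# Load from the local directory, mirroring the model_type == "gptq" branch.
llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id=f"{os.getcwd()}/models",
    model_type="gptq",
    local_files_only=True,
)

print(llm("def fibonacci(n):", max_new_tokens=64))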