Unverified commit 4d1fc25e authored by Henry Chen, committed by GitHub

Fixes for gptq image, improve `codegen` mapping (to gptj) (#64)

parent 4f651e38
# syntax=docker/dockerfile:1
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
RUN apt-get update \
&& apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
  && rm -rf /var/lib/apt/lists/*
-  && apt-get purge -y --auto-remove g++ python3-dev python3-pip
WORKDIR /app
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
RUN pip3 install numpy
# https://github.com/marella/ctransformers#gptq
RUN pip3 install ctransformers[gptq]
COPY . .
......
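For context, the ctransformers[gptq] extra installed above is what the new load path in the application code further down relies on. A minimal standalone sketch of that load path, assuming a local directory of GPTQ-quantized weights (the path, prompt, and generation parameters are placeholders):

from ctransformers import AutoModelForCausalLM

# Load GPTQ weights from a local directory; model_type="gptq"
# selects ctransformers' GPTQ (exllama-backed) loader.
llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="/app/models",  # placeholder path
    model_type="gptq",
    local_files_only=True,
)
print(llm("def fibonacci(n):", max_new_tokens=64))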
apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
description: A Helm chart for ialacol
name: ialacol
type: application
-version: 0.11.1
+version: 0.11.2
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
from get_env import get_env
@@ -6,6 +5,7 @@ def get_model_type(
    filename: str,
) -> str:
    ctransformer_model_type = "llama"
+   filename = filename.lower()
    # These are also in "starcoder" format
    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
    # https://huggingface.co/TheBloke/minotaur-15B-GGML
......@@ -34,6 +34,15 @@ def get_model_type(
    # matching https://huggingface.co/EleutherAI/pythia-70m
    if "pythia" in filename:
        ctransformer_model_type = "gpt_neox"
+   # The codegen family is in gptj format; codegen2 isn't, but codegen2
+   # is not supported by ggml/ctransformers yet anyway
+   # https://huggingface.co/Salesforce/codegen-2B-multi
+   # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+   if "codegen" in filename:
+       ctransformer_model_type = "gptj"
+   DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+   if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+       ctransformer_model_type = "gptq"
    MODE_TYPE = get_env("MODE_TYPE", "")
    if len(MODE_TYPE) > 0:
......
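For illustration, the mapping above behaves roughly as follows (a sketch assuming the DEFAULT_MODEL_HG_REPO_ID and MODE_TYPE environment variables are unset; the filenames are made up):

from get_model_type import get_model_type

assert get_model_type("pythia-70m-q4_0.bin") == "gpt_neox"         # pythia family
assert get_model_type("CodeGen-2B-multi-ggml-q4_0.bin") == "gptj"  # lowercased first, then matched
assert get_model_type("some-model-GPTQ.safetensors") == "gptq"     # gptq check runs last
assert get_model_type("unrecognized-model.bin") == "llama"         # default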
@@ -12,8 +12,8 @@ from typing import (
)
from fastapi import FastAPI, Depends, HTTPException, Body, Request
from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
from get_config import get_config
from get_model_type import get_model_type
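The new snapshot_download import matters because a GPTQ repo ships several files (config, tokenizer, quantized safetensors), while a single GGML binary can be fetched on its own. A comparison sketch with placeholder repo ids:

from huggingface_hub import hf_hub_download, snapshot_download

# Single-file fetch: enough for one GGML .bin artifact.
hf_hub_download(
    repo_id="TheBloke/some-model-GGML",  # placeholder
    filename="some-model.q4_0.bin",
    local_dir="models",
)

# Whole-repo fetch: mirrors config.json, tokenizer files, *.safetensors.
snapshot_download(
    repo_id="TheBloke/some-model-GPTQ",  # placeholder
    local_dir="models",
)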
@@ -70,22 +70,37 @@ async def startup_event():
    """
    Starts up the server, setting the log level and downloading the default model if necessary.
    """
    log.info("Starting up...")
-   if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+   model_type = get_model_type(DEFAULT_MODEL_FILE)
+   if DEFAULT_MODEL_HG_REPO_ID:
        set_downloading_model(True)
-       log.info(
-           "Downloading model... %s/%s to %s/models",
-           DEFAULT_MODEL_HG_REPO_ID,
-           DEFAULT_MODEL_FILE,
-           os.getcwd(),
-       )
        try:
-           hf_hub_download(
-               repo_id=DEFAULT_MODEL_HG_REPO_ID,
-               cache_dir="models/.cache",
-               local_dir="models",
-               filename=DEFAULT_MODEL_FILE,
-               resume_download=True,
-           )
+           if model_type == "gptq":
+               log.info(
+                   "Downloading repo %s to %s/models",
+                   DEFAULT_MODEL_HG_REPO_ID,
+                   os.getcwd(),
+               )
+               snapshot_download(
+                   repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                   cache_dir="models/.cache",
+                   local_dir="models",
+                   resume_download=True,
+               )
+           elif DEFAULT_MODEL_FILE:
+               log.info(
+                   "Downloading model... %s/%s to %s/models",
+                   DEFAULT_MODEL_HG_REPO_ID,
+                   DEFAULT_MODEL_FILE,
+                   os.getcwd(),
+               )
+               hf_hub_download(
+                   repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                   cache_dir="models/.cache",
+                   local_dir="models",
+                   filename=DEFAULT_MODEL_FILE,
+                   resume_download=True,
+               )
        except Exception as exception:
            log.error("Error downloading model: %s", exception)
        finally:
@@ -103,20 +118,29 @@ async def startup_event():
        context_length=CONTEXT_LENGTH,
        gpu_layers=GPU_LAYERS,
    )
-   model_type = get_model_type(DEFAULT_MODEL_FILE)
    log.info(
-       "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+       "Creating llm singleton with model_type: %s",
        model_type,
-       DEFAULT_MODEL_FILE,
    )
    set_loading_model(True)
-   llm = LLM(
-       model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-       config=config,
-       model_type=model_type,
-   )
+   if model_type == "gptq":
+       log.debug("Creating llm/gptq instance...")
+       llm = AutoModelForCausalLM.from_pretrained(
+           model_path_or_repo_id=f"{os.getcwd()}/models",
+           model_type="gptq",
+           local_files_only=True,
+       )
+       app.state.llm = llm
+   else:
+       log.debug("Creating llm/ggml instance...")
+       llm = LLM(
+           model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+           config=config,
+           model_type=model_type,
+       )
+       app.state.llm = llm
    log.info("llm singleton created.")
-   app.state.llm = llm
    set_loading_model(False)
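Both branches leave a ctransformers LLM instance on app.state.llm, which keeps the request handlers backend-agnostic. A minimal sketch of the interface downstream code can rely on (prompt and parameters are placeholders):

llm = app.state.llm
# ctransformers exposes the same callable generation interface
# for both the gptq and ggml load paths.
text = llm("Hello", max_new_tokens=16)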
@@ -143,6 +167,7 @@ async def models():
"object": "list",
}
@app.post("/v1/completions", response_model=CompletionResponseBody)
async def completions(
body: Annotated[CompletionRequestBody, Body()],
@@ -182,6 +207,7 @@ async def completions(
)
return model_generate(prompt, model_name, llm, config)
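As a usage sketch, the completions endpoint above follows the OpenAI schema, so a plain HTTP client works against it (host, port, model name, and the choice of httpx are illustrative, not part of this change):

import httpx

response = httpx.post(
    "http://localhost:8000/v1/completions",
    json={"model": "llama-2-7b.ggmlv3.q4_0.bin", "prompt": "Hello"},
)
print(response.json())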
@app.post("/v1/engines/{engine}/completions")
async def engine_completions(
    # Can't use body as FastAPI requires the correct content-type header
......