Unverified commit 4d1fc25e authored by Henry Chen, committed by GitHub

Fixes for gptq image, improve `codegen` mapping (to gptj) (#64)

parent 4f651e38
# syntax=docker/dockerfile:1
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
RUN apt-get update \
&& apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
  && rm -rf /var/lib/apt/lists/*
-  && apt-get purge -y --auto-remove g++ python3-dev python3-pip
WORKDIR /app
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
RUN pip3 install numpy
# https://github.com/marella/ctransformers#gptq
RUN pip3 install ctransformers[gptq]
COPY . .
......
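For context, the ctransformers[gptq] extra installed above is what the new load path in the application code further down relies on. A minimal standalone sketch of that load path, assuming a local directory of GPTQ-quantized weights (the path, prompt, and generation parameters are placeholders):

from ctransformers import AutoModelForCausalLM

# Load GPTQ weights from a local directory; model_type="gptq"
# selects ctransformers' GPTQ (exllama-backed) loader.
llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="/app/models",  # placeholder path
    model_type="gptq",
    local_files_only=True,
)
print(llm("def fibonacci(n):", max_new_tokens=64))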
apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
description: A Helm chart for ialacol
name: ialacol
type: application
-version: 0.11.1
+version: 0.11.2
-from request_body import ChatCompletionRequestBody, CompletionRequestBody
from get_env import get_env
@@ -6,6 +5,7 @@ def get_model_type(
    filename: str,
) -> str:
    ctransformer_model_type = "llama"
+   filename = filename.lower()
    # These are also in "starcoder" format
    # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
    # https://huggingface.co/TheBloke/minotaur-15B-GGML
......@@ -34,6 +34,15 @@ def get_model_type(
    # matching https://huggingface.co/EleutherAI/pythia-70m
    if "pythia" in filename:
        ctransformer_model_type = "gpt_neox"
+   # The codegen family is in gptj format; codegen2 isn't, but codegen2
+   # is not supported by ggml/ctransformers yet anyway
+   # https://huggingface.co/Salesforce/codegen-2B-multi
+   # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+   if "codegen" in filename:
+       ctransformer_model_type = "gptj"
+   DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+   if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+       ctransformer_model_type = "gptq"
    MODE_TYPE = get_env("MODE_TYPE", "")
    if len(MODE_TYPE) > 0:
......
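For illustration, the mapping above behaves roughly as follows (a sketch assuming the DEFAULT_MODEL_HG_REPO_ID and MODE_TYPE environment variables are unset; the filenames are made up):

from get_model_type import get_model_type

assert get_model_type("pythia-70m-q4_0.bin") == "gpt_neox"         # pythia family
assert get_model_type("CodeGen-2B-multi-ggml-q4_0.bin") == "gptj"  # lowercased first, then matched
assert get_model_type("some-model-GPTQ.safetensors") == "gptq"     # gptq check runs last
assert get_model_type("unrecognized-model.bin") == "llama"         # default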
@@ -12,8 +12,8 @@ from typing import (
)
from fastapi import FastAPI, Depends, HTTPException, Body, Request
from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
from get_config import get_config
from get_model_type import get_model_type
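The new snapshot_download import matters because a GPTQ repo ships several files (config, tokenizer, quantized safetensors), while a single GGML binary can be fetched on its own. A comparison sketch with placeholder repo ids:

from huggingface_hub import hf_hub_download, snapshot_download

# Single-file fetch: enough for one GGML .bin artifact.
hf_hub_download(
    repo_id="TheBloke/some-model-GGML",  # placeholder
    filename="some-model.q4_0.bin",
    local_dir="models",
)

# Whole-repo fetch: mirrors config.json, tokenizer files, *.safetensors.
snapshot_download(
    repo_id="TheBloke/some-model-GPTQ",  # placeholder
    local_dir="models",
)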
@@ -70,22 +70,37 @@ async def startup_event():
    """
    Starts up the server, setting the log level and downloading the default model if necessary.
    """
    log.info("Starting up...")
-   if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+   model_type = get_model_type(DEFAULT_MODEL_FILE)
+   if DEFAULT_MODEL_HG_REPO_ID:
        set_downloading_model(True)
-       log.info(
-           "Downloading model... %s/%s to %s/models",
-           DEFAULT_MODEL_HG_REPO_ID,
-           DEFAULT_MODEL_FILE,
-           os.getcwd(),
-       )
        try:
-           hf_hub_download(
-               repo_id=DEFAULT_MODEL_HG_REPO_ID,
-               cache_dir="models/.cache",
-               local_dir="models",
-               filename=DEFAULT_MODEL_FILE,
-               resume_download=True,
-           )
+           if model_type == "gptq":
+               log.info(
+                   "Downloading repo %s to %s/models",
+                   DEFAULT_MODEL_HG_REPO_ID,
+                   os.getcwd(),
+               )
+               snapshot_download(
+                   repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                   cache_dir="models/.cache",
+                   local_dir="models",
+                   resume_download=True,
+               )
+           elif DEFAULT_MODEL_FILE:
+               log.info(
+                   "Downloading model... %s/%s to %s/models",
+                   DEFAULT_MODEL_HG_REPO_ID,
+                   DEFAULT_MODEL_FILE,
+                   os.getcwd(),
+               )
+               hf_hub_download(
+                   repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                   cache_dir="models/.cache",
+                   local_dir="models",
+                   filename=DEFAULT_MODEL_FILE,
+                   resume_download=True,
+               )
        except Exception as exception:
            log.error("Error downloading model: %s", exception)
        finally:
@@ -103,20 +118,29 @@ async def startup_event():
        context_length=CONTEXT_LENGTH,
        gpu_layers=GPU_LAYERS,
    )
-   model_type = get_model_type(DEFAULT_MODEL_FILE)
    log.info(
-       "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+       "Creating llm singleton with model_type: %s",
        model_type,
-       DEFAULT_MODEL_FILE,
    )
    set_loading_model(True)
-   llm = LLM(
-       model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-       config=config,
-       model_type=model_type,
-   )
+   if model_type == "gptq":
+       log.debug("Creating llm/gptq instance...")
+       llm = AutoModelForCausalLM.from_pretrained(
+           model_path_or_repo_id=f"{os.getcwd()}/models",
+           model_type="gptq",
+           local_files_only=True,
+       )
+       app.state.llm = llm
+   else:
+       log.debug("Creating llm/ggml instance...")
+       llm = LLM(
+           model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+           config=config,
+           model_type=model_type,
+       )
+       app.state.llm = llm
    log.info("llm singleton created.")
-   app.state.llm = llm
    set_loading_model(False)
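Both branches leave a ctransformers LLM instance on app.state.llm, which keeps the request handlers backend-agnostic. A minimal sketch of the interface downstream code can rely on (prompt and parameters are placeholders):

llm = app.state.llm
# ctransformers exposes the same callable generation interface
# for both the gptq and ggml load paths.
text = llm("Hello", max_new_tokens=16)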
@@ -143,6 +167,7 @@ async def models():
"object": "list",
}
@app.post("/v1/completions", response_model=CompletionResponseBody)
async def completions(
body: Annotated[CompletionRequestBody, Body()],
@@ -182,6 +207,7 @@ async def completions(
)
return model_generate(prompt, model_name, llm, config)
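As a usage sketch, the completions endpoint above follows the OpenAI schema, so a plain HTTP client works against it (host, port, model name, and the choice of httpx are illustrative, not part of this change):

import httpx

response = httpx.post(
    "http://localhost:8000/v1/completions",
    json={"model": "llama-2-7b.ggmlv3.q4_0.bin", "prompt": "Hello"},
)
print(response.json())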
@app.post("/v1/engines/{engine}/completions")
async def engine_completions(
    # Can't use body as FastAPI requires the correct content-type header
......