From 28d515db9f255a52d1d08bebe9c99293292b88cd Mon Sep 17 00:00:00 2001 From: Henry Chen <1474479+chenhunghan@users.noreply.github.com> Date: Wed, 9 Aug 2023 19:55:30 +0300 Subject: [PATCH] Add experimental support for GPTQ models (#50) Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com> --- .github/workflows/gptq_image.yaml | 46 ++++++++++++++++++++++++ Dockerfile.gptq | 11 ++++++ README.md | 18 ++++++++++ charts/ialacol/Chart.yaml | 4 +-- examples/values/llama2-7b-chat-gptq.yaml | 30 ++++++++++++++++ 5 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/gptq_image.yaml create mode 100644 Dockerfile.gptq create mode 100644 examples/values/llama2-7b-chat-gptq.yaml diff --git a/.github/workflows/gptq_image.yaml b/.github/workflows/gptq_image.yaml new file mode 100644 index 0000000..35c973c --- /dev/null +++ b/.github/workflows/gptq_image.yaml @@ -0,0 +1,46 @@ +name: Build and Push GPTQ Image to Github Container Registry + +on: + push: + branches: + - main + paths: + - '**.py' + - 'requirements.txt' + - 'Dockerfile.gptq' + - '.github/workflows/gptq_image.yaml' + +env: + REGISTRY: ghcr.io + GPTQ_IMAGE_NAME: ialacol-gptq +jobs: + gptq_image_to_gcr: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + - uses: docker/login-action@v2 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v4 + with: + images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }} + - name: Build and push Docker image + uses: docker/build-push-action@v4 + with: + context: . 
+ file: ./Dockerfile.gptq + push: true + tags: | + ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:${{ github.sha }} + ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:latest + labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile.gptq b/Dockerfile.gptq new file mode 100644 index 0000000..964a3b7 --- /dev/null +++ b/Dockerfile.gptq @@ -0,0 +1,11 @@ +# syntax=docker/dockerfile:1 + +FROM python:3.11-slim +WORKDIR /app +COPY requirements.txt requirements.txt +RUN pip3 install -r requirements.txt +# https://github.com/marella/ctransformers#gptq +RUN pip3 install ctransformers[gptq] +COPY . . +EXPOSE 8000 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index 876c20d..5e7a9d6 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,24 @@ For example helm install llama2-7b-chat-metal ialacol/ialacol -f examples/values/llama2-7b-chat-metal.yaml.yaml ``` +### GPTQ + +To use GPTQ, you must + +- `deployment.image` = `ghcr.io/chenhunghan/ialacol-gptq:latest` +- `deployment.env.MODEL_TYPE` = `gptq` + +For example + +```sh +helm install llama2-7b-chat-gptq ialacol/ialacol -f examples/values/llama2-7b-chat-gptq.yaml +``` + +```sh +kubectl port-forward svc/llama2-7b-chat-gptq 8000:8000 +openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m gptq_model-4bit-128g.safetensors -g user "Hello world!" +``` + ## Tips ### Creative v.s. 
Conservative diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml index 316ae40..736a142 100644 --- a/charts/ialacol/Chart.yaml +++ b/charts/ialacol/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 -appVersion: 0.8.0 +appVersion: 0.9.0 description: A Helm chart for ialacol name: ialacol type: application -version: 0.8.0 +version: 0.9.0 diff --git a/examples/values/llama2-7b-chat-gptq.yaml b/examples/values/llama2-7b-chat-gptq.yaml new file mode 100644 index 0000000..c318435 --- /dev/null +++ b/examples/values/llama2-7b-chat-gptq.yaml @@ -0,0 +1,30 @@ +replicas: 1 +deployment: + image: ghcr.io/chenhunghan/ialacol-gptq:latest + env: + DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7b-Chat-GPTQ + DEFAULT_MODEL_FILE: gptq_model-4bit-128g.safetensors + MODEL_TYPE: "gptq" +resources: + {} +cache: + persistence: + size: 5Gi + accessModes: + - ReadWriteOnce + storageClassName: ~ +cacheMountPath: /app/cache +model: + persistence: + size: 5Gi + accessModes: + - ReadWriteOnce + storageClassName: ~ +modelMountPath: /app/models +service: + type: ClusterIP + port: 8000 + annotations: {} +nodeSelector: {} +tolerations: [] +affinity: {} -- GitLab