From 28d515db9f255a52d1d08bebe9c99293292b88cd Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Wed, 9 Aug 2023 19:55:30 +0300
Subject: [PATCH] Add experimental support for GPTQ models (#50)

Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
---
 .github/workflows/gptq_image.yaml        | 46 ++++++++++++++++++++++++
 Dockerfile.gptq                          | 11 ++++++
 README.md                                | 18 ++++++++++
 charts/ialacol/Chart.yaml                |  4 +--
 examples/values/llama2-7b-chat-gptq.yaml | 30 ++++++++++++++++
 5 files changed, 107 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/gptq_image.yaml
 create mode 100644 Dockerfile.gptq
 create mode 100644 examples/values/llama2-7b-chat-gptq.yaml

diff --git a/.github/workflows/gptq_image.yaml b/.github/workflows/gptq_image.yaml
new file mode 100644
index 0000000..35c973c
--- /dev/null
+++ b/.github/workflows/gptq_image.yaml
@@ -0,0 +1,46 @@
+name: Build and Push GPTQ Image to Github Container Registry
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '**.py'
+      - 'requirements.txt'
+      - 'Dockerfile.gptq'
+      - '.github/workflows/gptq_image.yaml'
+
+env:
+  REGISTRY: ghcr.io
+  GPTQ_IMAGE_NAME: ialacol-gptq
+jobs:
+  gptq_image_to_gcr:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.gptq
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:${{ github.sha }}
+            ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.GPTQ_IMAGE_NAME }}:latest
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile.gptq b/Dockerfile.gptq
new file mode 100644
index 0000000..964a3b7
--- /dev/null
+++ b/Dockerfile.gptq
@@ -0,0 +1,11 @@
+# syntax=docker/dockerfile:1
+
+FROM python:3.11-slim
+WORKDIR /app
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# https://github.com/marella/ctransformers#gptq
+RUN pip3 install ctransformers[gptq]
+COPY . .
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index 876c20d..5e7a9d6 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,24 @@ For example
 helm install llama2-7b-chat-metal ialacol/ialacol -f examples/values/llama2-7b-chat-metal.yaml.yaml
 ```
 
+### GPTQ
+
+To use GPTQ, you must
+
+- `deployment.image` = `ghcr.io/chenhunghan/ialacol-gptq:latest`
+- `deployment.env.MODEL_TYPE` = `gptq`
+
+For example
+
+```sh
+helm install llama2-7b-chat-gptq ialacol/ialacol -f examples/values/llama2-7b-chat-gptq.yaml
+```
+
+```sh
+kubectl port-forward svc/llama2-7b-chat-gptq 8000:8000
+openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m gptq_model-4bit-128g.safetensors -g user "Hello world!"
+```
+
 ## Tips
 
 ### Creative v.s. Conservative
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 316ae40..736a142 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.8.0
+appVersion: 0.9.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.8.0
+version: 0.9.0
diff --git a/examples/values/llama2-7b-chat-gptq.yaml b/examples/values/llama2-7b-chat-gptq.yaml
new file mode 100644
index 0000000..c318435
--- /dev/null
+++ b/examples/values/llama2-7b-chat-gptq.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-gptq:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7b-Chat-GPTQ
+    DEFAULT_MODEL_FILE: gptq_model-4bit-128g.safetensors
+    MODEL_TYPE: "gptq"
+resources:
+  {}
+cache:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
-- 
GitLab