From 06da3dbceac44250379c3db6f41db35d801e7498 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Sun, 13 Aug 2023 08:46:01 +0300
Subject: [PATCH] =?UTF-8?q?Upgrade=20ctransformer=20to=200.2.22,=20add=20G?=
 =?UTF-8?q?PU=20support=20for=20StarCoder,=20make=20=E2=80=A6=20(#51)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
---
 README.md                                     | 16 ++++++++--
 charts/ialacol/Chart.yaml                     |  4 +--
 charts/ialacol/values.yaml                    |  2 +-
 ...cuda11.yaml => llama2-7b-chat-cuda12.yaml} |  2 +-
 .../values/starcoderplus-guanaco-cuda12.yaml  | 30 +++++++++++++++++++
 get_llm.py                                    |  2 +-
 requirements.txt                              |  2 +-
 7 files changed, 49 insertions(+), 9 deletions(-)
 rename examples/values/{llama2-7b-chat-cuda11.yaml => llama2-7b-chat-cuda12.yaml} (91%)
 create mode 100644 examples/values/starcoderplus-guanaco-cuda12.yaml

diff --git a/README.md b/README.md
index 1c3439d..1c518f7 100644
--- a/README.md
+++ b/README.md
@@ -86,13 +86,23 @@ To enable GPU/CUDA acceleration, you need to use the container image built for G
 
 - `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda12:latest`
 - `deployment.env.GPU_LAYERS` is the layer to off loading to GPU.
 
-For example
+Only `llama`, `falcon`, `mpt` and `gpt_bigcode` (StarCoder/StarChat) support CUDA.
+
+#### Llama with CUDA12
+
+```sh
+helm install llama2-7b-chat-cuda12 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda12.yaml
+```
+
+Deploys the llama2 7b model with 40 layers offloaded to the GPU. The inference is accelerated by CUDA 12.
+
+#### StarCoderPlus with CUDA12
 
 ```sh
-helm install llama2-7b-chat-cuda11 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda11.yaml
+helm install starcoderplus-guanaco-cuda12 ialacol/ialacol -f examples/values/starcoderplus-guanaco-cuda12.yaml
 ```
 
-Deploys llama2 7b model with 40 layers offloadind to GPU. The inference is accelerated by CUDA 11.
+Deploys the [Starcoderplus-Guanaco-GPT4-15B-V1.0 model](https://huggingface.co/LoupGarou/Starcoderplus-Guanaco-GPT4-15B-V1.0) with 40 layers offloaded to the GPU. The inference is accelerated by CUDA 12.
 
 ### CUDA Driver Issues
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 736a142..38c8bb0 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.9.0
+appVersion: 0.10.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.9.0
+version: 0.10.0
diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml
index 31519fb..4e16a4a 100644
--- a/charts/ialacol/values.yaml
+++ b/charts/ialacol/values.yaml
@@ -2,7 +2,7 @@ replicas: 1
 deployment:
   image: quay.io/chenhunghan/ialacol:latest
-  # or use CUDA11 image `ghcr.io/chenhunghan/ialacol-cuda11:latest`
+  # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest`
   # env:
   #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
   #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/llama2-7b-chat-cuda11.yaml b/examples/values/llama2-7b-chat-cuda12.yaml
similarity index 91%
rename from examples/values/llama2-7b-chat-cuda11.yaml
rename to examples/values/llama2-7b-chat-cuda12.yaml
index 81dfe01..5fb64da 100644
--- a/examples/values/llama2-7b-chat-cuda11.yaml
+++ b/examples/values/llama2-7b-chat-cuda12.yaml
@@ -1,6 +1,6 @@
 replicas: 1
 deployment:
-  image: ghcr.io/chenhunghan/ialacol-cuda11:latest
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
   env:
     DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
     DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/starcoderplus-guanaco-cuda12.yaml b/examples/values/starcoderplus-guanaco-cuda12.yaml
new file mode 100644
index 0000000..74ca583
--- /dev/null
+++ b/examples/values/starcoderplus-guanaco-cuda12.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Starcoderplus-Guanaco-GPT4-15B-V1.0-GGML
+    DEFAULT_MODEL_FILE: starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin
+    GPU_LAYERS: 40
+resources:
+  {}
+cache:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_llm.py b/get_llm.py
index b82a73d..6159ef5 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -25,7 +25,7 @@ async def get_llm(
         or "WizardCoder" in body.model
         or "minotaur-15" in body.model
     ):
-        ctransformer_model_type = "starcoder"
+        ctransformer_model_type = "gpt_bigcode"
     if "llama" in body.model:
         ctransformer_model_type = "llama"
     if "mpt" in body.model:
diff --git a/requirements.txt b/requirements.txt
index 8e9d765..1c96282 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.21
+ctransformers==0.2.22
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
-- 
GitLab
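
Reviewer note: to sanity-check the `"starcoder"` → `"gpt_bigcode"` change in `get_llm.py` outside the cluster, the sketch below loads the same GGML model directly with the public ctransformers API. This is not part of the patch; the repo id, model file, model type, and layer count come from the changes above, while the prompt and `max_new_tokens` are illustrative values.

```python
# Standalone check of the gpt_bigcode model type used after this patch.
# Assumes ctransformers==0.2.22 built with CUDA 12; on a CPU-only build
# the gpu_layers argument has no effect.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Starcoderplus-Guanaco-GPT4-15B-V1.0-GGML",
    model_file="starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin",
    model_type="gpt_bigcode",  # was "starcoder" before this patch
    gpu_layers=40,             # mirrors GPU_LAYERS: 40 in the example values
)

print(llm("def fibonacci(n):", max_new_tokens=64))
```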
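
Once the Helm release is up, an end-to-end smoke test can go through the OpenAI-compatible API that ialacol exposes on the chart's port 8000. A minimal sketch using the pre-1.0 `openai` Python client; the service name is hypothetical (it depends on the release name) and assumes the port has been forwarded locally first.

```python
# Hypothetical smoke test against a deployed release, assuming
#   kubectl port-forward svc/starcoderplus-guanaco-cuda12 8000:8000
# has been run beforehand. Uses the pre-1.0 openai client API.
import openai

openai.api_base = "http://localhost:8000/v1"
openai.api_key = "placeholder"  # dummy value; auth is assumed to be disabled

completion = openai.Completion.create(
    model="starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin",  # the model file served
    prompt="def fibonacci(n):",
    max_tokens=64,
)
print(completion.choices[0].text)
```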