From 06da3dbceac44250379c3db6f41db35d801e7498 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Sun, 13 Aug 2023 08:46:01 +0300
Subject: [PATCH] =?UTF-8?q?Upgrade=20ctransformer=20to=200.2.22,=20add=20G?=
 =?UTF-8?q?PU=20support=20for=20StarCoder,=20make=20=E2=80=A6=20(#51)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>
---
 README.md                                     | 16 ++++++++--
 charts/ialacol/Chart.yaml                     |  4 +--
 charts/ialacol/values.yaml                    |  2 +-
 ...cuda11.yaml => llama2-7b-chat-cuda12.yaml} |  2 +-
 .../values/starcoderplus-guanaco-cuda12.yaml  | 30 +++++++++++++++++++
 get_llm.py                                    |  2 +-
 requirements.txt                              |  2 +-
 7 files changed, 49 insertions(+), 9 deletions(-)
 rename examples/values/{llama2-7b-chat-cuda11.yaml => llama2-7b-chat-cuda12.yaml} (91%)
 create mode 100644 examples/values/starcoderplus-guanaco-cuda12.yaml

diff --git a/README.md b/README.md
index 1c3439d..1c518f7 100644
--- a/README.md
+++ b/README.md
@@ -86,13 +86,23 @@ To enable GPU/CUDA acceleration, you need to use the container image built for G
 
 - `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda12:latest`
 - `deployment.env.GPU_LAYERS` is the layer to off loading to GPU.
 
-For example
+Only `llama`, `falcon`, `mpt` and `gpt_bigcode` (StarCoder/StarChat) support CUDA.
+
+#### Llama with CUDA12
+
+```sh
+helm install llama2-7b-chat-cuda12 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda12.yaml
+```
+
+Deploys the llama2 7b model with 40 layers offloaded to the GPU. The inference is accelerated by CUDA 12.
+
+#### StarCoderPlus with CUDA12
 
 ```sh
-helm install llama2-7b-chat-cuda11 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda11.yaml
+helm install starcoderplus-guanaco-cuda12 ialacol/ialacol -f examples/values/starcoderplus-guanaco-cuda12.yaml
 ```
 
-Deploys llama2 7b model with 40 layers offloadind to GPU. The inference is accelerated by CUDA 11.
+Deploys the [Starcoderplus-Guanaco-GPT4-15B-V1.0 model](https://huggingface.co/LoupGarou/Starcoderplus-Guanaco-GPT4-15B-V1.0) with 40 layers offloaded to the GPU. The inference is accelerated by CUDA 12.
 
 ### CUDA Driver Issues
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 736a142..38c8bb0 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.9.0
+appVersion: 0.10.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.9.0
+version: 0.10.0
diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml
index 31519fb..4e16a4a 100644
--- a/charts/ialacol/values.yaml
+++ b/charts/ialacol/values.yaml
@@ -2,7 +2,7 @@ replicas: 1
 deployment:
   image: quay.io/chenhunghan/ialacol:latest
-  # or use CUDA11 image `ghcr.io/chenhunghan/ialacol-cuda11:latest`
+  # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest`
   # env:
   #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
   #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/llama2-7b-chat-cuda11.yaml b/examples/values/llama2-7b-chat-cuda12.yaml
similarity index 91%
rename from examples/values/llama2-7b-chat-cuda11.yaml
rename to examples/values/llama2-7b-chat-cuda12.yaml
index 81dfe01..5fb64da 100644
--- a/examples/values/llama2-7b-chat-cuda11.yaml
+++ b/examples/values/llama2-7b-chat-cuda12.yaml
@@ -1,6 +1,6 @@
 replicas: 1
 deployment:
-  image: ghcr.io/chenhunghan/ialacol-cuda11:latest
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
   env:
     DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
     DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/starcoderplus-guanaco-cuda12.yaml b/examples/values/starcoderplus-guanaco-cuda12.yaml
new file mode 100644
index 0000000..74ca583
--- /dev/null
+++ b/examples/values/starcoderplus-guanaco-cuda12.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda12:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Starcoderplus-Guanaco-GPT4-15B-V1.0-GGML
+    DEFAULT_MODEL_FILE: starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin
+    GPU_LAYERS: 40
+resources:
+  {}
+cache:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 20Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_llm.py b/get_llm.py
index b82a73d..6159ef5 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -25,7 +25,7 @@ async def get_llm(
         or "WizardCoder" in body.model
         or "minotaur-15" in body.model
     ):
-        ctransformer_model_type = "starcoder"
+        ctransformer_model_type = "gpt_bigcode"
     if "llama" in body.model:
         ctransformer_model_type = "llama"
     if "mpt" in body.model:
diff --git a/requirements.txt b/requirements.txt
index 8e9d765..1c96282 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.21
+ctransformers==0.2.22
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
-- 
GitLab
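
Reviewer note: to sanity-check the `"starcoder"` → `"gpt_bigcode"` change in `get_llm.py` outside the cluster, the sketch below loads the same GGML model directly with the public ctransformers API. This is not part of the patch; the repo id, model file, model type, and layer count come from the changes above, while the prompt and `max_new_tokens` are illustrative values.

```python
# Standalone check of the gpt_bigcode model type used after this patch.
# Assumes ctransformers==0.2.22 built with CUDA 12; on a CPU-only build
# the gpu_layers argument has no effect.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Starcoderplus-Guanaco-GPT4-15B-V1.0-GGML",
    model_file="starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin",
    model_type="gpt_bigcode",  # was "starcoder" before this patch
    gpu_layers=40,             # mirrors GPU_LAYERS: 40 in the example values
)

print(llm("def fibonacci(n):", max_new_tokens=64))
```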
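
Once the Helm release is up, an end-to-end smoke test can go through the OpenAI-compatible API that ialacol exposes on the chart's port 8000. A minimal sketch using the pre-1.0 `openai` Python client; the service name is hypothetical (it depends on the release name) and assumes the port has been forwarded locally first.

```python
# Hypothetical smoke test against a deployed release, assuming
#   kubectl port-forward svc/starcoderplus-guanaco-cuda12 8000:8000
# has been run beforehand. Uses the pre-1.0 openai client API.
import openai

openai.api_base = "http://localhost:8000/v1"
openai.api_key = "placeholder"  # dummy value; auth is assumed to be disabled

completion = openai.Completion.create(
    model="starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin",  # the model file served
    prompt="def fibonacci(n):",
    max_tokens=64,
)
print(completion.choices[0].text)
```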