diff --git a/README.md b/README.md index 1c3439d9b701795e3bbb305d092a97f85a3d4d8a..1c518f76a81e4218684cadcccd82398c50b7faa9 100644 --- a/README.md +++ b/README.md @@ -86,13 +86,23 @@ To enable GPU/CUDA acceleration, you need to use the container image built for G - `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda12:latest` - `deployment.env.GPU_LAYERS` is the layer to off loading to GPU. -For example +Only `llama`, `falcon`, `mpt` and `gpt_bigcode`(StarCoder/StarChat) support CUDA. + +#### Llama with CUDA12 + +```sh +helm install llama2-7b-chat-cuda12 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda12.yaml +``` + +Deploys llama2 7b model with 40 layers offloading to GPU. The inference is accelerated by CUDA 12. + +#### StarCoderPlus with CUDA12 ```sh -helm install llama2-7b-chat-cuda11 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda11.yaml +helm install starcoderplus-guanaco-cuda12 ialacol/ialacol -f examples/values/starcoderplus-guanaco-cuda12.yaml ``` -Deploys llama2 7b model with 40 layers offloadind to GPU. The inference is accelerated by CUDA 11. +Deploys [Starcoderplus-Guanaco-GPT4-15B-V1.0 model](https://huggingface.co/LoupGarou/Starcoderplus-Guanaco-GPT4-15B-V1.0) with 40 layers offloading to GPU. The inference is accelerated by CUDA 12. 
### CUDA Driver Issues diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml index 736a142c132da9c97b1acea524ea3ef8972c245a..38c8bb01f8dc7f72804112532c79e7776d33ede6 100644 --- a/charts/ialacol/Chart.yaml +++ b/charts/ialacol/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 -appVersion: 0.9.0 +appVersion: 0.10.0 description: A Helm chart for ialacol name: ialacol type: application -version: 0.9.0 +version: 0.10.0 diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml index 31519fb846dc992772910d8dd39fb66e910fd407..4e16a4a3c4e55d0ca9b315091ffde1cfdbd26b0f 100644 --- a/charts/ialacol/values.yaml +++ b/charts/ialacol/values.yaml @@ -2,7 +2,7 @@ replicas: 1 deployment: image: quay.io/chenhunghan/ialacol:latest - # or use CUDA11 image `ghcr.io/chenhunghan/ialacol-cuda11:latest` + # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest` # env: # DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML # DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin diff --git a/examples/values/llama2-7b-chat-cuda11.yaml b/examples/values/llama2-7b-chat-cuda12.yaml similarity index 91% rename from examples/values/llama2-7b-chat-cuda11.yaml rename to examples/values/llama2-7b-chat-cuda12.yaml index 81dfe01ecd626aa63a0e54bafd2a75c635d1181f..5fb64da584f9caf1e57a9753aa8fe8119a4a1439 100644 --- a/examples/values/llama2-7b-chat-cuda11.yaml +++ b/examples/values/llama2-7b-chat-cuda12.yaml @@ -1,6 +1,6 @@ replicas: 1 deployment: - image: ghcr.io/chenhunghan/ialacol-cuda11:latest + image: ghcr.io/chenhunghan/ialacol-cuda12:latest env: DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin diff --git a/examples/values/starcoderplus-guanaco-cuda12.yaml b/examples/values/starcoderplus-guanaco-cuda12.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74ca5833f0ceced1cd3f4edcb0e1311ea782a71a --- /dev/null +++ b/examples/values/starcoderplus-guanaco-cuda12.yaml @@ -0,0 +1,30 @@ +replicas: 1 
+deployment: + image: ghcr.io/chenhunghan/ialacol-cuda12:latest + env: + DEFAULT_MODEL_HG_REPO_ID: TheBloke/Starcoderplus-Guanaco-GPT4-15B-V1.0-GGML + DEFAULT_MODEL_FILE: starcoderplus-guanaco-gpt4.ggmlv1.q4_0.bin + GPU_LAYERS: 40 +resources: + {} +cache: + persistence: + size: 20Gi + accessModes: + - ReadWriteOnce + storageClassName: ~ +cacheMountPath: /app/cache +model: + persistence: + size: 20Gi + accessModes: + - ReadWriteOnce + storageClassName: ~ +modelMountPath: /app/models +service: + type: ClusterIP + port: 8000 + annotations: {} +nodeSelector: {} +tolerations: [] +affinity: {} diff --git a/get_llm.py b/get_llm.py index b82a73d233721e64bd4cb388bd3b39237589ccff..6159ef51deea688e72670ad7fb9386bee3da6c00 100644 --- a/get_llm.py +++ b/get_llm.py @@ -25,7 +25,7 @@ async def get_llm( or "WizardCoder" in body.model or "minotaur-15" in body.model ): - ctransformer_model_type = "starcoder" + ctransformer_model_type = "gpt_bigcode" if "llama" in body.model: ctransformer_model_type = "llama" if "mpt" in body.model: diff --git a/requirements.txt b/requirements.txt index 8e9d76531253e000c571e335d9b60a729ea8a4c8..1c96282fffda11266fa46a13c75d68a5d5fb4199 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ blake3==0.3.3 certifi==2023.7.22 charset-normalizer==3.1.0 click==8.1.3 -ctransformers==0.2.21 +ctransformers==0.2.22 fastapi==0.95.2 filelock==3.12.0 fsspec==2023.5.0