diff --git a/.github/workflows/cuda_image.yaml b/.github/workflows/cuda_image.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a8e3c97b174cc451545c05554dce3c4187a8fe7e
--- /dev/null
+++ b/.github/workflows/cuda_image.yaml
@@ -0,0 +1,75 @@
+name: Build and Push CUDA Images to GitHub Container Registry
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '**.py'
+      - '!examples/**'
+      - 'requirements.txt'
+      - 'Dockerfile.cuda11'
+      - 'Dockerfile.cuda12'
+      - '.github/workflows/cuda_image.yaml'
+
+env:
+  REGISTRY: ghcr.io
+  CUDA_11_IMAGE_NAME: ialacol-cuda11
+  CUDA_12_IMAGE_NAME: ialacol-cuda12
+jobs:
+  cuda11_image_to_gcr:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_11_IMAGE_NAME }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.cuda11
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+  cuda12_image_to_gcr:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_12_IMAGE_NAME }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.cuda12
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile.cuda11 b/Dockerfile.cuda11
new file mode 100644
index 0000000000000000000000000000000000000000..c0e2f1aec9766899106016269e1a7228621e84de
--- /dev/null
+++ b/Dockerfile.cuda11
@@ -0,0 +1,13 @@
+# syntax=docker/dockerfile:1
+
+# CUDA 11.7.1, matching https://github.com/ggerganov/llama.cpp/blob/master/.devops/main-cuda.Dockerfile
+FROM nvidia/cuda:11.7.1-base-ubuntu22.04
+RUN apt-get update && apt-get install -y -q python3 python3-pip
+WORKDIR /app
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# https://github.com/marella/ctransformers#cuda
+RUN pip3 install ctransformers[cuda]
+COPY . .
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/Dockerfile.cuda12 b/Dockerfile.cuda12
new file mode 100644
index 0000000000000000000000000000000000000000..7d6767e71be4062544508a44c488d30deacc18dc
--- /dev/null
+++ b/Dockerfile.cuda12
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+
+FROM nvidia/cuda:12-base-ubuntu22.04
+RUN apt-get update && apt-get install -y -q python3 python3-pip
+WORKDIR /app
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# https://github.com/marella/ctransformers#cuda
+RUN pip3 install ctransformers[cuda]
+COPY . .
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index 5032fecfac6b350a49d2b70f051229336d58d175..a4bcb9a92075f01dd9a787c2c5f40a7d3d7d2ed1 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ And all LLMs supported by [ctransformers](https://github.com/marella/ctransforme
 
 ## Features
 
-- Compatibility with OpenAI APIs, allowing you to use OpenAI's Python client or any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
+- Compatibility with OpenAI APIs, allowing you to use any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
 - Lightweight, easy deployment on Kubernetes clusters with a 1-click Helm installation.
-- Support for various commercially usable models.
 - Streaming first! For better UX.
+- Optional CUDA acceleration.
 
 ## Quick Start
 
@@ -63,6 +63,28 @@ Alternatively, using OpenAI's client library (see more examples in the `examples
 openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m llama-2-7b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
 ```
 
+## GPU Acceleration
+
+To enable GPU/CUDA acceleration, use the container image built for GPU and set the `GPU_LAYERS` environment variable. The best value for `GPU_LAYERS` is determined by the size of your GPU memory; see the PR/discussion in [llama.cpp](https://github.com/ggerganov/llama.cpp/pull/1412) to find the best value for your hardware.
+
+### CUDA 11
+
+- `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda11:latest`
+- `deployment.env.GPU_LAYERS` is the number of layers to offload to the GPU.
+
+### CUDA 12
+
+- `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda12:latest`
+- `deployment.env.GPU_LAYERS` is the number of layers to offload to the GPU.
+
+For example:
+
+```sh
+helm install llama2-7b-chat-cuda11 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda11.yaml
+```
+
+This deploys the Llama 2 7B Chat model with 40 layers offloaded to the GPU, with inference accelerated by CUDA 11.
+
 ## Tips
 
 ### Creative v.s. Conservative
@@ -94,7 +116,7 @@ curl -X POST \
 - StarCoder <https://huggingface.co/TheBloke/starcoder-GGML>
 - StarCoderPlus <https://huggingface.co/TheBloke/starcoderplus-GGML>
 - [x] Mimic restof OpenAI API, including `GET /models` and `POST /completions`
-- [ ] GPU acceleration (CUDA/METAL)
+- [x] GPU acceleration (CUDA)
 - [ ] Support `POST /embeddings` backed by huggingface Apache-2.0 embedding models such as [Sentence Transformers](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [hkunlp/instructor](https://huggingface.co/hkunlp/instructor-large)
 - [ ] Suuport Apache-2.0 [fastchat-t5-3b](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0)
 - [ ] Support more Apache-2.0 models such as [codet5p](https://huggingface.co/Salesforce/codet5p-16b) and others listed [here](https://github.com/eugeneyan/open-llms)
@@ -135,7 +157,7 @@ helm install llama2-70b-chat ialacol/ialacol -f examples/values/llama2-70b-chat.
 
 ### OpenLM Research's OpenLLaMA Models
 
-Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml).
+Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml). ℹ️ This is a base model, likely only useful for text completion.
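
As a quick way to exercise a CUDA deployment from Python, the OpenAI client can be pointed at the service, mirroring the `openai` CLI example in the README above. This is a minimal sketch and not part of the diff; it assumes the pre-1.0 `openai` Python package and that the service has been port-forwarded to `localhost:8000`.

```python
# Smoke test for a CUDA-accelerated ialacol deployment (illustrative only).
# Assumes `pip install "openai<1.0"` and the service reachable on localhost:8000.
import openai

openai.api_key = "sk-fake"  # placeholder key, matching the CLI example in the README
openai.api_base = "http://localhost:8000/v1"

response = openai.ChatCompletion.create(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",
    messages=[{"role": "user", "content": "Hello world!"}],
)
print(response["choices"][0]["message"]["content"])
```
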
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 382e4cd00a519ace45df53663959412d76c4d8ac..87542e27b4529460577adead5c56f58a31d6c694 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.6.2
+appVersion: 0.7.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.6.3
+version: 0.7.0
diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml
index d94f6d8e2be635261c4a948d46203f9c5baade01..31519fb846dc992772910d8dd39fb66e910fd407 100644
--- a/charts/ialacol/values.yaml
+++ b/charts/ialacol/values.yaml
@@ -2,6 +2,7 @@ replicas: 1
 deployment:
   image: quay.io/chenhunghan/ialacol:latest
+  # or use the CUDA 11 image `ghcr.io/chenhunghan/ialacol-cuda11:latest`
   # env:
   #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
   #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/llama2-7b-chat-cuda11.yaml b/examples/values/llama2-7b-chat-cuda11.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81dfe01ecd626aa63a0e54bafd2a75c635d1181f
--- /dev/null
+++ b/examples/values/llama2-7b-chat-cuda11.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda11:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
+    GPU_LAYERS: 40
+resources:
+  {}
+cache:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_config.py b/get_config.py
index 1ddab19944709446aa5738c1b19fecf13c18a311..eeda6e16d4d590a98b5ef7cd4b16d2f3149a0d19 100644
--- a/get_config.py
+++ b/get_config.py
@@ -37,6 +37,8 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
     STOP = get_env_or_none("STOP")
     # ggml only, follow ctransformers defaults
     CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    # the number of layers to offload to the GPU
+    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
 
     log.info("TOP_K: %s", TOP_K)
     log.info("TOP_P: %s", TOP_P)
@@ -49,6 +51,7 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
     log.info("MAX_TOKENS: %s", MAX_TOKENS)
     log.info("STOP: %s", STOP)
     log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.info("GPU_LAYERS: %s", GPU_LAYERS)
 
     config = Config(
         top_k=body.top_k if body.top_k else TOP_K,
@@ -62,5 +65,6 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
         max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
         stop=body.stop if body.stop else STOP,
         context_length=CONTEXT_LENGTH,
+        gpu_layers=GPU_LAYERS,
     )
     return config
diff --git a/get_llm.py b/get_llm.py
index f112c98aeb4fbfa8e4b4c23f19b98e2ac5f442eb..b82a73d233721e64bd4cb388bd3b39237589ccff 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -1,7 +1,7 @@
-from ctransformers import LLM, AutoModelForCausalLM
+from ctransformers import LLM
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env
-
+from get_config import get_config
 
 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -38,12 +38,14 @@ async def get_llm(
         ctransformer_model_type = "dolly-v2"
     if "stablelm" in body.model:
         ctransformer_model_type = "gpt_neox"
-
+    config = get_config(body)
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
         ctransformer_model_type = MODE_TYPE
 
     MODELS_FOLDER = get_env("MODELS_FOLDER", "models")
-    return AutoModelForCausalLM.from_pretrained(
-        f"./{MODELS_FOLDER}/{body.model}", model_type=ctransformer_model_type
+    return LLM(
+        model_path=f"./{MODELS_FOLDER}/{body.model}",
+        model_type=ctransformer_model_type,
+        config=config,
     )
diff --git a/model_generate.py b/model_generate.py
index 510bd2bb5c0a35b1286533c31bdb8e18410142db..f6412c37e7d12d7926fc2652a6ece0f3e9e16cb1 100644
--- a/model_generate.py
+++ b/model_generate.py
@@ -33,6 +33,8 @@ def model_generate(
     log.debug("batch_size: %s", batch_size)
     threads = config.threads
     log.debug("thread: %s", threads)
+    gpu_layers = config.gpu_layers
+    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
@@ -95,6 +97,8 @@ def chat_model_generate(
     log.debug("batch_size: %s", batch_size)
     threads = config.threads
     log.debug("thread: %s", threads)
+    gpu_layers = config.gpu_layers
+    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
diff --git a/requirements.txt b/requirements.txt
index 2de892e6efab84364a763359a7fef7a830ca524f..8e9d76531253e000c571e335d9b60a729ea8a4c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.20
+ctransformers==0.2.21
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
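
Below is a minimal sketch (not part of the diff) of how the new `GPU_LAYERS` setting flows into ctransformers after these changes: `get_config.py` reads the environment variable into the ctransformers `Config`, and `get_llm.py` hands that `Config` to `LLM`. The model path, model type, and prompt are placeholders.

```python
# Illustrative only: the GPU_LAYERS flow introduced in this diff, condensed into one script.
import os

from ctransformers import LLM, Config

# get_config.py: read GPU_LAYERS from the environment, defaulting to 0 (CPU-only).
gpu_layers = int(os.environ.get("GPU_LAYERS", "0"))
config = Config(
    context_length=-1,      # ctransformers default, as in get_config.py
    gpu_layers=gpu_layers,  # layers offloaded to the GPU when > 0
)

# get_llm.py: construct the model with the Config instead of AutoModelForCausalLM.
llm = LLM(
    model_path="./models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    model_type="llama",
    config=config,
)

print(llm("Hello world!"))
```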