diff --git a/.github/workflows/cuda_image.yaml b/.github/workflows/cuda_image.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a8e3c97b174cc451545c05554dce3c4187a8fe7e
--- /dev/null
+++ b/.github/workflows/cuda_image.yaml
@@ -0,0 +1,75 @@
+name: Build and Push CUDA Images to GitHub Container Registry
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - '**.py'
+      - '!examples/**'
+      - 'requirements.txt'
+      - 'Dockerfile.cuda11'
+      - 'Dockerfile.cuda12'
+      - '.github/workflows/cuda_image.yaml'
+
+env:
+  REGISTRY: ghcr.io
+  CUDA_11_IMAGE_NAME: ialacol-cuda11
+  CUDA_12_IMAGE_NAME: ialacol-cuda12
+jobs:
+  cuda11_image_to_gcr:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_11_IMAGE_NAME }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.cuda11
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+  cuda12_image_to_gcr:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ${{ env.REGISTRY }}/${{ github.repository_owner }}/${{ env.CUDA_12_IMAGE_NAME }}
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.cuda12
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile.cuda11 b/Dockerfile.cuda11
new file mode 100644
index 0000000000000000000000000000000000000000..c0e2f1aec9766899106016269e1a7228621e84de
--- /dev/null
+++ b/Dockerfile.cuda11
@@ -0,0 +1,13 @@
+# syntax=docker/dockerfile:1
+
+# CUDA 11.7.1, matching https://github.com/ggerganov/llama.cpp/blob/master/.devops/main-cuda.Dockerfile
+FROM nvidia/cuda:11.7.1-base-ubuntu22.04
+RUN apt-get update && apt-get install -y -q python3 python3-pip
+WORKDIR /app
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# https://github.com/marella/ctransformers#cuda
+RUN pip3 install ctransformers[cuda]
+COPY . .
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/Dockerfile.cuda12 b/Dockerfile.cuda12
new file mode 100644
index 0000000000000000000000000000000000000000..7d6767e71be4062544508a44c488d30deacc18dc
--- /dev/null
+++ b/Dockerfile.cuda12
@@ -0,0 +1,12 @@
+# syntax=docker/dockerfile:1
+
+FROM nvidia/cuda:12-base-ubuntu22.04
+RUN apt-get update && apt-get install -y -q python3 python3-pip
+WORKDIR /app
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+# https://github.com/marella/ctransformers#cuda
+RUN pip3 install ctransformers[cuda]
+COPY . .
+EXPOSE 8000
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index 5032fecfac6b350a49d2b70f051229336d58d175..a4bcb9a92075f01dd9a787c2c5f40a7d3d7d2ed1 100644
--- a/README.md
+++ b/README.md
@@ -25,10 +25,10 @@ And all LLMs supported by [ctransformers](https://github.com/marella/ctransforme
 
 ## Features
 
-- Compatibility with OpenAI APIs, allowing you to use OpenAI's Python client or any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
+- Compatibility with OpenAI APIs, allowing you to use any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
 - Lightweight, easy deployment on Kubernetes clusters with a 1-click Helm installation.
-- Support for various commercially usable models.
 - Streaming first! For better UX.
+- Optional CUDA acceleration.
 
 ## Quick Start
 
@@ -63,6 +63,28 @@ Alternatively, using OpenAI's client library (see more examples in the `examples
 openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.create -m llama-2-7b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
 ```
 
+## GPU Acceleration
+
+To enable GPU/CUDA acceleration, use the container image built for GPU and set the `GPU_LAYERS` environment variable. The best value for `GPU_LAYERS` is determined by the size of your GPU memory; see the PR/discussion in [llama.cpp](https://github.com/ggerganov/llama.cpp/pull/1412) to find the best value for your hardware.
+
+### CUDA 11
+
+- `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda11:latest`
+- `deployment.env.GPU_LAYERS` is the number of layers to offload to the GPU.
+
+### CUDA 12
+
+- `deployment.image` = `ghcr.io/chenhunghan/ialacol-cuda12:latest`
+- `deployment.env.GPU_LAYERS` is the number of layers to offload to the GPU.
+
+For example:
+
+```sh
+helm install llama2-7b-chat-cuda11 ialacol/ialacol -f examples/values/llama2-7b-chat-cuda11.yaml
+```
+
+This deploys the Llama 2 7B Chat model with 40 layers offloaded to the GPU, with inference accelerated by CUDA 11.
+
 ## Tips
 
 ### Creative v.s. Conservative
@@ -94,7 +116,7 @@ curl -X POST \
 - StarCoder <https://huggingface.co/TheBloke/starcoder-GGML>
 - StarCoderPlus <https://huggingface.co/TheBloke/starcoderplus-GGML>
 - [x] Mimic restof OpenAI API, including `GET /models` and `POST /completions`
-- [ ] GPU acceleration (CUDA/METAL)
+- [x] GPU acceleration (CUDA)
 - [ ] Support `POST /embeddings` backed by huggingface Apache-2.0 embedding models such as [Sentence Transformers](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [hkunlp/instructor](https://huggingface.co/hkunlp/instructor-large)
 - [ ] Suuport Apache-2.0 [fastchat-t5-3b](https://huggingface.co/lmsys/fastchat-t5-3b-v1.0)
 - [ ] Support more Apache-2.0 models such as [codet5p](https://huggingface.co/Salesforce/codet5p-16b) and others listed [here](https://github.com/eugeneyan/open-llms)
@@ -135,7 +157,7 @@ helm install llama2-70b-chat ialacol/ialacol -f examples/values/llama2-70b-chat.
 
 ### OpenLM Research's OpenLLaMA Models
 
-Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml).
+Deploy [OpenLLaMA 7B](https://github.com/openlm-research/open_llama) model quantized by [rustformers](https://huggingface.co/rustformers/open-llama-ggml). ℹ️ This is a base model, likely only useful for text completion.
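
As a quick way to exercise a CUDA deployment from Python, the OpenAI client can be pointed at the service, mirroring the `openai` CLI example in the README above. This is a minimal sketch and not part of the diff; it assumes the pre-1.0 `openai` Python package and that the service has been port-forwarded to `localhost:8000`.

```python
# Smoke test for a CUDA-accelerated ialacol deployment (illustrative only).
# Assumes `pip install "openai<1.0"` and the service reachable on localhost:8000.
import openai

openai.api_key = "sk-fake"  # placeholder key, matching the CLI example in the README
openai.api_base = "http://localhost:8000/v1"

response = openai.ChatCompletion.create(
    model="llama-2-7b-chat.ggmlv3.q4_0.bin",
    messages=[{"role": "user", "content": "Hello world!"}],
)
print(response["choices"][0]["message"]["content"])
```
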
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 382e4cd00a519ace45df53663959412d76c4d8ac..87542e27b4529460577adead5c56f58a31d6c694 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.6.2
+appVersion: 0.7.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.6.3
+version: 0.7.0
diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml
index d94f6d8e2be635261c4a948d46203f9c5baade01..31519fb846dc992772910d8dd39fb66e910fd407 100644
--- a/charts/ialacol/values.yaml
+++ b/charts/ialacol/values.yaml
@@ -2,6 +2,7 @@ replicas: 1
 deployment:
   image: quay.io/chenhunghan/ialacol:latest
+  # or use the CUDA 11 image `ghcr.io/chenhunghan/ialacol-cuda11:latest`
   # env:
   #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
   #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
diff --git a/examples/values/llama2-7b-chat-cuda11.yaml b/examples/values/llama2-7b-chat-cuda11.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..81dfe01ecd626aa63a0e54bafd2a75c635d1181f
--- /dev/null
+++ b/examples/values/llama2-7b-chat-cuda11.yaml
@@ -0,0 +1,30 @@
+replicas: 1
+deployment:
+  image: ghcr.io/chenhunghan/ialacol-cuda11:latest
+  env:
+    DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
+    DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
+    GPU_LAYERS: 40
+resources:
+  {}
+cache:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+cacheMountPath: /app/cache
+model:
+  persistence:
+    size: 5Gi
+    accessModes:
+      - ReadWriteOnce
+    storageClassName: ~
+modelMountPath: /app/models
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+nodeSelector: {}
+tolerations: []
+affinity: {}
diff --git a/get_config.py b/get_config.py
index 1ddab19944709446aa5738c1b19fecf13c18a311..eeda6e16d4d590a98b5ef7cd4b16d2f3149a0d19 100644
--- a/get_config.py
+++ b/get_config.py
@@ -37,6 +37,8 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
     STOP = get_env_or_none("STOP")
     # ggml only, follow ctransformers defaults
     CONTEXT_LENGTH = int(get_env("CONTEXT_LENGTH", "-1"))
+    # the number of layers to offload to the GPU
+    GPU_LAYERS = int(get_env("GPU_LAYERS", "0"))
 
     log.info("TOP_K: %s", TOP_K)
     log.info("TOP_P: %s", TOP_P)
@@ -49,6 +51,7 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
     log.info("MAX_TOKENS: %s", MAX_TOKENS)
     log.info("STOP: %s", STOP)
     log.info("CONTEXT_LENGTH: %s", CONTEXT_LENGTH)
+    log.info("GPU_LAYERS: %s", GPU_LAYERS)
 
     config = Config(
         top_k=body.top_k if body.top_k else TOP_K,
@@ -62,5 +65,6 @@ def get_config(body: CompletionRequestBody | ChatCompletionRequestBody) -> Confi
         max_new_tokens=body.max_tokens if body.max_tokens else MAX_TOKENS,
         stop=body.stop if body.stop else STOP,
         context_length=CONTEXT_LENGTH,
+        gpu_layers=GPU_LAYERS,
     )
     return config
diff --git a/get_llm.py b/get_llm.py
index f112c98aeb4fbfa8e4b4c23f19b98e2ac5f442eb..b82a73d233721e64bd4cb388bd3b39237589ccff 100644
--- a/get_llm.py
+++ b/get_llm.py
@@ -1,7 +1,7 @@
-from ctransformers import LLM, AutoModelForCausalLM
+from ctransformers import LLM
 from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env
-
+from get_config import get_config
 
 async def get_llm(
     body: ChatCompletionRequestBody | CompletionRequestBody,
@@ -38,12 +38,14 @@ async def get_llm(
         ctransformer_model_type = "dolly-v2"
     if "stablelm" in body.model:
         ctransformer_model_type = "gpt_neox"
-
+    config = get_config(body)
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
         ctransformer_model_type = MODE_TYPE
 
     MODELS_FOLDER = get_env("MODELS_FOLDER", "models")
-    return AutoModelForCausalLM.from_pretrained(
-        f"./{MODELS_FOLDER}/{body.model}", model_type=ctransformer_model_type
+    return LLM(
+        model_path=f"./{MODELS_FOLDER}/{body.model}",
+        model_type=ctransformer_model_type,
+        config=config,
     )
diff --git a/model_generate.py b/model_generate.py
index 510bd2bb5c0a35b1286533c31bdb8e18410142db..f6412c37e7d12d7926fc2652a6ece0f3e9e16cb1 100644
--- a/model_generate.py
+++ b/model_generate.py
@@ -33,6 +33,8 @@ def model_generate(
     log.debug("batch_size: %s", batch_size)
     threads = config.threads
     log.debug("thread: %s", threads)
+    gpu_layers = config.gpu_layers
+    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
@@ -95,6 +97,8 @@ def chat_model_generate(
     log.debug("batch_size: %s", batch_size)
     threads = config.threads
     log.debug("thread: %s", threads)
+    gpu_layers = config.gpu_layers
+    log.debug("gpu_layers: %s", gpu_layers)
     log.debug("prompt: %s", prompt)
 
     log.debug("Getting from ctransformer instance")
diff --git a/requirements.txt b/requirements.txt
index 2de892e6efab84364a763359a7fef7a830ca524f..8e9d76531253e000c571e335d9b60a729ea8a4c8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ blake3==0.3.3
 certifi==2023.7.22
 charset-normalizer==3.1.0
 click==8.1.3
-ctransformers==0.2.20
+ctransformers==0.2.21
 fastapi==0.95.2
 filelock==3.12.0
 fsspec==2023.5.0
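
Below is a minimal sketch (not part of the diff) of how the new `GPU_LAYERS` setting flows into ctransformers after these changes: `get_config.py` reads the environment variable into the ctransformers `Config`, and `get_llm.py` hands that `Config` to `LLM`. The model path, model type, and prompt are placeholders.

```python
# Illustrative only: the GPU_LAYERS flow introduced in this diff, condensed into one script.
import os

from ctransformers import LLM, Config

# get_config.py: read GPU_LAYERS from the environment, defaulting to 0 (CPU-only).
gpu_layers = int(os.environ.get("GPU_LAYERS", "0"))
config = Config(
    context_length=-1,      # ctransformers default, as in get_config.py
    gpu_layers=gpu_layers,  # layers offloaded to the GPU when > 0
)

# get_llm.py: construct the model with the Config instead of AutoModelForCausalLM.
llm = LLM(
    model_path="./models/llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder path
    model_type="llama",
    config=config,
)

print(llm("Hello world!"))
```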