diff --git a/README.md b/README.md
index 01b4d62ab2de6c186459f6ee26f3736a8ffa93d1..5da10f46d1013d39b30fa5232240376614166b11 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,11 @@ ialacol is inspired by other similar projects like [LocalAI](https://github.com/
 
 ## Features
 
-- Compatibility with OpenAI APIs, allowing you to use any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
+- Compatibility with OpenAI APIs, so OpenAI clients and frameworks such as [langchain](https://github.com/hwchase17/langchain) work out of the box.
 - Lightweight, easy deployment on Kubernetes clusters with a 1-click Helm installation.
 - Streaming first! For better UX.
 - Optional CUDA acceleration.
+- Compatible with the [GitHub Copilot VSCode extension](https://marketplace.visualstudio.com/items?itemName=GitHub.copilot), see [Copilot](#copilot).
 
 ## Supported Models
 
@@ -96,6 +97,17 @@ docker run --rm -it -p 8000:8000 \
 
 For developers/contributors
 
+##### Python
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+python3 -m pip install -r requirements.txt
+DEFAULT_MODEL_HG_REPO_ID="TheBloke/stablecode-completion-alpha-3b-4k-GGML" DEFAULT_MODEL_FILE="stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin" LOGGING_LEVEL="DEBUG" THREADS=4 uvicorn main:app --reload --host 0.0.0.0 --port 9999
+```
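+
+Once uvicorn is up, you can run a quick smoke test from another terminal (a minimal sketch, assuming the standard OpenAI-style `/v1/models` route and that `requests` is installed):
+
+```python
+# hypothetical smoke test against the local dev instance started above
+import requests
+
+# should list the model configured via DEFAULT_MODEL_FILE
+print(requests.get("http://localhost:9999/v1/models", timeout=10).json())
+```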
+
+##### Docker
+
 Build image
 
 ```sh
@@ -182,6 +194,46 @@ openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.crea
 
 ## Tips
 
+### Copilot
+
+`ialacol` can serve as a backend for the GitHub Copilot client, since Copilot's completion API is almost identical to the OpenAI completion API.
+
+However, there are a few things to keep in mind:
+
+1. The Copilot client sends a lengthy prompt that includes all the related context for code completion (see [copilot-explorer](https://github.com/thakkarparth007/copilot-explorer)), which puts a heavy load on the server. If you are running `ialacol` locally, set the `TRUNCATE_PROMPT_LENGTH` environment variable to truncate the prompt from the beginning and reduce the workload; a minimal sketch of the truncation behaviour follows this list.
+
+2. Copilot sends requests in parallel. To increase throughput, you probably need a batcher such as [text-inference-batcher](https://github.com/ialacol/text-inference-batcher).
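+
+For illustration, truncating from the beginning keeps the tail of the prompt, i.e. the code closest to the cursor, which is usually the most relevant context for completion. A minimal sketch of the behaviour (illustrative only; the actual implementation is `truncate()` in `truncate.py`):
+
+```python
+# sketch of what TRUNCATE_PROMPT_LENGTH does: keep only the last `length`
+# characters so the model still sees the code nearest the cursor
+def truncate_prompt(prompt: str, length: int) -> str:
+    return prompt[-length:] if len(prompt) > length else prompt
+
+# e.g. with TRUNCATE_PROMPT_LENGTH=100, a several-thousand-character Copilot
+# prompt is reduced to its final 100 characters before reaching the model
+```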
+
+Start two instances of ialacol:
+
+```bash
+gh repo clone chenhunghan/ialacol && cd ialacol && python3 -m venv .venv && source .venv/bin/activate && python3 -m pip install -r requirements.txt
+export LOGGING_LEVEL="DEBUG"
+export THREADS=2
+export DEFAULT_MODEL_HG_REPO_ID="TheBloke/stablecode-completion-alpha-3b-4k-GGML"
+export DEFAULT_MODEL_FILE="stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin"
+export TRUNCATE_PROMPT_LENGTH=100 # optional
+uvicorn main:app --host 0.0.0.0 --port 9998 &
+uvicorn main:app --host 0.0.0.0 --port 9999
+```
+
+Start [tib](https://github.com/ialacol/text-inference-batcher), pointing to upstream ialacol instances.
+
+```bash
+gh repo clone ialacol/text-inference-batcher && cd text-inference-batcher && npm install
+UPSTREAMS="http://localhost:9998,http://localhost:9999" npm start
+```
+
+Configure the VSCode GitHub Copilot extension to use [tib](https://github.com/ialacol/text-inference-batcher).
+
+```json
+"github.copilot.advanced": {
+     "debug.overrideEngine": "stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin",
+     "debug.testOverrideProxyUrl": "http://localhost:8000",
+     "debug.overrideProxyUrl": "http://localhost:8000"
+}
+```
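+
+To verify the whole chain without VSCode, you can send a Copilot-style request through tib (a minimal sketch, assuming tib listens on port 8000 as in the settings above and that `requests` is installed):
+
+```python
+# hypothetical end-to-end check: POST to the /v1/engines/{engine}/completions
+# route (the one the Copilot client uses); tib forwards it to an ialacol upstream
+import requests
+
+resp = requests.post(
+    "http://localhost:8000/v1/engines/stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin/completions",
+    json={"prompt": "def fibonacci(n):", "max_tokens": 32, "stream": False},
+    timeout=120,
+)
+print(resp.json()["choices"][0]["text"])
+```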
+
 ### Creative v.s. Conservative
 
 LLMs are known to be sensitive to parameters, the higher `temperature` leads to more "randomness" hence LLM becomes more "creative", `top_p` and `top_k` also contribute to the "randomness"
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 1c19c57f51a05dde1fa72dfde54e0fe7e83e8566..8cf5f32400f76fb72661a39d31114ebcc7e2f4ec 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.10.4
+appVersion: 0.11.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.4
+version: 0.11.0
diff --git a/get_config.py b/get_config.py
index 94a51ff2877d1e6e61664258e46399b6297bd684..5e4c3f0d149f14c4dcd5f577c69a40d343b63e2f 100644
--- a/get_config.py
+++ b/get_config.py
@@ -7,7 +7,6 @@ from log import log
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
-
 def get_config(
     body: CompletionRequestBody | ChatCompletionRequestBody,
 ) -> Config:
diff --git a/main.py b/main.py
index 9ccbee9db0537b066e813f28dea0cfb61c6cc404..a0daab5fce08260c6d790f8f8f78db1f500d297b 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from log import log
+from truncate import truncate
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
@@ -44,7 +45,8 @@ def set_downloading_model(boolean: bool):
     """
     globals()["DOWNLOADING_MODEL"] = boolean
     log.debug("DOWNLOADING_MODEL set to %s", globals()["DOWNLOADING_MODEL"])
-    
+
+
 def set_loading_model(boolean: bool):
     """_summary_
 
@@ -102,7 +104,11 @@ async def startup_event():
         gpu_layers=GPU_LAYERS,
     )
     model_type = get_model_type(DEFAULT_MODEL_FILE)
-    log.info("Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s", model_type, DEFAULT_MODEL_FILE)
+    log.info(
+        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        model_type,
+        DEFAULT_MODEL_FILE,
+    )
     set_loading_model(True)
     llm = LLM(
         model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
@@ -137,7 +143,6 @@ async def models():
         "object": "list",
     }
 
-
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
@@ -177,6 +182,40 @@ async def completions(
         )
     return model_generate(prompt, model_name, llm, config)
 
+@app.post("/v1/engines/{engine}/completions")
+async def engine_completions(
+    # Can't use a typed body parameter because FastAPI requires the correct content-type header,
+    # but the Copilot client may not send one
+    request: Request,
+    # copilot client ONLY request param
+    engine: str,
+):
+    """_summary_
+        Similar to https://platform.openai.com/docs/api-reference/completions
+        but with engine param and with /v1/engines
+    Args:
+        body (CompletionRequestBody): parsed request body
+    Returns:
+        StreamingResponse: streaming response
+    """
+    if DOWNLOADING_MODEL is True:
+        raise HTTPException(status_code=503, detail="Downloading model")
+    json = await request.json()
+    log.debug("Body:%s", str(json))
+
+    body = CompletionRequestBody(**json, model=engine)
+    prompt = truncate(body.prompt)
+
+    config = get_config(body)
+    llm = request.app.state.llm
+    if body.stream is True:
+        log.debug("Streaming response from %s", engine)
+        return StreamingResponse(
+            completions_streamer(prompt, engine, llm, config),
+            media_type="text/event-stream",
+        )
+    return model_generate(prompt, engine, llm, config)
+
 
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponseBody)
 async def chat_completions(
diff --git a/truncate.py b/truncate.py
new file mode 100644
index 0000000000000000000000000000000000000000..49013a5db22d1acfd6e4710a2bce7512c917bfa4
--- /dev/null
+++ b/truncate.py
@@ -0,0 +1,27 @@
+from get_env import get_env_or_none
+
+
+def truncate(string, beginning=True):
+    """Shorten the given string to TRUNCATE_PROMPT_LENGTH characters.
+
+    :Parameters:
+        string (str) = The string to truncate.
+        beginning (bool) = Trim starting chars, else trim ending chars.
+
+    :Return:
+        (str) The original string if TRUNCATE_PROMPT_LENGTH is unset or the
+        string is short enough, otherwise the truncated string.
+
+    ex. call: truncate('12345678') with TRUNCATE_PROMPT_LENGTH=4
+        returns: '5678'
+    """
+    TRUNCATE_PROMPT_LENGTH = get_env_or_none("TRUNCATE_PROMPT_LENGTH")
+    if TRUNCATE_PROMPT_LENGTH is None:
+        return string
+    length = int(TRUNCATE_PROMPT_LENGTH)
+    if len(string) > length:
+        # trim starting chars, keeping the tail of the string
+        if beginning:
+            string = string[-length:]
+        # trim ending chars, keeping the head of the string
+        else:
+            string = string[:length]
+    return string