diff --git a/README.md b/README.md
index 01b4d62ab2de6c186459f6ee26f3736a8ffa93d1..5da10f46d1013d39b30fa5232240376614166b11 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,11 @@ ialacol is inspired by other similar projects like [LocalAI](https://github.com/
 
 ## Features
 
-- Compatibility with OpenAI APIs, allowing you to use any frameworks that are built on top of OpenAI APIs such as [langchain](https://github.com/hwchase17/langchain).
+- Compatibility with OpenAI APIs, so you can use frameworks built on top of them, such as [langchain](https://github.com/hwchase17/langchain).
 - Lightweight, easy deployment on Kubernetes clusters with a 1-click Helm installation.
 - Streaming first! For better UX.
 - Optional CUDA acceleration.
+- Compatible with the [GitHub Copilot VSCode extension](https://marketplace.visualstudio.com/items?itemName=GitHub.copilot); see [Copilot](#copilot).
 
 ## Supported Models
 
@@ -96,6 +97,17 @@ docker run --rm -it -p 8000:8000 \
 
 For developers/contributors
 
+##### Python
+
+```bash
+python3 -m venv .venv
+source .venv/bin/activate
+python3 -m pip install -r requirements.txt
+DEFAULT_MODEL_HG_REPO_ID="TheBloke/stablecode-completion-alpha-3b-4k-GGML" DEFAULT_MODEL_FILE="stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin" LOGGING_LEVEL="DEBUG" THREAD=4 uvicorn main:app --reload --host 0.0.0.0 --port 9999
+```
+
+##### Docker
+
 Build image
 
 ```sh
@@ -182,6 +194,46 @@ openai -k "sk-fake" -b http://localhost:8000/v1 -vvvvv api chat_completions.crea
 
 ## Tips
 
+### Copilot
+
+`ialacol` can be used as a backend for GitHub Copilot, because Copilot's completion API is almost identical to the OpenAI completion API.
+
+However, a few things need to be kept in mind:
+
+1. The Copilot client sends a lengthy prompt that includes all the related context for code completion (see [copilot-explorer](https://github.com/thakkarparth007/copilot-explorer)), which puts a heavy load on the server. If you are running `ialacol` locally, set the `TRUNCATE_PROMPT_LENGTH` environment variable to truncate the prompt from the beginning and reduce the workload.
+
+2. Copilot sends requests in parallel. To increase throughput, you probably need a load balancer such as [text-inference-batcher](https://github.com/ialacol/text-inference-batcher).
+
+Start two instances of ialacol:
+
+```bash
+gh repo clone chenhunghan/ialacol && cd ialacol && python3 -m venv .venv && source .venv/bin/activate && python3 -m pip install -r requirements.txt
+export LOGGING_LEVEL="DEBUG"
+export THREAD=2
+export DEFAULT_MODEL_HG_REPO_ID="TheBloke/stablecode-completion-alpha-3b-4k-GGML"
+export DEFAULT_MODEL_FILE="stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin"
+export TRUNCATE_PROMPT_LENGTH=100 # optional
+uvicorn main:app --host 0.0.0.0 --port 9998 &
+uvicorn main:app --host 0.0.0.0 --port 9999
+```
+
+Start [tib](https://github.com/ialacol/text-inference-batcher), pointing it at the upstream ialacol instances.
+
+```bash
+gh repo clone ialacol/text-inference-batcher && cd text-inference-batcher && npm install
+UPSTREAMS="http://localhost:9998,http://localhost:9999" npm start
+```
+
+Configure the VSCode GitHub Copilot extension to use [tib](https://github.com/ialacol/text-inference-batcher):
+
+```json
+"github.copilot.advanced": {
+  "debug.overrideEngine": "stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin",
+  "debug.testOverrideProxyUrl": "http://localhost:8000",
+  "debug.overrideProxyUrl": "http://localhost:8000"
+}
+```
+
 ### Creative v.s. Conservative
 
 LLMs are known to be sensitive to parameters, the higher `temperature` leads to more "randomness" hence LLM becomes more "creative", `top_p` and `top_k` also contribute to the "randomness"
diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index 1c19c57f51a05dde1fa72dfde54e0fe7e83e8566..8cf5f32400f76fb72661a39d31114ebcc7e2f4ec 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.10.4
+appVersion: 0.11.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.10.4
+version: 0.11.0
diff --git a/get_config.py b/get_config.py
index 94a51ff2877d1e6e61664258e46399b6297bd684..5e4c3f0d149f14c4dcd5f577c69a40d343b63e2f 100644
--- a/get_config.py
+++ b/get_config.py
@@ -7,7 +7,6 @@ from log import log
 
 THREADS = int(get_env("THREADS", str(get_default_thread())))
 
-
 def get_config(
     body: CompletionRequestBody | ChatCompletionRequestBody,
 ) -> Config:
diff --git a/main.py b/main.py
index 9ccbee9db0537b066e813f28dea0cfb61c6cc404..a0daab5fce08260c6d790f8f8f78db1f500d297b 100644
--- a/main.py
+++ b/main.py
@@ -23,6 +23,7 @@ from streamers import chat_completions_streamer, completions_streamer
 from model_generate import chat_model_generate, model_generate
 from get_env import get_env
 from log import log
+from truncate import truncate
 
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
 )
@@ -44,7 +45,8 @@ def set_downloading_model(boolean: bool):
     """
     globals()["DOWNLOADING_MODEL"] = boolean
     log.debug("DOWNLOADING_MODEL set to %s", globals()["DOWNLOADING_MODEL"])
-
+
+
 def set_loading_model(boolean: bool):
     """_summary_
@@ -102,7 +104,11 @@ async def startup_event():
         gpu_layers=GPU_LAYERS,
     )
     model_type = get_model_type(DEFAULT_MODEL_FILE)
-    log.info("Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s", model_type, DEFAULT_MODEL_FILE)
+    log.info(
+        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        model_type,
+        DEFAULT_MODEL_FILE,
+    )
     set_loading_model(True)
     llm = LLM(
         model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
@@ -137,7 +143,6 @@ async def models():
         "object": "list",
     }
 
-
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
@@ -177,6 +182,40 @@
         )
     return model_generate(prompt, model_name, llm, config)
 
+@app.post("/v1/engines/{engine}/completions")
+async def engine_completions(
+    # Can't use a typed body param here because FastAPI requires the correct
+    # content-type header, which the Copilot client may not send.
+    request: Request,
+    # engine is the only request param the Copilot client sends
+    engine: str,
+):
+    """Similar to https://platform.openai.com/docs/api-reference/completions
+    but mounted under /v1/engines with an engine path param.
+    Args:
+        request (Request): the raw request (the body is parsed manually)
+        engine (str): the model name used as the engine
+    Returns:
+        StreamingResponse: streaming response
+    """
+    if DOWNLOADING_MODEL is True:
+        raise HTTPException(status_code=503, detail="Downloading model")
+    json = await request.json()
+    log.debug("Body:%s", str(json))
+
+    body = CompletionRequestBody(**json, model=engine)
+    prompt = truncate(body.prompt)
+
+    config = get_config(body)
+    llm = request.app.state.llm
+    if body.stream is True:
+        log.debug("Streaming response from %s", engine)
+        return StreamingResponse(
+            completions_streamer(prompt, engine, llm, config),
+            media_type="text/event-stream",
+        )
+    return model_generate(prompt, engine, llm, config)
+
 
 @app.post("/v1/chat/completions", response_model=ChatCompletionResponseBody)
 async def chat_completions(
diff --git a/truncate.py b/truncate.py
new file mode 100644
index 0000000000000000000000000000000000000000..49013a5db22d1acfd6e4710a2bce7512c917bfa4
--- /dev/null
+++ b/truncate.py
@@ -0,0 +1,27 @@
+from get_env import get_env_or_none
+
+def truncate(string, beginning=True):
+    """Truncate the given string to TRUNCATE_PROMPT_LENGTH characters.
+
+    The maximum length is read from the TRUNCATE_PROMPT_LENGTH environment
+    variable; if it is not set, the string is returned unchanged.
+
+    :Parameters:
+        string (str) = The string to truncate.
+        beginning (bool) = Trim starting chars, else trim ending chars.
+
+    :Return:
+        (str) e.g. with TRUNCATE_PROMPT_LENGTH=4, truncate('12345678') returns '5678'
+    """
+    TRUNCATE_PROMPT_LENGTH = get_env_or_none("TRUNCATE_PROMPT_LENGTH")
+    if TRUNCATE_PROMPT_LENGTH is None:
+        return string
+    length = int(TRUNCATE_PROMPT_LENGTH)
+    if len(string) > length:
+        # trim starting chars, keeping the last `length` chars
+        if beginning:
+            string = string[-length:]
+        # trim ending chars, keeping the first `length` chars
+        else:
+            string = string[:length]
+    return string
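
For reference, a minimal sketch (standard library only) of exercising the new `/v1/engines/{engine}/completions` endpoint, assuming an ialacol instance like the one in the README example above is listening on port 9998 with the stablecode model file loaded; the port, engine name, and prompt are illustrative, not part of the patch:

```python
import json
import urllib.request

# Illustrative values: match these to your running ialacol instance.
ENGINE = "stablecode-completion-alpha-3b-4k.ggmlv1.q4_0.bin"
URL = f"http://localhost:9998/v1/engines/{ENGINE}/completions"

# The Copilot client sends a much longer prompt; when TRUNCATE_PROMPT_LENGTH
# is set, the server truncates it from the beginning (see truncate.py above).
payload = {"prompt": "def fibonacci(n):", "stream": False}

request = urllib.request.Request(
    URL,
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(request) as response:
    print(json.loads(response.read().decode("utf-8")))
```

The response follows the OpenAI completions schema, which is why the Copilot extension (pointed at tib or directly at an instance) can consume it.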