diff --git a/.github/workflows/smoke_test.yaml b/.github/workflows/smoke_test.yaml
index 3a4414a2bfa823fe6616246d86a4cb6d9c41f986..fe465e9a3763e3eb58611625e922bf74ac7d623c 100644
--- a/.github/workflows/smoke_test.yaml
+++ b/.github/workflows/smoke_test.yaml
@@ -1,14 +1,12 @@
 name: Smoke Test

-on:
-  pull_request:
-    branches:
-      - main
+on: pull_request

 env:
   REGISTRY: quay.io
   REPO_ORG_NAME: ialacol
   IMAGE_NAME: ialacol-smoke-test
+  GPTQ_IMAGE_TAG: gptq
   HELM_NAMESPACE: default
   LOGGING_LEVEL: DEBUG
   # for testing llama base models
@@ -26,6 +24,12 @@ env:
   STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML
   STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin
   STARCODER_SVC_PORT: 8002
+  # for testing gptq models
+  GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq
+  GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ
+  GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True
+  GPTQ_MODEL_FILE: model.safetensors
+  GPTQ_SVC_PORT: 8003

 jobs:
   build-image:
@@ -36,7 +40,7 @@ jobs:
         with:
           fetch-depth: 0

-      - name: Login to Github Container Registry
+      - name: Login to Registry
         uses: docker/login-action@v2
         with:
           registry: ${{ env.REGISTRY }}
@@ -51,6 +55,29 @@ jobs:
           push: true
           tags: |
             ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
+  build-gptq-cuda12-image:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Login to Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
+          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile.cuda12
+          push: true
+          tags: |
+            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
   llama-smoke-test:
     runs-on: ubuntu-latest
     needs: build-image
@@ -274,3 +301,79 @@ jobs:
       - if: always()
         run: |
           kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
+  gptq-smoke-test:
+    runs-on: ubuntu-latest
+    needs: build-gptq-cuda12-image
+    steps:
+      - name: Create k8s Kind Cluster
+        uses: helm/kind-action@v1.7.0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.0
+
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Install ialacol with GPTQ model from a revision and wait for pods to be ready
+        run: |
+          cat > values.yaml <<EOF
+          replicas: 1
+          deployment:
+            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
+            env:
+              DEFAULT_MODEL_HG_REPO_ID: $GPTQ_MODEL_HG_REPO_ID
+              DEFAULT_MODEL_HG_REPO_REVISION: $GPTQ_MODEL_HG_REVISION
+              DEFAULT_MODEL_FILE: $GPTQ_MODEL_FILE
+              MODEL_TYPE: "gptq"
+              LOGGING_LEVEL: $LOGGING_LEVEL
+          resources:
+            {}
+          model:
+            persistence:
+              size: 3Gi
+              accessModes:
+                - ReadWriteOnce
+          service:
+            type: ClusterIP
+            port: $GPTQ_SVC_PORT
+          annotations: {}
+          nodeSelector: {}
+          tolerations: []
+          affinity: {}
+          EOF
+          helm install $GPTQ_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
+
+          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
+          sleep 40
+      - if: always()
+        run: |
+          kubectl get pods -n $HELM_NAMESPACE
+      - if: always()
+        run: |
+          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
+      - name: Port forward to the GPTQ model service
+        run: |
+          kubectl port-forward svc/$GPTQ_HELM_RELEASE_NAME $GPTQ_SVC_PORT:$GPTQ_SVC_PORT &
+          echo "Wait for port-forward to be ready"
+          sleep 5
+      - name: Check the GET /v1/models endpoint
+        run: |
+          curl http://localhost:$GPTQ_SVC_PORT/v1/models
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Install OpenAI CLI
+        run: |
+          pip install --upgrade openai --quiet
+      # We can only test if download works and if GET /models returns something on CPU CI workers
+      - name: Test the OpenAI CLI with default parameters
+        run: |
+          openai -k "sk-fake" -b http://localhost:$GPTQ_SVC_PORT/v1 api models.list
+      - if: always()
+        run: |
+          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
\ No newline at end of file
diff --git a/README.md b/README.md
index dc243c89c970b3958c97344fb866b30d5a046328..f88eeaf2dd0512aea28809c7d07ccae7107edd06 100644
--- a/README.md
+++ b/README.md
@@ -85,24 +85,25 @@ openai -k "sk-fake" \

 All configuration is done via environmental variable.

-| Parameter | Description | Default | Example |
-| :----------------------------| :------------------------------------------------------------------- | :------ | :--------------------------------------------------------------------------- |
-| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id to download the model | `None` | `TheBloke/orca_mini_3B-GGML` |
-| `DEFAULT_MODEL_FILE` | The file name to download from the repo, optional for GPTQ models | `None` | `orca-mini-3b.ggmlv3.q4_0.bin` |
-| `MODE_TYPE` | Model type to override the auto model type detection | `None` | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox` `gptj` |
-| `LOGGING_LEVEL` | Logging level | `INFO` | `DEBUG` |
-| `TOP_K` | top-k for sampling. | `40 ` | Integers |
-| `TOP_P` | top-p for sampling. | `1.0` | Floats |
-| `REPETITION_PENALTY` | rp for sampling. | `1.1` | Floats |
-| `LAST_N_TOKENS` | The last n tokens for repetition penalty. | `1.1` | Integers |
-| `SEED` | The seed for sampling. | `-1` | Integers |
-| `BATCH_SIZE` | The batch size for evaluating tokens, only for GGUF/GGML models | `8` | Integers |
-| `THREADS` | Thread number override auto detect by CPU/2, set `1` for GPTQ models | `Auto` | Integers |
-| `MAX_TOKENS` | The max number of token to generate | `512` | Integers |
-| `STOP` | The token to stop the generation | `None` | `<|endoftext>` |
-| `CONTEXT_LENGTH` | Override the auto detect context length | `512` | Integers |
-| `GPU_LAYERS` | The number of layers to off load to GPU | `0` | Integers |
-| `TRUNCATE_PROMPT_LENGTH` | Truncate the prompt if set | `0` | Integers |
+| Parameter | Description | Default | Example |
+| :----------------------------------| :------------------------------------------------------------------- | :------ | :--------------------------------------------------------------------------- |
+| `DEFAULT_MODEL_HG_REPO_ID` | The Hugging Face repo id to download the model | `None` | `TheBloke/orca_mini_3B-GGML` |
+| `DEFAULT_MODEL_HG_REPO_REVISION` | The Hugging Face repo revision (branch, tag or commit) | `main` | `gptq-4bit-32g-actorder_True` |
+| `DEFAULT_MODEL_FILE` | The file name to download from the repo, optional for GPTQ models | `None` | `orca-mini-3b.ggmlv3.q4_0.bin` |
+| `MODE_TYPE` | Model type to override the auto model type detection | `None` | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj` |
+| `LOGGING_LEVEL` | Logging level | `INFO` | `DEBUG` |
+| `TOP_K` | top-k for sampling. | `40` | Integers |
+| `TOP_P` | top-p for sampling. | `1.0` | Floats |
+| `REPETITION_PENALTY` | Repetition penalty for sampling. | `1.1` | Floats |
+| `LAST_N_TOKENS` | The last n tokens for repetition penalty. | `1.1` | Integers |
+| `SEED` | The seed for sampling. | `-1` | Integers |
+| `BATCH_SIZE` | The batch size for evaluating tokens, only for GGUF/GGML models | `8` | Integers |
+| `THREADS` | Number of threads; overrides the auto-detected default (CPU count / 2), set `1` for GPTQ models | `Auto` | Integers |
+| `MAX_TOKENS` | The max number of tokens to generate | `512` | Integers |
+| `STOP` | The token to stop the generation | `None` | `<|endoftext|>` |
+| `CONTEXT_LENGTH` | Override the auto-detected context length | `512` | Integers |
+| `GPU_LAYERS` | The number of layers to offload to the GPU | `0` | Integers |
+| `TRUNCATE_PROMPT_LENGTH` | Truncate the prompt if set | `0` | Integers |

 Sampling parameters including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS`, `STOP` can be override per request via request body, for example:

diff --git a/charts/ialacol/Chart.yaml b/charts/ialacol/Chart.yaml
index a536af7b8f9c331d307a179ddb5240636d91c110..5d1d626ce0b12f6636dac1239321f3642ce09281 100644
--- a/charts/ialacol/Chart.yaml
+++ b/charts/ialacol/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
-appVersion: 0.11.5
+appVersion: 0.12.0
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.5
+version: 0.12.0
diff --git a/charts/ialacol/templates/deployment.yaml b/charts/ialacol/templates/deployment.yaml
index d3c4b1a2feabbb1a06a531922787ce48a1d62693..84552666e0225c32a5783937501a11b9f0a98a00 100644
--- a/charts/ialacol/templates/deployment.yaml
+++ b/charts/ialacol/templates/deployment.yaml
@@ -27,6 +27,8 @@ spec:
         env:
           - name: DEFAULT_MODEL_HG_REPO_ID
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
+          - name: DEFAULT_MODEL_HG_REPO_REVISION
+            value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_REVISION | quote }}
           - name: DEFAULT_MODEL_FILE
             value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
           - name: MODE_TYPE
diff --git a/charts/ialacol/values.yaml b/charts/ialacol/values.yaml
index 08e45a848a92a1e1a260e4076361996ce656306d..2a06c5135254a6dbb05bd2c1a4afafcf69ba83fc 100644
--- a/charts/ialacol/values.yaml
+++ b/charts/ialacol/values.yaml
@@ -5,6 +5,7 @@ deployment:
   # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest`
   # env:
   #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
+  #   DEFAULT_MODEL_HG_REPO_REVISION: main
   #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
   #   LOGGING_LEVEL: DEBUG
   resources:
diff --git a/main.py b/main.py
index 6dff1a50fe399ab4de79e47f1d60aea4a4e448c2..6b936cd3c88cc6cb9af8bfcd2afd7c17e4113e17 100644
--- a/main.py
+++ b/main.py
@@ -31,9 +31,13 @@ from const import DEFAULT_CONTEXT_LENGTH
 DEFAULT_MODEL_HG_REPO_ID = get_env(
     "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
 )
+DEFAULT_MODEL_HG_REPO_REVISION = get_env(
+    "DEFAULT_MODEL_HG_REPO_REVISION", "main"
+)
 DEFAULT_MODEL_FILE = get_env("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin")

 log.info("DEFAULT_MODEL_HG_REPO_ID: %s", DEFAULT_MODEL_HG_REPO_ID)
+log.info("DEFAULT_MODEL_HG_REPO_REVISION: %s", DEFAULT_MODEL_HG_REPO_REVISION)
 log.info("DEFAULT_MODEL_FILE: %s", DEFAULT_MODEL_FILE)

 DOWNLOADING_MODEL = False
@@ -93,6 +97,7 @@ async def startup_event():
         )
         snapshot_download(
             repo_id=DEFAULT_MODEL_HG_REPO_ID,
+            revision=DEFAULT_MODEL_HG_REPO_REVISION,
             cache_dir="models/.cache",
             local_dir="models",
             resume_download=True,
@@ -106,6 +111,7 @@ async def startup_event():
         )
         hf_hub_download(
             repo_id=DEFAULT_MODEL_HG_REPO_ID,
+            revision=DEFAULT_MODEL_HG_REPO_REVISION,
             cache_dir="models/.cache",
             local_dir="models",
             filename=DEFAULT_MODEL_FILE,
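
Note (illustrative, not part of the patch): the new DEFAULT_MODEL_HG_REPO_REVISION value is passed straight through to Hugging Face Hub as the revision argument, so it can be a branch name, tag, or commit hash (for example the gptq-4bit-32g-actorder_True branch used by the smoke test). A minimal standalone sketch of the same call path, using only the public huggingface_hub API and the defaults from the patch:

# Illustrative sketch only; mirrors the download call added to main.py above.
import os

from huggingface_hub import hf_hub_download

repo_id = os.environ.get("DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML")
revision = os.environ.get("DEFAULT_MODEL_HG_REPO_REVISION", "main")  # branch, tag, or commit
filename = os.environ.get("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin")

# The revision is forwarded unchanged; the other arguments match the patched call.
path = hf_hub_download(
    repo_id=repo_id,
    revision=revision,
    filename=filename,
    cache_dir="models/.cache",
    local_dir="models",
)
print(path)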