Unverified commit e22bd336, authored by Henry Chen, committed by GitHub

Add support for downloading a model from a specific revision (#77)

parent 13d4334b
name: Smoke Test
on: pull_request
env:
  REGISTRY: quay.io
  REPO_ORG_NAME: ialacol
  IMAGE_NAME: ialacol-smoke-test
  GPTQ_IMAGE_TAG: gptq
  HELM_NAMESPACE: default
  LOGGING_LEVEL: DEBUG
  # for testing llama base models
@@ -26,6 +24,12 @@ env:
  STARCODER_MODEL_HG_REPO_ID: mike-ravkine/tiny_starcoder_py-GGML
  STARCODER_MODEL_FILE: tiny_starcoder_py-q8_0.bin
  STARCODER_SVC_PORT: 8002
  # for testing gptq models
  GPTQ_HELM_RELEASE_NAME: stablecode-instruct-alpha-3b-gptq
  GPTQ_MODEL_HG_REPO_ID: TheBloke/stablecode-instruct-alpha-3b-GPTQ
  GPTQ_MODEL_HG_REVISION: gptq-4bit-32g-actorder_True
  GPTQ_MODEL_FILE: model.safetensors
  GPTQ_SVC_PORT: 8003
jobs:
  build-image:
@@ -36,7 +40,7 @@ jobs:
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
@@ -51,6 +55,29 @@ jobs:
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
  build-gptq-cuda12-image:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Login to Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.QUAY_ROBOT_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_PASSWORD }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile.cuda12
          push: true
          tags: |
            ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
  llama-smoke-test:
    runs-on: ubuntu-latest
    needs: build-image
@@ -274,3 +301,79 @@ jobs:
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$STARCODER_HELM_RELEASE_NAME -n $HELM_NAMESPACE
  gptq-smoke-test:
    runs-on: ubuntu-latest
    needs: build-gptq-cuda12-image
    steps:
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.7.0
      - name: Set up Helm
        uses: azure/setup-helm@v3
        with:
          version: v3.12.0
      - name: Checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - name: Install ialacol with GPTQ model from a revision and wait for pods to be ready
        run: |
          cat > values.yaml <<EOF
          replicas: 1
          deployment:
            image: ${{ env.REGISTRY }}/${{ env.REPO_ORG_NAME }}/${{ env.IMAGE_NAME }}:${{ env.GPTQ_IMAGE_TAG }}
            env:
              DEFAULT_MODEL_HG_REPO_ID: $GPTQ_MODEL_HG_REPO_ID
              DEFAULT_MODEL_HG_REPO_REVISION: $GPTQ_MODEL_HG_REVISION
              DEFAULT_MODEL_FILE: $GPTQ_MODEL_FILE
              MODEL_TYPE: "gptq"
              LOGGING_LEVEL: $LOGGING_LEVEL
          resources:
            {}
          model:
            persistence:
              size: 3Gi
              accessModes:
                - ReadWriteOnce
          service:
            type: ClusterIP
            port: $GPTQ_SVC_PORT
            annotations: {}
          nodeSelector: {}
          tolerations: []
          affinity: {}
          EOF
          helm install $GPTQ_HELM_RELEASE_NAME -f values.yaml --namespace $HELM_NAMESPACE ./charts/ialacol
          echo "Wait for the pod to be ready, it takes about 36s to download a 1.93GB model (~50MB/s)"
          sleep 40
      - if: always()
        run: |
          kubectl get pods -n $HELM_NAMESPACE
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
      - name: Port forward to the GPTQ model service
        run: |
          kubectl port-forward svc/$GPTQ_HELM_RELEASE_NAME $GPTQ_SVC_PORT:$GPTQ_SVC_PORT &
          echo "Wait for port-forward to be ready"
          sleep 5
      - name: Check the GET /v1/models endpoint
        run: |
          curl http://localhost:$GPTQ_SVC_PORT/v1/models
      - uses: actions/setup-python@v4
        with:
          python-version: 3.11
      - name: Install OpenAI CLI
        run: |
          pip install --upgrade openai --quiet
      # On CPU CI workers we can only test that the download works and that GET /models returns something
      # (a Python sketch of the same check follows this workflow diff)
      - name: Test the OpenAI CLI with default parameters
        run: |
          openai -k "sk-fake" -b http://localhost:$GPTQ_SVC_PORT/v1 api models.list
      - if: always()
        run: |
          kubectl logs --tail=200 --selector app.kubernetes.io/name=$GPTQ_HELM_RELEASE_NAME -n $HELM_NAMESPACE
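As noted in the job above, the smoke test on CPU CI workers only verifies that the pinned-revision download succeeds and that `GET /v1/models` responds. A rough Python equivalent of the final CLI check, assuming the pre-1.0 `openai` package implied by the `openai api models.list` invocation (illustrative only, not part of the workflow):

```python
# Illustrative sketch of the smoke check; not part of the workflow.
# Assumes the pre-1.0 `openai` package (same API the `openai api models.list` CLI uses).
import openai

openai.api_key = "sk-fake"                    # ialacol does not validate the key
openai.api_base = "http://localhost:8003/v1"  # GPTQ_SVC_PORT from the workflow env

models = openai.Model.list()                  # should include the downloaded GPTQ model
print([m["id"] for m in models["data"]])
```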
@@ -85,24 +85,25 @@ openai -k "sk-fake" \
All configuration is done via environment variables.
| Parameter                         | Description                                                                                   | Default | Example                                                                       |
| :-------------------------------- | :-------------------------------------------------------------------------------------------- | :------ | :---------------------------------------------------------------------------- |
| `DEFAULT_MODEL_HG_REPO_ID`        | The Hugging Face repo id to download the model from                                           | `None`  | `TheBloke/orca_mini_3B-GGML`                                                   |
| `DEFAULT_MODEL_HG_REPO_REVISION`  | The Hugging Face repo revision (branch, tag, or commit hash)                                   | `main`  | `gptq-4bit-32g-actorder_True`                                                  |
| `DEFAULT_MODEL_FILE`              | The file name to download from the repo; optional for GPTQ models                             | `None`  | `orca-mini-3b.ggmlv3.q4_0.bin`                                                 |
| `MODE_TYPE`                       | Model type, overriding the automatic model type detection                                     | `None`  | `gptq`, `gpt_bigcode`, `llama`, `mpt`, `replit`, `falcon`, `gpt_neox`, `gptj`  |
| `LOGGING_LEVEL`                   | Logging level                                                                                  | `INFO`  | `DEBUG`                                                                        |
| `TOP_K`                           | top-k for sampling                                                                             | `40`    | Integers                                                                       |
| `TOP_P`                           | top-p for sampling                                                                             | `1.0`   | Floats                                                                         |
| `REPETITION_PENALTY`              | Repetition penalty for sampling                                                                | `1.1`   | Floats                                                                         |
| `LAST_N_TOKENS`                   | The number of last tokens considered for the repetition penalty                               | `1.1`   | Integers                                                                       |
| `SEED`                            | The seed for sampling                                                                          | `-1`    | Integers                                                                       |
| `BATCH_SIZE`                      | The batch size for evaluating tokens; only for GGUF/GGML models                               | `8`     | Integers                                                                       |
| `THREADS`                         | Thread count, overriding the auto-detected value (CPU count / 2); set to `1` for GPTQ models  | `Auto`  | Integers                                                                       |
| `MAX_TOKENS`                      | The maximum number of tokens to generate                                                       | `512`   | Integers                                                                       |
| `STOP`                            | The token that stops the generation                                                            | `None`  | `<\|endoftext\|>`                                                              |
| `CONTEXT_LENGTH`                  | Override the auto-detected context length                                                      | `512`   | Integers                                                                       |
| `GPU_LAYERS`                      | The number of layers to offload to the GPU                                                     | `0`     | Integers                                                                       |
| `TRUNCATE_PROMPT_LENGTH`          | Truncate the prompt to this length if set                                                      | `0`     | Integers                                                                       |
Sampling parameters, including `TOP_K`, `TOP_P`, `REPETITION_PENALTY`, `LAST_N_TOKENS`, `SEED`, `MAX_TOKENS`, and `STOP`, can be overridden per request via the request body, for example:
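A minimal illustration of such a per-request override (the host, port, and model name are placeholders, and the extra fields are assumed to be accepted alongside the standard OpenAI-style chat payload):

```python
# Hypothetical per-request override: sampling fields ride along in the JSON body.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",     # placeholder host/port
    json={
        "model": "llama-2-7b-chat.ggmlv3.q4_0.bin",  # placeholder model name
        "messages": [{"role": "user", "content": "Say hello"}],
        "top_k": 20,        # overrides TOP_K for this request only
        "top_p": 0.9,       # overrides TOP_P
        "max_tokens": 64,   # overrides MAX_TOKENS
        "stop": "<|endoftext|>",  # overrides STOP
    },
    timeout=600,
)
print(response.json()["choices"][0]["message"]["content"])
```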
......
apiVersion: v2
appVersion: 0.12.0
description: A Helm chart for ialacol
name: ialacol
type: application
version: 0.12.0
@@ -27,6 +27,8 @@ spec:
          env:
            - name: DEFAULT_MODEL_HG_REPO_ID
              value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_ID | quote }}
            - name: DEFAULT_MODEL_HG_REPO_REVISION
              value: {{ (.Values.deployment.env).DEFAULT_MODEL_HG_REPO_REVISION | quote }}
            - name: DEFAULT_MODEL_FILE
              value: {{ (.Values.deployment.env).DEFAULT_MODEL_FILE | quote }}
            - name: MODE_TYPE
......
@@ -5,6 +5,7 @@ deployment:
  # or use CUDA image `ghcr.io/chenhunghan/ialacol-cuda12:latest`
  # env:
  #   DEFAULT_MODEL_HG_REPO_ID: TheBloke/Llama-2-7B-Chat-GGML
  #   DEFAULT_MODEL_HG_REPO_REVISION: main
  #   DEFAULT_MODEL_FILE: llama-2-7b-chat.ggmlv3.q4_0.bin
  #   LOGGING_LEVEL: DEBUG
resources:
......
@@ -31,9 +31,13 @@ from const import DEFAULT_CONTEXT_LENGTH
DEFAULT_MODEL_HG_REPO_ID = get_env(
    "DEFAULT_MODEL_HG_REPO_ID", "TheBloke/Llama-2-7B-Chat-GGML"
)
DEFAULT_MODEL_HG_REPO_REVISION = get_env(
    "DEFAULT_MODEL_HG_REPO_REVISION", "main"
)
DEFAULT_MODEL_FILE = get_env("DEFAULT_MODEL_FILE", "llama-2-7b-chat.ggmlv3.q4_0.bin")
log.info("DEFAULT_MODEL_HG_REPO_ID: %s", DEFAULT_MODEL_HG_REPO_ID)
log.info("DEFAULT_MODEL_HG_REPO_REVISION: %s", DEFAULT_MODEL_HG_REPO_REVISION)
log.info("DEFAULT_MODEL_FILE: %s", DEFAULT_MODEL_FILE)

DOWNLOADING_MODEL = False
@@ -93,6 +97,7 @@ async def startup_event():
        )
        snapshot_download(
            repo_id=DEFAULT_MODEL_HG_REPO_ID,
            revision=DEFAULT_MODEL_HG_REPO_REVISION,
            cache_dir="models/.cache",
            local_dir="models",
            resume_download=True,
@@ -106,6 +111,7 @@ async def startup_event():
        )
        hf_hub_download(
            repo_id=DEFAULT_MODEL_HG_REPO_ID,
            revision=DEFAULT_MODEL_HG_REPO_REVISION,
            cache_dir="models/.cache",
            local_dir="models",
            filename=DEFAULT_MODEL_FILE,
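For context, a minimal standalone sketch of what pinning a revision does with `huggingface_hub`; the repo id, revision, and filename below are simply the smoke test's values, and the local paths are illustrative rather than part of this diff:

```python
# Illustrative only: download one file from a pinned Hugging Face revision,
# mirroring what startup_event() now does when DEFAULT_MODEL_HG_REPO_REVISION is set.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="TheBloke/stablecode-instruct-alpha-3b-GPTQ",
    revision="gptq-4bit-32g-actorder_True",  # branch, tag, or commit hash
    filename="model.safetensors",
    cache_dir="models/.cache",
    local_dir="models",
    resume_download=True,
)
print(path)  # local path of the downloaded file
```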
......