K8s AI Models / Ialacol-build · Commits

Commit 4d1fc25e (unverified)
Authored 1 year ago by Henry Chen, committed by GitHub 1 year ago
Fixes for gptq image, improve `codegen` mapping (to gptj) (#64)
Signed-off-by: Hung-Han (Henry) Chen <chenhungh@gmail.com>

Parent: 4f651e38
Showing 4 changed files with 70 additions and 29 deletions:

- Dockerfile.gptq (+7 −1)
- charts/ialacol/Chart.yaml (+2 −2)
- get_model_type.py (+10 −1)
- main.py (+51 −25)
Dockerfile.gptq (+7 −1)

 # syntax=docker/dockerfile:1
-FROM python:3.11-slim
+FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends g++ python3-dev python3-pip \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get purge -y --auto-remove g++ python3-dev python3-pip
 WORKDIR /app
 COPY requirements.txt requirements.txt
 RUN pip3 install -r requirements.txt
+# Fixes exllama/cuda_ext.py:82: UserWarning: Failed to initialize NumPy: No module named 'numpy'
+RUN pip3 install numpy
 # https://github.com/marella/ctransformers#gptq
 RUN pip3 install ctransformers[gptq]
 COPY . .
...
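Since the image now builds on the CUDA devel base and installs ctransformers[gptq], a quick way to verify the rebuilt image is to load a GPTQ model inside the container. A minimal sketch, assuming a GPTQ snapshot has already been downloaded to /app/models (the path and prompt are illustrative, not part of this commit):

# Minimal smoke test for the GPTQ image; /app/models is an assumed location.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="/app/models",  # assumes a GPTQ snapshot was downloaded here
    model_type="gptq",
    local_files_only=True,
)
print(llm("def hello_world():", max_new_tokens=32))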
charts/ialacol/Chart.yaml (+2 −2)

 apiVersion: v2
-appVersion: 0.11.1
+appVersion: 0.11.2
 description: A Helm chart for ialacol
 name: ialacol
 type: application
-version: 0.11.1
+version: 0.11.2
get_model_type.py (+10 −1)

-from request_body import ChatCompletionRequestBody, CompletionRequestBody
 from get_env import get_env
...
@@ -6,6 +5,7 @@ def get_model_type(
     filename: str,
 ) -> str:
     ctransformer_model_type = "llama"
+    filename = filename.lower()
     # These are also in "starcoder" format
     # https://huggingface.co/TheBloke/WizardCoder-15B-1.0-GGML
     # https://huggingface.co/TheBloke/minotaur-15B-GGML
...
@@ -34,6 +34,15 @@ def get_model_type(
     # matching https://huggingface.co/EleutherAI/pythia-70m
     if "pythia" in filename:
         ctransformer_model_type = "gpt_neox"
+    # codegen family are in gptj, codegen2 isn't but not supported by ggml/ctransformer yet
+    # https://huggingface.co/Salesforce/codegen-2B-multi
+    # https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant
+    if "codegen" in filename:
+        ctransformer_model_type = "gptj"
+    DEFAULT_MODEL_HG_REPO_ID = get_env("DEFAULT_MODEL_HG_REPO_ID", "")
+    if "gptq" in str(DEFAULT_MODEL_HG_REPO_ID).lower() or "gptq" in filename:
+        ctransformer_model_type = "gptq"
     MODE_TYPE = get_env("MODE_TYPE", "")
     if len(MODE_TYPE) > 0:
...
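Net effect of the new branches: any filename containing codegen is served with the gptj model type, and gptq (in either the repo id or the filename) takes precedence over everything else. A condensed, standalone sketch of just the logic added here (the real function also handles starcoder, pythia, and the MODE_TYPE override):

# Condensed sketch of the mapping added in this commit (not the full function).
def map_model_type(filename: str, repo_id: str = "") -> str:
    model_type = "llama"  # default
    filename = filename.lower()
    if "codegen" in filename:
        model_type = "gptj"  # codegen GGML weights use the gptj format
    if "gptq" in repo_id.lower() or "gptq" in filename:
        model_type = "gptq"  # gptq wins over any filename-based match
    return model_type

assert map_model_type("codegen-2b-multi-ggml-q4_0.bin") == "gptj"
assert map_model_type("model.safetensors", repo_id="TheBloke/WizardLM-7B-GPTQ") == "gptq"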
main.py (+51 −25)
@@ -12,8 +12,8 @@ from typing import (
 )
 from fastapi import FastAPI, Depends, HTTPException, Body, Request
 from fastapi.responses import StreamingResponse
-from ctransformers import LLM, Config
-from huggingface_hub import hf_hub_download
+from ctransformers import LLM, AutoModelForCausalLM, Config
+from huggingface_hub import hf_hub_download, snapshot_download
 from get_config import get_config
 from get_model_type import get_model_type
@@ -70,22 +70,37 @@ async def startup_event():
     Starts up the server, setting log level, downloading the default model if necessary.
     """
     log.info("Starting up...")
-    if DEFAULT_MODEL_FILE and DEFAULT_MODEL_HG_REPO_ID:
+    model_type = get_model_type(DEFAULT_MODEL_FILE)
+    if DEFAULT_MODEL_HG_REPO_ID:
         set_downloading_model(True)
-        log.info(
-            "Downloading model... %s/%s to %s/models",
-            DEFAULT_MODEL_HG_REPO_ID,
-            DEFAULT_MODEL_FILE,
-            os.getcwd(),
-        )
         try:
-            hf_hub_download(
-                repo_id=DEFAULT_MODEL_HG_REPO_ID,
-                cache_dir="models/.cache",
-                local_dir="models",
-                filename=DEFAULT_MODEL_FILE,
-                resume_download=True,
-            )
+            if model_type == "gptq":
+                log.info(
+                    "Downloading repo %s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    os.getcwd(),
+                )
+                snapshot_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    resume_download=True,
+                )
+            elif DEFAULT_MODEL_FILE:
+                log.info(
+                    "Downloading model... %s/%s to %s/models",
+                    DEFAULT_MODEL_HG_REPO_ID,
+                    DEFAULT_MODEL_FILE,
+                    os.getcwd(),
+                )
+                hf_hub_download(
+                    repo_id=DEFAULT_MODEL_HG_REPO_ID,
+                    cache_dir="models/.cache",
+                    local_dir="models",
+                    filename=DEFAULT_MODEL_FILE,
+                    resume_download=True,
+                )
         except Exception as exception:
             log.error("Error downloading model: %s", exception)
         finally:
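The download step now branches on the detected model type: a GPTQ repo ships several artifacts (weights, config, tokenizer), so the whole snapshot is fetched, while GGML models remain a single-file download. A standalone sketch of that decision with illustrative names (the commit itself does this inline in startup_event, with resume_download=True):

# Standalone sketch of the new download branch (names are illustrative).
from huggingface_hub import hf_hub_download, snapshot_download

def download_default_model(repo_id: str, filename: str, model_type: str) -> None:
    if model_type == "gptq":
        # GPTQ repos are fetched whole: weights plus config/tokenizer files.
        snapshot_download(repo_id=repo_id, cache_dir="models/.cache", local_dir="models")
    elif filename:
        # GGML models are a single quantized file.
        hf_hub_download(
            repo_id=repo_id,
            cache_dir="models/.cache",
            local_dir="models",
            filename=filename,
        )

# e.g. download_default_model("TheBloke/WizardLM-7B-GPTQ", "", "gptq")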
@@ -103,20 +118,29 @@ async def startup_event():
         context_length=CONTEXT_LENGTH,
         gpu_layers=GPU_LAYERS,
     )
-    model_type = get_model_type(DEFAULT_MODEL_FILE)
     log.info(
-        "Creating llm singleton with model_type: %s for DEFAULT_MODEL_FILE %s",
+        "Creating llm singleton with model_type: %s",
         model_type,
-        DEFAULT_MODEL_FILE,
     )
     set_loading_model(True)
-    llm = LLM(
-        model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
-        config=config,
-        model_type=model_type,
-    )
+    if model_type == "gptq":
+        log.debug("Creating llm/gptq instance...")
+        llm = AutoModelForCausalLM.from_pretrained(
+            model_path_or_repo_id=f"{os.getcwd()}/models",
+            model_type="gptq",
+            local_files_only=True,
+        )
+        app.state.llm = llm
+    else:
+        log.debug("Creating llm/ggml instance...")
+        llm = LLM(
+            model_path=f"{os.getcwd()}/models/{DEFAULT_MODEL_FILE}",
+            config=config,
+            model_type=model_type,
+        )
+        app.state.llm = llm
     log.info("llm singleton created.")
-    app.state.llm = llm
     set_loading_model(False)
@@ -143,6 +167,7 @@ async def models():
         "object": "list",
     }
+
 @app.post("/v1/completions", response_model=CompletionResponseBody)
 async def completions(
     body: Annotated[CompletionRequestBody, Body()],
...

@@ -182,6 +207,7 @@ async def completions(
     )
     return model_generate(prompt, model_name, llm, config)
+
 @app.post("/v1/engines/{engine}/completions")
 async def engine_completions(
     # Can't use body as FastAPI require corrent context-type header
...
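With the singleton stored on app.state.llm, both completion routes serve from the same instance regardless of backend. An illustrative client-side check against a locally running server (host, port, model name, and request fields are placeholders, not defined by this commit):

# Illustrative client call; assumes ialacol is listening on localhost:8000.
import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "codegen-2B-multi-ggml-q4_0.bin",  # placeholder model name
        "prompt": "def fibonacci(n):",
    },
    timeout=120,
)
print(response.json())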