Upload 3 files
- Dockerfile +12 -0
- main.py +57 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,12 @@
+FROM python:3.9
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY ./zephyr-7b-beta.Q4_K_S.gguf /code/zephyr-7b-beta.Q4_K_S.gguf
+COPY ./main.py /code/main.py
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
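
To try the image outside of Spaces, a minimal local sketch (the image tag zephyr-api is illustrative, and the build assumes zephyr-7b-beta.Q4_K_S.gguf sits next to the Dockerfile):

# Hypothetical local build and run; tag name is made up
docker build -t zephyr-api .
docker run -p 7860:7860 zephyr-api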
main.py
ADDED
@@ -0,0 +1,57 @@
+from ctransformers import AutoModelForCausalLM
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from fastapi.middleware.cors import CORSMiddleware
+
+# Initialize the Zephyr model (a Mistral fine-tune) with appropriate quantization settings
+llm = AutoModelForCausalLM.from_pretrained(
+    "zephyr-7b-beta.Q4_K_S.gguf",  # Must match the model file copied in the Dockerfile
+    model_type='mistral',  # Zephyr uses the Mistral architecture
+    max_new_tokens=512,  # Adjust this to a safe value
+    threads=5  # Adjust based on your CPU resources
+)
+
+MAX_CONTEXT_LENGTH = 2500
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Update this to allow specific origins if needed
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Define the input structure using Pydantic
+class RequestData(BaseModel):
+    prompt: str
+
+@app.post("/generate")
+async def generate_response(request_data: RequestData):
+    system_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+    E_INST = "</s>"  # End-of-sequence token that closes each chat turn
+    user, assistant = "<|user|>", "<|assistant|>"
+
+    # Construct the full prompt in the Zephyr chat format
+    prompt = f"{system_prompt}{E_INST}\n{user}\n{request_data.prompt}{E_INST}\n{assistant}\n"
+
+    # Tokenize the prompt
+    prompt_tokens = llm.tokenize(prompt)
+
+    # Truncate so the prompt does not exceed the maximum context length
+    if len(prompt_tokens) > MAX_CONTEXT_LENGTH:
+        prompt_tokens = prompt_tokens[:MAX_CONTEXT_LENGTH]
+        prompt = llm.detokenize(prompt_tokens)
+
+    try:
+        # Generate the response using the model
+        response = llm(prompt)
+        return {"response": response}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Model inference failed: {str(e)}")
+
+# if __name__ == "__main__":
+#     import uvicorn
+#     # Run the FastAPI app with Uvicorn
+#     uvicorn.run(app, host="0.0.0.0", port=7860)
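
Once the container is up, the /generate endpoint accepts a JSON body with a single prompt field; a minimal sketch of a request (the prompt text is illustrative):

# Hypothetical request against the running server on port 7860
curl -X POST http://localhost:7860/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Summarize what this API does in one sentence."}'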
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+python-multipart
+fastapi
+pydantic
+uvicorn
+requests
+python-dotenv
+ctransformers