fahad1403 committed
Commit
183c0d9
1 Parent(s): fa674b1

Upload 3 files

Files changed (3)
  1. Dockerfile +12 -0
  2. main.py +57 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
+ FROM python:3.9
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ COPY ./zephyr-7b-beta.Q4_K_S.gguf /code/zephyr-7b-beta.Q4_K_S.gguf
+ COPY ./main.py /code/main.py
+
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
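Note that the COPY step expects the GGUF file to already sit in the build context next to the Dockerfile. A minimal sketch for fetching it before running docker build, assuming the quantized weights are published in the TheBloke/zephyr-7B-beta-GGUF repo (an assumption, not stated in this commit) and that huggingface_hub is installed:

from huggingface_hub import hf_hub_download

# Download the quantized Zephyr weights into the build context so that the
# `COPY ./zephyr-7b-beta.Q4_K_S.gguf ...` step in the Dockerfile can find them.
# The repo_id below is an assumption; point it at wherever your GGUF file lives.
hf_hub_download(
    repo_id="TheBloke/zephyr-7B-beta-GGUF",
    filename="zephyr-7b-beta.Q4_K_S.gguf",
    local_dir=".",
)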
main.py ADDED
@@ -0,0 +1,57 @@
+ from ctransformers import AutoModelForCausalLM
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from fastapi.middleware.cors import CORSMiddleware
+
+ # Initialize the Zephyr (Mistral-based) model with appropriate quantization settings
+ llm = AutoModelForCausalLM.from_pretrained(
+     "zephyr-7b-beta.Q4_K_S.gguf",  # Path to the model file copied into the image by the Dockerfile
+     model_type='mistral',          # Zephyr is a Mistral fine-tune, so use the mistral model type
+     max_new_tokens=512,            # Adjust this to a safe value
+     threads=5                      # Adjust based on your CPU resources
+ )
+
+ MAX_CONTEXT_LENGTH = 2500
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # Update this to allow specific origins if needed
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Define the input structure using Pydantic
+ class RequestData(BaseModel):
+     prompt: str
+
+ @app.post("/generate")
+ async def generate_response(request_data: RequestData):
+     system_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+     E_INST = "</s>"  # End-of-sequence token used as a turn separator
+     user, assistant = "<|user|>", "<|assistant|>"
+
+     # Construct the full prompt in the Zephyr chat format
+     prompt = f"{system_prompt}{E_INST}\n{user}\n{request_data.prompt}{E_INST}\n{assistant}\n"
+
+     # Tokenize the prompt
+     prompt_tokens = llm.tokenize(prompt)
+
+     # Ensure the prompt does not exceed the maximum context length
+     if len(prompt_tokens) > MAX_CONTEXT_LENGTH:
+         prompt_tokens = prompt_tokens[:MAX_CONTEXT_LENGTH]
+         prompt = llm.detokenize(prompt_tokens)
+
+     try:
+         # Generate the response using the model
+         response = llm(prompt)
+         return {"response": response}
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Model inference failed: {str(e)}")
+
+ # if __name__ == "__main__":
+ #     import uvicorn
+ #     # Run the FastAPI app with Uvicorn
+ #     uvicorn.run(app, host="0.0.0.0", port=7860)
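Once the container is running, the /generate endpoint accepts a JSON body with a single prompt field. A minimal client sketch, assuming the app is reachable locally on port 7860 (adjust BASE_URL for a deployed Space):

import requests

# Call the /generate endpoint defined in main.py and print the model's reply.
BASE_URL = "http://localhost:7860"  # assumption: local run; replace with the Space URL if deployed

payload = {"prompt": "Explain what a GGUF file is in one sentence."}
resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
resp.raise_for_status()

print(resp.json()["response"])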
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ python-multipart
+ fastapi
+ pydantic
+ uvicorn
+ requests
+ python-dotenv
+ ctransformers