Upload 3 files
- Dockerfile +12 -0
- main.py +57 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,12 @@
+FROM python:3.9
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY ./zephyr-7b-beta.Q4_K_S.gguf /code/zephyr-7b-beta.Q4_K_S.gguf
+COPY ./main.py /code/main.py
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
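
To try the image outside of Spaces, a minimal local sketch (the image tag zephyr-api is illustrative, and the build assumes zephyr-7b-beta.Q4_K_S.gguf sits next to the Dockerfile):

# Hypothetical local build and run; tag name is made up
docker build -t zephyr-api .
docker run -p 7860:7860 zephyr-api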
main.py
ADDED
@@ -0,0 +1,57 @@
+from ctransformers import AutoModelForCausalLM
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from fastapi.middleware.cors import CORSMiddleware
+
+# Initialize the Zephyr model (a Mistral fine-tune) with appropriate quantization settings
+llm = AutoModelForCausalLM.from_pretrained(
+    "zephyr-7b-beta.Q4_K_S.gguf",  # Must match the model file copied in the Dockerfile
+    model_type='mistral',  # Zephyr uses the Mistral architecture
+    max_new_tokens=512,  # Adjust this to a safe value
+    threads=5  # Adjust based on your CPU resources
+)
+
+MAX_CONTEXT_LENGTH = 2500
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Update this to allow specific origins if needed
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Define the input structure using Pydantic
+class RequestData(BaseModel):
+    prompt: str
+
+@app.post("/generate")
+async def generate_response(request_data: RequestData):
+    system_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+    E_INST = "</s>"  # End-of-sequence token that closes each chat turn
+    user, assistant = "<|user|>", "<|assistant|>"
+
+    # Construct the full prompt in the Zephyr chat format
+    prompt = f"{system_prompt}{E_INST}\n{user}\n{request_data.prompt}{E_INST}\n{assistant}\n"
+
+    # Tokenize the prompt
+    prompt_tokens = llm.tokenize(prompt)
+
+    # Truncate so the prompt does not exceed the maximum context length
+    if len(prompt_tokens) > MAX_CONTEXT_LENGTH:
+        prompt_tokens = prompt_tokens[:MAX_CONTEXT_LENGTH]
+        prompt = llm.detokenize(prompt_tokens)
+
+    try:
+        # Generate the response using the model
+        response = llm(prompt)
+        return {"response": response}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Model inference failed: {str(e)}")
+
+# if __name__ == "__main__":
+#     import uvicorn
+#     # Run the FastAPI app with Uvicorn
+#     uvicorn.run(app, host="0.0.0.0", port=7860)
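
Once the container is up, the /generate endpoint accepts a JSON body with a single prompt field; a minimal sketch of a request (the prompt text is illustrative):

# Hypothetical request against the running server on port 7860
curl -X POST http://localhost:7860/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "Summarize what this API does in one sentence."}'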
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+python-multipart
+fastapi
+pydantic
+uvicorn
+requests
+python-dotenv
+ctransformers