syberWolf committed on
Commit fe371ad
1 Parent(s): 5e521f7

add handler

Files changed (3)
  1. .gitignore +1 -0
  2. handler.py +92 -0
  3. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ llama_env/
handler.py ADDED
@@ -0,0 +1,92 @@
+ from llama_cpp import Llama
+ from typing import Dict, List, Any
+ import os
+
+
+ class EndpointHandler:
+     def __init__(self):
+         # Construct the model path assuming the model is in the same directory as the handler file
+         script_dir = os.path.dirname(os.path.abspath(__file__))
+         model_filename = "Phi-3-medium-128k-instruct-IQ2_XS.gguf"
+         self.model_path = os.path.join(script_dir, model_filename)
+
+         # Load the GGUF model using llama_cpp
+         self.llm = Llama(
+             model_path=self.model_path,
+             n_ctx=5000,       # Set context length to 5000 tokens
+             n_threads=12,     # Adjust the number of CPU threads as per your machine
+             n_gpu_layers=4    # Adjust based on GPU availability
+         )
+
+         # Define generation kwargs for the model
+         self.generation_kwargs = {
+             "max_tokens": 400,  # Respond with up to 400 tokens
+             "stop": ["<|end|>", "<|user|>", "<|assistant|>"],
+             "top_k": 1          # Greedy decoding
+         }
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """
+         Data args:
+             inputs (:obj:`dict`): The input prompts for the LLM including system instructions and user messages.
+
+         Return:
+             A :obj:`list` | `dict`: will be serialized and returned.
+         """
+         # Extract inputs
+         inputs = data.get("inputs", {})
+         system_instructions = inputs.get("system", "")
+         user_message = inputs.get("message", "")
+
+         if not user_message:
+             raise ValueError("No user message provided for the model.")
+
+         # Combine system instructions and user message
+         final_input = f"{system_instructions}\n{user_message}"
+
+         # Run inference with llama_cpp
+         response = self.llm.create_chat_completion(
+             messages=[
+                 {"role": "system", "content": system_instructions},
+                 {"role": "user", "content": user_message}
+             ],
+             **self.generation_kwargs
+         )
+
+         # Access generated text based on the response structure
+         try:
+             generated_text = response["choices"][0]["message"].get("content", "")
+         except (KeyError, IndexError):
+             raise ValueError("Unexpected response structure: missing 'content' in 'choices[0]['message']'")
+
+         # Return the generated text
+         return [{"generated_text": generated_text}]
+
+
+ # Example usage:
+ if __name__ == "__main__":
+     # Instantiate the handler ONCE
+     handler = EndpointHandler()
+
+     # Handlers can be called multiple times with different inputs and the model will remain in memory
+     data1 = {
+         "inputs": {
+             "system": "You are a helpful assistant.",
+             "message": "What is the meaning of life?"
+         }
+     }
+
+     data2 = {
+         "inputs": {
+             "system": "You are a knowledgeable assistant.",
+             "message": "Tell me about the history of the internet."
+         }
+     }
+
+     # First call - model already in memory
+     response1 = handler(data1)
+     print(response1)
+
+     # Second call - model still in memory
+     response2 = handler(data2)
+     print(response2)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ llama-cpp-python
+ torch
+ transformers
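
For reference, once this handler is deployed as a custom Inference Endpoint, it can be called over HTTP with the same "inputs" payload that EndpointHandler.__call__ expects. The sketch below is not part of the commit: the endpoint URL and the HF_TOKEN environment variable are placeholders you would substitute for your own deployment.

import os
import requests

# Placeholders: substitute your deployed endpoint URL and a valid access token.
ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Same payload shape the handler reads: data["inputs"]["system"] and data["inputs"]["message"].
payload = {
    "inputs": {
        "system": "You are a helpful assistant.",
        "message": "Summarize the history of the internet in two sentences."
    }
}

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
    timeout=120,
)
response.raise_for_status()

# The handler returns a list like [{"generated_text": "..."}].
print(response.json()[0]["generated_text"])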