---
title: Openai
emoji: 🏚
colorFrom: yellow
colorTo: red
sdk: gradio
sdk_version: 4.43.0
app_file: app.py
pinned: false
---
# Start a LlamaEdge API service
## Step 1: Install WasmEdge
```
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s
```
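If the script completes successfully, `wasmedge` should be on your `PATH` after you open a new terminal or source the environment file the installer prints. A quick sanity check:
```
wasmedge --version
```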
## Step 2: Download an LLM model
```
curl -LO https://huggingface.co/second-state/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf
```
## Step 3: Download an embedding model
```
curl -LO https://huggingface.co/gaianet/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf
```
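At this point both GGUF files should sit in the current working directory; a quick listing confirms the downloads finished (expect the chat model to be several gigabytes):
```
ls -lh *.gguf
```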
## Step 4: Download the LlamaEdge API server
```
curl -LO https://github.com/LlamaEdge/LlamaEdge/releases/latest/download/llama-api-server.wasm
```
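The API server is a self-contained Wasm binary. Assuming the release build exposes the usual CLI help (true for recent LlamaEdge releases, but worth stating as an assumption), you can inspect its options before starting it:
```
wasmedge llama-api-server.wasm --help
```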
# Run the OpenAI-compatible endpoint
```
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf \
  llama-api-server.wasm \
  --model-alias default \
  --model-name llama-3-8b-chat \
  --prompt-template llama-3-chat \
  --batch-size 128,4096 \
  --socket-addr 0.0.0.0:8080 \
  -g 60 \
  --main-gpu 0
```
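Once the server reports it is listening on port 8080, any OpenAI-style client can talk to it. A minimal smoke test with curl; the model name must match the `--model-name` value above:
```
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "llama-3-8b-chat", "messages": [{"role": "user", "content": "Say hello"}]}'
```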
## List Models
https://jordigonzm-openai.hf.space/v1/models
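The same endpoint is available locally while the server is running:
```
curl http://localhost:8080/v1/models
```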
## Proxy
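Hugging Face Spaces serve traffic on port 7860, while the LlamaEdge server above listens on 8080 under the `/v1` prefix. The small Python proxy below bridges the two: it accepts requests on 7860, rewrites each path to add `/v1`, and streams the backend response back to the client.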
```
from http.server import BaseHTTPRequestHandler, HTTPServer
import http.client
import socket

# Backend server configuration
BACKEND_HOST = "localhost"
BACKEND_PORT = 8080

# Hop-by-hop headers must not be forwarded verbatim: http.client already
# decodes chunked transfer encoding, so relaying these headers while
# re-streaming the decoded body would corrupt the response for the client.
HOP_BY_HOP = {"transfer-encoding", "connection", "keep-alive"}

class TransparentProxy(BaseHTTPRequestHandler):
    def do_GET(self):
        self.proxy_request()

    def do_POST(self):
        self.proxy_request()

    def proxy_request(self):
        # Rewrite the path to add the /v1 prefix expected by the backend
        modified_path = f"/v1{self.path}"
        print(f"Forwarding {self.command} request to: {modified_path}")

        # Open a connection to the backend with an extended timeout (5 minutes)
        conn = http.client.HTTPConnection(BACKEND_HOST, BACKEND_PORT, timeout=300)
        try:
            # Read the request body, if any
            content_length = self.headers.get('Content-Length')
            if content_length:
                post_data = self.rfile.read(int(content_length))
                conn.request(self.command, modified_path, body=post_data, headers=self.headers)
            else:
                conn.request(self.command, modified_path, headers=self.headers)

            # Get the backend response
            backend_response = conn.getresponse()

            # Relay the status line to the client
            self.send_response(backend_response.status, backend_response.reason)

            # Relay the backend headers, skipping hop-by-hop headers
            for key, value in backend_response.getheaders():
                if key.lower() not in HOP_BY_HOP:
                    self.send_header(key, value)
            self.end_headers()

            # Relay the response body to the client as a stream
            while True:
                chunk = backend_response.read(1024)
                if not chunk:
                    break
                self.wfile.write(chunk)
                self.wfile.flush()  # Push each chunk to the client immediately

        except socket.timeout:
            self.send_error(504, "Gateway Timeout: the backend did not respond in time.")
            print("Error: request to the backend timed out.")
        except Exception as e:
            self.send_error(500, f"Proxy error: {e}")
            print(f"Error while handling the request: {e}")
        finally:
            conn.close()

def run(server_class=HTTPServer, handler_class=TransparentProxy, port=7860):
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    print(f"Proxy running on port {port}")
    httpd.serve_forever()

if __name__ == "__main__":
    run()
```
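With the server from the previous section running, start the proxy (saved as `app.py`, matching the `app_file` in the Space config) and send a request through it; the path is rewritten onto the `/v1` API:
```
python app.py &
curl http://localhost:7860/models   # forwarded to http://localhost:8080/v1/models
```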