---
title: Openai
emoji: 馃彚
colorFrom: yellow
colorTo: red
sdk: gradio
sdk_version: 4.43.0
app_file: app.py
pinned: false
---

## Start a LlamaEdge API service

### Step 1: Install WasmEdge

```bash
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install_v2.sh | bash -s
```
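
To confirm the runtime is available, you can print its version (a minimal check; you may need to open a new shell or source the environment file the installer reports before the command is found):

```bash
# Should print the installed WasmEdge version if the install script succeeded.
wasmedge --version
```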

### Step 2: Download an LLM

```bash
curl -LO https://huggingface.co/second-state/Meta-Llama-3.1-8B-Instruct-GGUF/resolve/main/Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf
```

### Step 3: Download an embedding model

```bash
curl -LO https://huggingface.co/gaianet/Nomic-embed-text-v1.5-Embedding-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf
```

### Step 4: Download the LlamaEdge API server

```bash
curl -LO https://github.com/LlamaEdge/LlamaEdge/releases/latest/download/llama-api-server.wasm
```
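
At this point the working directory should contain the two GGUF models and the Wasm binary. A quick sanity check (file names taken from the download commands above):

```bash
# The chat model, the embedding model, and the API server should all be present.
ls -lh Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf nomic-embed-text-v1.5.f16.gguf llama-api-server.wasm
```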

## Run the OpenAI endpoint

```bash
wasmedge --dir .:. \
  --nn-preload default:GGML:AUTO:Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf \
  llama-api-server.wasm \
  --model-alias default \
  --model-name llama-3-8b-chat \
  --prompt-template llama-3-chat \
  --batch-size 128,4096 \
  --socket-addr 0.0.0.0:8080 \
  -g 60 --main-gpu 0
```
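
Once the server is up, it serves the OpenAI chat-completions API on port 8080. A minimal smoke test against the local instance, assuming it is reachable on `localhost` (the model name matches the `--model-name` flag above):

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "llama-3-8b-chat",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}]
      }'
```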

## List Models

https://jordigonzm-openai.hf.space/v1/models
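
For example, with `curl`; the response should be an OpenAI-style model list, i.e. a JSON object with `"object": "list"` and a `data` array (the exact fields depend on the LlamaEdge version):

```bash
curl https://jordigonzm-openai.hf.space/v1/models
```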

## Proxy

The Space exposes port 7860, while the LlamaEdge server listens on port 8080. The transparent proxy below forwards incoming GET and POST requests to the backend, prefixing each path with `/v1`, and streams the response back to the client.

```python
from http.server import BaseHTTPRequestHandler, HTTPServer
import http.client
import socket

# Backend server configuration
BACKEND_HOST = "localhost"
BACKEND_PORT = 8080

class TransparentProxy(BaseHTTPRequestHandler):
    def do_GET(self):
        self.proxy_request()

    def do_POST(self):
        self.proxy_request()

    def proxy_request(self):
        # Rewrite the path to prepend /v1
        modified_path = f"/v1{self.path}"
        print(f"Forwarding {self.command} request to: {modified_path}")

        # Open a connection to the backend with an extended timeout
        conn = http.client.HTTPConnection(BACKEND_HOST, BACKEND_PORT, timeout=300)  # 5-minute timeout

        try:
            # Read the request body, if any
            content_length = self.headers.get('Content-Length')
            if content_length:
                post_data = self.rfile.read(int(content_length))
                conn.request(self.command, modified_path, body=post_data, headers=self.headers)
            else:
                conn.request(self.command, modified_path, headers=self.headers)

            # Get the backend's response
            backend_response = conn.getresponse()

            # Send the response status to the client
            self.send_response(backend_response.status, backend_response.reason)

            # Forward the backend headers to the client, skipping hop-by-hop headers:
            # http.client already de-chunks the body, so passing Transfer-Encoding
            # through would corrupt streamed responses.
            for key, value in backend_response.getheaders():
                if key.lower() in ("transfer-encoding", "connection"):
                    continue
                self.send_header(key, value)
            self.end_headers()

            # Relay the response body in streaming mode
            while True:
                chunk = backend_response.read(1024)
                if not chunk:
                    break
                self.wfile.write(chunk)
                self.wfile.flush()  # Make sure each chunk is sent to the client immediately

        except socket.timeout:
            self.send_error(504, "Gateway Timeout: the backend did not respond in time.")
            print("Error: request to the backend timed out.")

        except Exception as e:
            self.send_error(500, f"Proxy error: {e}")
            print(f"Error while handling the request: {e}")

        finally:
            conn.close()

def run(server_class=HTTPServer, handler_class=TransparentProxy, port=7860):
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    print(f"Proxy running on port {port}")
    httpd.serve_forever()

if __name__ == "__main__":
    run()
```
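
With the LlamaEdge server on port 8080 and this proxy on port 7860, clients call the proxy without the `/v1` prefix, since the proxy adds it before forwarding. A minimal local test, assuming both processes are running (the model name comes from the `wasmedge` command above):

```bash
curl http://localhost:7860/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "llama-3-8b-chat",
        "messages": [{"role": "user", "content": "Hello!"}]
      }'
```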