from flask import Flask, redirect, render_template, request, url_for
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import Pinecone
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers
#from flask_limiter import Limiter
#from flask_limiter.util import get_remote_address
from langchain_community.llms import LlamaCpp
import time 


# Flask application object; the route handlers below register against it.
app = Flask(__name__)

# Setup Flask-Limiter
# NOTE: rate limiting is currently disabled. The Limiter setup below and the
# flask_limiter imports at the top of the file are commented out, so no
# module-level `limiter` name exists. Re-enable both together before applying
# any `@limiter.limit(...)` decorator to a route.
#limiter = Limiter(
#    app=app,
#    key_func=get_remote_address,  # Correctly specify key_func as a keyword argument
#    default_limits=["200 per day", "20 per hour"]
#)


# Embedding model, created once at import time and handed to the Pinecone
# vector store below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Prompt text sent to the LLM. `{context}` and `{question}` are the two
# template variables declared on PROMPT below; presumably the RetrievalQA
# chain fills them with the retrieved documents and the user's query
# (confirm against the LangChain version in use).
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer say that you don't know it, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the correct answer in human readable text and avoide printing programming code!
Make it short with no more text than needed and do not repeat your answers or the question!
"""


# Make sure the model path is correct for your system!
# The path is relative, so it depends on the working directory the app is
# started from.
llm = LlamaCpp(
    model_path="model/phi-2.Q2_K.gguf",
    temperature=0.1,
    max_tokens=128,
   # repetition_penalty=1,
    top_p=1,
    # Verbose is required to pass to the callback manager
    # NOTE(review): no callback manager is configured in this file.
    verbose=True,
)


# Wrap the prompt text in a template object exposing its two variables.
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
# Attach to the existing Pinecone index named "medicalbot".
# NOTE(review): presumably needs Pinecone credentials configured outside this
# file (none are set here) -- confirm for the deployment environment.
docsearch = Pinecone.from_existing_index("medicalbot", embeddings)
# Similarity search, asking for k=2 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Question-answering chain tying together the LLM, the retriever and the
# custom prompt. Source documents are requested as well, although
# post_message only reads the 'result' key of the chain output.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)


# Chat history: a module-level list shared by every request handled by this
# process. Entries are dicts with "sender" and "text" keys; post_message
# appends to it and home passes it to the template.
messages = []


# No rate-limit decorator here: the Flask-Limiter setup in this file is
# commented out, so referencing `limiter` would raise NameError at import time.
@app.route("/", methods=["GET"])
def home():
    """Render the chat page.

    Returns:
        The rendered ``home.html`` template, given the shared ``messages``
        history as the ``messages`` template variable.
    """
    return render_template("home.html", messages=messages)

# Number of stored messages at which the shared history is reset.
_MAX_HISTORY = 10


@app.route("/post_message", methods=["POST"])
def post_message():
    """Handle a submitted chat message and redirect back to the chat page.

    Reads the ``message`` form field, records it in the shared ``messages``
    history, passes it to the ``qa`` chain and records the chain's ``result``
    text with the elapsed time appended. If anything in that sequence raises,
    the error is logged and a generic apology is recorded as the bot's reply.

    Returns:
        A redirect response to the ``home`` view.
    """
    start_time = time.perf_counter()
    try:
        msg = request.form['message']

        # Reset a full history BEFORE recording the new message. Clearing
        # after the append would also throw away the message the user just
        # sent, leaving the bot's answer without its question.
        if len(messages) >= _MAX_HISTORY:
            messages.clear()
        messages.append({"sender": "user", "text": msg})

        bot_response = qa({"query": msg})
        response_time = time.perf_counter() - start_time
        response_with_time = f"{bot_response['result']} (Response time: {response_time:.2f} seconds)"
        messages.append({"sender": "bot", "text": response_with_time})
    except Exception:
        # Request boundary: keep the page usable whatever failed (missing form
        # field, retrieval error, model error), but log the full traceback.
        app.logger.exception("Error processing the message")
        messages.append({"sender": "bot", "text": "Sorry, I couldn't process your request."})

    return redirect(url_for('home'))


if __name__ == "__main__":
    # Start the development server on all interfaces, port 7860.
    # NOTE(security): debug=True enables Flask's interactive debugger and the
    # auto-reloader. Together with host 0.0.0.0 the debugger is reachable from
    # the network, so switch debug off for any non-local deployment.
    app.run(host='0.0.0.0', port=7860, debug=True)