import html

import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

def search(text, model, ds, n):
    """Return the ``n`` nearest examples in ``ds`` for the query ``text``.

    The query is encoded with ``model.encode`` and looked up against the
    ``'embedding'`` index of ``ds`` via ``get_nearest_examples``.

    Returns:
        A list of ``(title, snippet, link, score)`` tuples, where ``snippet``
        is the first 150 characters of the example's ``content`` field.
    """
    query_vector = model.encode(text)
    scores, hits = ds.get_nearest_examples('embedding', query_vector, k=n)

    titles = hits["title"]
    links = hits["link"]
    # Only a short preview of each article body is returned.
    snippets = [body[:150] for body in hits["content"]]

    rows = zip(titles, snippets, links, scores)
    return list(rows)

# allow_output_mutation=True: the cached dataset carries a FAISS index and is
# shared by reference across reruns. Without the flag, st.cache hashes the
# return value on each call to detect mutation, which is unnecessary for an
# object of this kind.
@st.cache(allow_output_mutation=True)
def get_dataset():
    """Load the embedded news dataset and attach a FAISS index to it.

    Returns:
        The ``"train"`` split of ``justinian336/salvadoran-news-embedded``
        with a FAISS index added over its ``embedding`` column.
    """
    ds = load_dataset("justinian336/salvadoran-news-embedded")["train"]
    ds.add_faiss_index(column="embedding")
    return ds

def get_model():
    """Ensure the sentence-embedding model is stored in the session state.

    On first use the ``justinian336/chupeto`` model is instantiated and kept
    under ``st.session_state["model"]``; later calls leave the stored model
    untouched. Nothing is returned.
    """
    if "model" in st.session_state:
        return
    st.session_state["model"] = SentenceTransformer("justinian336/chupeto")

# Prepare the indexed dataset and make sure the embedding model is available
# in the session state before any UI is drawn.
ds = get_dataset()
get_model()

st.markdown("# Buscador de Noticias Salvadoreñas")
search_text = st.text_input(label="Búsqueda")

# Only run a search once the user has typed something.
if search_text:
    search_results = search(search_text, st.session_state["model"], ds, 10)
    for title, content, url, _ in search_results:
        # The values come from the dataset and are rendered as raw HTML, so
        # escape them: a stray "<", "&" or quote would otherwise break the
        # markup (or inject HTML into the page).
        safe_title = html.escape(str(title))
        safe_content = html.escape(str(content))
        safe_url = html.escape(str(url), quote=True)
        st.markdown(f"""<div><a href="{safe_url}">{safe_title}</a></div>""", unsafe_allow_html=True)
        st.markdown(f"""<div>{safe_content}...</div>""", unsafe_allow_html=True)
        st.markdown("---")