"""Streamlit app: semantic search over an embedded Salvadoran news dataset."""

import html

import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

DATASET_NAME = "justinian336/salvadoran-news-embedded"
MODEL_NAME = "justinian336/chupeto"
SNIPPET_LENGTH = 150  # characters of article content shown per result
MAX_RESULTS = 10

# Prefer the resource cache when this Streamlit version provides it; otherwise
# fall back to the legacy cache with output-mutation checks disabled, so the
# returned dataset (with its FAISS index attached) is not hashed on every hit.
if hasattr(st, "cache_resource"):
    _cache_resource = st.cache_resource
else:
    _cache_resource = st.cache(allow_output_mutation=True)


def search(text, model, ds, n):
    """Return the ``n`` nearest articles to ``text``.

    Args:
        text: Free-text query.
        model: Object exposing ``encode(text)``; its output is passed as the
            query vector to the dataset's ``'embedding'`` index.
        ds: Dataset exposing ``get_nearest_examples`` with an index named
            ``'embedding'`` and ``title``, ``link`` and ``content`` columns.
        n: Maximum number of results to retrieve.

    Returns:
        A list of ``(title, snippet, url, score)`` tuples, where ``snippet``
        is the first ``SNIPPET_LENGTH`` characters of the article content and
        ``score`` is the value reported by ``get_nearest_examples``.
    """
    query_vector = model.encode(text)
    scores, examples = ds.get_nearest_examples("embedding", query_vector, k=n)
    # Guard against rows whose content is missing (None) before slicing.
    snippets = [(body or "")[:SNIPPET_LENGTH] for body in examples["content"]]
    return list(zip(examples["title"], snippets, examples["link"], scores))


@_cache_resource
def get_dataset():
    """Load the train split and attach a FAISS index on ``embedding``.

    The result is cached, so the download and index build are not repeated
    on every script rerun.
    """
    ds = load_dataset(DATASET_NAME)["train"]
    ds.add_faiss_index(column="embedding")
    return ds


@_cache_resource
def _load_model():
    """Instantiate the sentence encoder once and share it via the cache."""
    return SentenceTransformer(MODEL_NAME)


def get_model():
    """Ensure ``st.session_state["model"]`` holds the encoder and return it.

    The model comes from a cached loader instead of being constructed for
    each new session state, so sessions reuse one instance.
    """
    if "model" not in st.session_state:
        st.session_state["model"] = _load_model()
    return st.session_state["model"]


ds = get_dataset()
get_model()

st.markdown("# Buscador de Noticias Salvadoreñas")
search_text = st.text_input(label="Búsqueda")

if search_text:
    search_results = search(
        search_text, st.session_state["model"], ds, MAX_RESULTS
    )
    for title, content, url, _ in search_results:
        # Dataset text is rendered with unsafe_allow_html=True, so escape it
        # to keep stray markup in titles/bodies from being interpreted.
        safe_title = html.escape(str(title or ""))
        safe_url = html.escape(str(url or ""), quote=True)
        safe_content = html.escape(str(content or ""))
        # Link the title to the article so the retrieved URL is actually used.
        st.markdown(
            f'<a href="{safe_url}">{safe_title}</a>',
            unsafe_allow_html=True,
        )
        st.markdown(f"{safe_content}...", unsafe_allow_html=True)
        st.markdown("---")