"""Semantic movie search demo over the 25k IMDb movie dataset.

Running this script:

1. downloads the dataset CSV with gdown,
2. builds one text per movie (overview + plot keywords + top cast),
3. embeds the texts with the ``all-MiniLM-L6-v2`` sentence-transformer,
4. stores vectors and metadata in a persistent Chroma collection, and
5. launches a Gradio interface that calls :func:`search`.
"""

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install -U sentence-transformers
# !pip install gradio chromadb

import ast
from ast import literal_eval

import chromadb
import gdown
import gradio as gr
import pandas as pd
from chromadb.api.types import Metadatas
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer, util
from torch import embedding

url = 'https://drive.google.com/uc?id='
file_id = '1MgM3iObIAdqA-SvI-pXeUeXEiEAuMzXw'
output = '25k IMDb movie Dataset.csv'

# Metadata keys of the form "genre:<name>" mark the genres a movie belongs to.
GENRE_FLAG_PREFIX = 'genre:'
GENRE_FLAG_VALUE = 1
# Upper bound on records per write call when the client exposes no limit.
FALLBACK_BATCH_SIZE = 5000
# Number of results used when the UI passes no value.
DEFAULT_NUM_RESULTS = 3
# Column order of the frame returned by search(); also used for empty results.
RESULT_COLUMNS = ['Title', 'Overview', 'Director', 'Stars', 'Genre', 'year', 'Rating']


def string_to_list(lista):
    """Parse the string form of a Python list (e.g. ``"['a', 'b']"``).

    Args:
        lista: Cell content holding a Python literal.

    Returns:
        The parsed items as a list. Cells that cannot be parsed (such as the
        ``' '`` placeholder written by ``fillna``) yield an empty list, and a
        lone scalar literal is wrapped in a one-element list.
    """
    try:
        parsed = literal_eval(lista)
    except (ValueError, SyntaxError):
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed]


def concatenar_lista(lista):
    """Parse a stringified list and join its items with single spaces.

    Args:
        lista: Cell content holding a Python list literal.

    Returns:
        The items joined by ``' '``; an empty string for unparseable cells.
    """
    return ' '.join(str(item) for item in string_to_list(lista))


def _genre_flag_key(genre):
    """Return the metadata key that flags membership in ``genre``."""
    return f'{GENRE_FLAG_PREFIX}{genre}'


def _resolve_batch_size(client, fallback=FALLBACK_BATCH_SIZE):
    """Return how many records to send per write call.

    Uses the client's ``get_max_batch_size()`` when that method exists,
    capped at ``fallback``; otherwise returns ``fallback``.
    """
    getter = getattr(client, 'get_max_batch_size', None)
    if callable(getter):
        return max(1, min(fallback, int(getter())))
    return fallback


# --- Dataset download and cleaning -----------------------------------------

gdown.download(url + file_id, output, quiet=False)
df = pd.read_csv(output)

df = df.fillna(' ')
df['Keywords'] = df['Plot Kyeword'].apply(concatenar_lista)
df['Stars'] = df['Top 5 Casts'].apply(concatenar_lista)
df['Generes'] = df['Generes'].apply(string_to_list)
df['Rating'] = pd.to_numeric(df['Rating'], errors="coerce").fillna(0).astype("float")
# explode() turns an empty genre list into NaN; drop it so it never becomes
# a dropdown choice.
unique_generes = df['Generes'].explode().dropna().unique()
df.drop(['Plot Kyeword', 'Top 5 Casts'], axis=1, inplace=True)
df['text'] = df['Overview'].astype(str) + ' ' + df['Keywords'] + ' ' + df['Stars']

# --- Embeddings -------------------------------------------------------------

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(df['text'].tolist(), batch_size=64, show_progress_bar=True)
df['embeddings'] = embeddings.tolist()
df['ids'] = df.index
df['ids'] = df['ids'].astype('str')

# --- Vector store -----------------------------------------------------------

client_persistent = chromadb.PersistentClient(path='data_embeddings')
# get_or_create so a second run against the same 'data_embeddings' directory
# reuses the collection instead of failing because it already exists.
db = client_persistent.get_or_create_collection(name='movies_db')

# Keep the per-movie genre lists before flattening the column to a display
# string; the lists drive the per-genre metadata flags below.
genre_lists = df['Generes'].tolist()
df['Generes'] = df['Generes'].apply(lambda x: ', '.join(map(str, x)))

metadatas = df.drop(['ids', 'embeddings', 'text'], axis=1).to_dict('records')
# 'Generes' holds a joined string such as "Action, Drama", which an equality
# filter on a single genre cannot match. One flag key per genre makes every
# genre of a movie individually filterable.
for record, movie_genres in zip(metadatas, genre_lists):
    for movie_genre in movie_genres:
        record[_genre_flag_key(movie_genre)] = GENRE_FLAG_VALUE

ids = df['ids'].tolist()
vectors = df['embeddings'].tolist()
batch_size = _resolve_batch_size(client_persistent)
# Write in batches to stay under the store's per-call record limit, and
# upsert so rows already present from an earlier run are refreshed.
for start in range(0, len(ids), batch_size):
    stop = start + batch_size
    db.upsert(
        ids=ids[start:stop],
        embeddings=vectors[start:stop],
        metadatas=metadatas[start:stop],
    )


def search(query, genre, rating, num):
    """Return the movies most similar to ``query`` as a DataFrame.

    Args:
        query: Free-text query; it is embedded with the same model used to
            index the movies.
        genre: Genre name to require, or a falsy value for no genre filter.
        rating: Minimum rating; a falsy value means no lower bound (0).
        num: Maximum number of results; ``None`` falls back to
            ``DEFAULT_NUM_RESULTS`` and values below 1 are raised to 1.

    Returns:
        A pandas DataFrame with the columns listed in ``RESULT_COLUMNS``,
        one row per match. It has those columns even when nothing matches.
    """
    num = DEFAULT_NUM_RESULTS if num is None else max(1, int(num))

    # 'Rating' is stored as float (see the astype above); always pass a float
    # operand so the comparison type matches the stored type.
    filter_rating = float(rating) if rating else 0.0
    rating_condition = {"Rating": {"$gte": filter_rating}}

    if genre:
        conditions = {
            "$and": [
                {_genre_flag_key(genre): GENRE_FLAG_VALUE},
                rating_condition,
            ]
        }
    else:
        conditions = rating_condition

    # Embed the query explicitly so it lives in the same vector space as the
    # stored embeddings, independent of the collection's embedding function.
    query_embedding = model.encode([query or '']).tolist()
    responses = db.query(
        query_embeddings=query_embedding,
        n_results=num,
        where=conditions,
        include=['metadatas'],
    )

    response_data = [
        {
            'Title': metadata['movie title'],
            'Overview': metadata['Overview'],
            'Director': metadata['Director'],
            'Stars': metadata['Stars'],
            'Genre': metadata['Generes'],
            'year': metadata['year'],
            'Rating': metadata['Rating'],
        }
        for response in (responses['metadatas'] or [])
        for metadata in response
    ]
    return pd.DataFrame(response_data, columns=RESULT_COLUMNS)


# --- Gradio UI --------------------------------------------------------------

genres = unique_generes.tolist()

iface = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(lines=5, placeholder="Escribe aquí tu consulta...", label="Consulta"),
        gr.Dropdown(choices=genres, label="Género de la película"),
        gr.Slider(minimum=1, maximum=10, value=5, label="Puntuación mínima"),
        gr.Number(minimum=1, maximum=10, value=DEFAULT_NUM_RESULTS, label="Número de resultados"),
    ],
    outputs=gr.Dataframe(type="pandas", label="Resultados"),
    title="Buscador de películas",
    description="Introduce tu consulta (en INGLES), selecciona un género y define una puntuación mínima para buscar películas.",
)

iface.launch(share=False)