File size: 3,651 Bytes
e20d7fd
 
 
 
 
 
 
 
dd05a16
e20d7fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3365e8
e20d7fd
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

# Commented out IPython magic to ensure Python compatibility.
# %%capture
# !pip install -U sentence-transformers
# !pip install gradio chromadb

import pandas as pd
from sentence_transformers import SentenceTransformer, util
import ast 
from ast import literal_eval
import chromadb
from chromadb.utils import embedding_functions

import gdown

# Google Drive direct-download endpoint plus the id of the dataset file.
url = 'https://drive.google.com/uc?id='
file_id = '1MgM3iObIAdqA-SvI-pXeUeXEiEAuMzXw'
# Local filename the downloaded CSV is written to and then read from.
output = '25k IMDb movie Dataset.csv'

# Fetch the CSV (progress output enabled) before loading it.
gdown.download(url=f'{url}{file_id}', output=output, quiet=False)

# Load the movie dataset into the module-level DataFrame.
df = pd.read_csv(output)

def concatenar_lista(lista):
    """Join the items of a stringified Python list into one string.

    Args:
        lista: Text containing a Python list literal of strings,
            e.g. ``"['a', 'b']"``.

    Returns:
        The list items joined with single spaces. A blank or
        whitespace-only string (such as a missing cell that was filled
        with ``' '``) yields ``''``.

    Raises:
        ValueError, SyntaxError: If ``lista`` is non-blank but is not a
            valid Python literal.
    """
    # literal_eval cannot parse an empty/whitespace-only string, so treat a
    # blank cell as "no items" instead of letting it abort the whole column.
    if isinstance(lista, str) and not lista.strip():
        return ''
    return ' '.join(literal_eval(lista))

def string_to_list(lista):
    """Parse a stringified Python list back into a list.

    Args:
        lista: Text containing a Python list literal,
            e.g. ``"['Action', 'Drama']"``.

    Returns:
        The parsed value. A blank or whitespace-only string (such as a
        missing cell that was filled with ``' '``) yields an empty list.

    Raises:
        ValueError, SyntaxError: If ``lista`` is non-blank but is not a
            valid Python literal.
    """
    # literal_eval cannot parse an empty/whitespace-only string, so treat a
    # blank cell as an empty list rather than raising.
    if isinstance(lista, str) and not lista.strip():
        return []
    return literal_eval(lista)

# Replace every missing cell with a single space.
df = df.fillna(' ')

# Flatten the stringified keyword and cast lists into plain text.
df['Keywords'] = df['Plot Kyeword'].map(concatenar_lista)
df['Stars'] = df['Top 5 Casts'].map(concatenar_lista)

# Genres are kept as real lists for now so they can be exploded below.
df['Generes'] = df['Generes'].map(string_to_list)

# Ratings that cannot be parsed as numbers are coerced to NaN, then set to 0.
df['Rating'] = (
    pd.to_numeric(df['Rating'], errors="coerce")
    .fillna(0)
    .astype("float")
)

# Distinct individual genre names across all movies.
unique_generes = df['Generes'].explode().unique()

# The raw list columns are no longer needed once flattened.
df.drop(columns=['Plot Kyeword', 'Top 5 Casts'], inplace=True)

# Text that gets embedded: overview, keywords and cast, space-separated.
df['text'] = df['Overview'].astype(str) + ' ' + df['Keywords'] + ' ' + df['Stars']

# Sentence-embedding model used to vectorise each movie's text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pass a plain list of strings rather than the pandas Series, so encoding
# does not depend on the DataFrame having a default 0..n-1 index.
embeddings = model.encode(
    df['text'].tolist(),
    batch_size=64,
    show_progress_bar=True,
)

# One embedding (as a list of floats) per row.
df['embeddings'] = embeddings.tolist()

# String ids derived from the row index.
df['ids'] = df.index.astype(str)

# On-disk Chroma store; its contents survive between runs of this script.
client_persistent = chromadb.PersistentClient(path='data_embeddings')

# Because the store is persistent, the collection already exists on every run
# after the first one; get_or_create_collection reuses it instead of failing.
db = client_persistent.get_or_create_collection(name='movies_db')

# Metadata values are stored as scalars, so genre lists become one
# comma-separated string per movie.
df['Generes'] = df['Generes'].apply(lambda x: ', '.join(x))

# NOTE(review): this import looks unused in the visible code; kept in case
# something outside this view relies on it.
from torch import embedding

# Number of records written per call.
# NOTE(review): chosen to stay below the client's maximum batch size -
# confirm against client_persistent.get_max_batch_size() for the installed version.
ADD_BATCH_SIZE = 5000

# Only (re)write when the stored collection does not match the dataset size.
# upsert keeps this idempotent if a previous run was interrupted part-way.
if db.count() != len(df):
    all_ids = df['ids'].tolist()
    all_embeddings = df['embeddings'].tolist()
    all_metadatas = df.drop(['ids', 'embeddings', 'text'], axis=1).to_dict('records')

    for start in range(0, len(all_ids), ADD_BATCH_SIZE):
        stop = start + ADD_BATCH_SIZE
        db.upsert(
            ids=all_ids[start:stop],
            embeddings=all_embeddings[start:stop],
            metadatas=all_metadatas[start:stop],
        )

from chromadb.api.types import Metadatas

def search(query, genre, rating, num):
    """Search the movie collection by free text, genre and minimum rating.

    Args:
        query: Free-text description of the movie to look for.
        genre: A genre name, a list of genre names (a movie matches if it
            has any of them), or a falsy value to skip genre filtering.
        rating: Minimum rating; a falsy value means no lower bound.
        num: Number of results requested. Empty values or values below 1
            are treated as 1.

    Returns:
        A pandas DataFrame with the columns ``Title``, ``Overview``,
        ``Director``, ``Stars``, ``Genre``, ``year`` and ``Rating``, one row
        per match. The frame is empty (but keeps its columns) when nothing
        matches.
    """
    columns = ['Title', 'Overview', 'Director', 'Stars', 'Genre', 'year', 'Rating']

    n_results = max(int(num or 1), 1)

    # Ratings are stored as floats, so use a float operand as well.
    # NOTE(review): Chroma appears to compare int and float metadata
    # separately - confirm for the installed version.
    min_rating = float(rating) if rating else 0.0
    rating_filter = {"Rating": {"$gte": min_rating}}

    if genre:
        wanted = set(genre) if isinstance(genre, (list, tuple, set)) else {genre}
        # Each movie's genres are stored as one ', '-joined string, so an
        # equality test against a single genre would miss multi-genre movies.
        # Collect every stored string that contains a wanted genre instead.
        matching = [
            stored for stored in df['Generes'].unique()
            if wanted.intersection(str(stored).split(', '))
        ]
        if not matching:
            # No movie has the requested genre; skip the query entirely.
            return pd.DataFrame(columns=columns)
        conditions = {"$and": [{"Generes": {"$in": matching}}, rating_filter]}
    else:
        conditions = rating_filter

    # NOTE(review): the query text is embedded by the collection's own
    # embedding function - confirm it matches the model used for indexing.
    responses = db.query(
        query_texts=[query],
        n_results=n_results,
        where=conditions,
        include=['metadatas'],
    )

    # A single query text was sent, so only the first result list is relevant.
    metadatas = responses['metadatas']
    matches = metadatas[0] if metadatas else []

    rows = [
        {
            'Title': metadata['movie title'],
            'Overview': metadata['Overview'],
            'Director': metadata['Director'],
            'Stars': metadata['Stars'],
            'Genre': metadata['Generes'],
            'year': metadata['year'],
            'Rating': metadata['Rating'],
        }
        for metadata in matches
    ]

    return pd.DataFrame(rows, columns=columns)

import gradio as gr

# Genre names offered in the dropdown, taken from the dataset.
genres = unique_generes.tolist()

# Input widgets, in the same order as the parameters of `search`.
query_box = gr.Textbox(lines=5, placeholder="Escribe aquí tu consulta...", label="Consulta")
genre_dropdown = gr.Dropdown(choices=genres, label="Género de la película")
rating_slider = gr.Slider(minimum=1, maximum=10, value=5, label="Puntuación mínima")
count_input = gr.Number(minimum=1, maximum=10, value=3, label="Número de resultados")

# Results are rendered as a table built from the DataFrame `search` returns.
results_table = gr.Dataframe(type="pandas", label="Resultados")

iface = gr.Interface(
    fn=search,
    inputs=[query_box, genre_dropdown, rating_slider, count_input],
    outputs=results_table,
    title="Buscador de películas",
    description="Introduce tu consulta (en INGLES), selecciona un género y define una puntuación mínima para buscar películas.",
)

# Start the web UI without creating a public share link.
iface.launch(share=False)