Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ embedding_path = "abokbot/wikipedia-embedding"
|
|
8 |
|
9 |
st.header("Wikipedia Search Engine app")
|
10 |
|
11 |
-
st_model_load = st.text('Loading
|
12 |
|
13 |
@st.cache_resource
|
14 |
def load_embedding():
|
@@ -19,7 +19,6 @@ def load_embedding():
|
|
19 |
return wikipedia_embedding
|
20 |
|
21 |
wikipedia_embedding = load_embedding()
|
22 |
-
st.success('Embedding loaded!')
|
23 |
st_model_load.text("")
|
24 |
|
25 |
@st.cache_resource
|
@@ -29,6 +28,7 @@ def load_encoders():
|
|
29 |
bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens
|
30 |
top_k = 32
|
31 |
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2')
|
|
|
32 |
return bi_encoder, cross_encoder
|
33 |
|
34 |
bi_encoder, cross_encoder = load_encoders()
|
@@ -39,10 +39,11 @@ st_model_load.text("")
|
|
39 |
def load_wikipedia_dataset():
|
40 |
print("Loading wikipedia dataset...")
|
41 |
dataset = load_dataset("abokbot/wikipedia-first-paragraph")["train"]
|
|
|
42 |
return dataset
|
43 |
|
44 |
dataset = load_wikipedia_dataset()
|
45 |
-
st.success('
|
46 |
st_model_load.text("")
|
47 |
|
48 |
if 'text' not in st.session_state:
|
@@ -57,7 +58,10 @@ st_text_area = st.text_area(
|
|
57 |
def search():
|
58 |
st.session_state.text = st_text_area
|
59 |
query = st_text_area
|
|
|
|
|
60 |
##### Semantic Search #####
|
|
|
61 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
62 |
top_k = 32
|
63 |
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
|
@@ -66,6 +70,7 @@ def search():
|
|
66 |
|
67 |
##### Re-Ranking #####
|
68 |
# Now, score all retrieved passages with the cross_encoder
|
|
|
69 |
cross_inp = [[query, dataset[hit['corpus_id']]["text"]] for hit in hits]
|
70 |
cross_scores = cross_encoder.predict(cross_inp)
|
71 |
|
@@ -99,7 +104,7 @@ if 'results' not in st.session_state:
|
|
99 |
if len(st.session_state.results) > 0:
|
100 |
with st.container():
|
101 |
st.subheader("Search results")
|
102 |
-
for result in st.session_state.
|
103 |
for k,v in result.items():
|
104 |
st.markdown("score: " + results["score"])
|
105 |
st.markdown("title: " + results["title"])
|
|
|
8 |
|
9 |
st.header("Wikipedia Search Engine app")
|
10 |
|
11 |
+
st_model_load = st.text('Loading encoders, embeddings and dataset (takes about 5min)')
|
12 |
|
13 |
@st.cache_resource
|
14 |
def load_embedding():
|
|
|
19 |
return wikipedia_embedding
|
20 |
|
21 |
wikipedia_embedding = load_embedding()
|
|
|
22 |
st_model_load.text("")
|
23 |
|
24 |
@st.cache_resource
|
|
|
28 |
bi_encoder.max_seq_length = 256 #Truncate long passages to 256 tokens
|
29 |
top_k = 32
|
30 |
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2')
|
31 |
+
print("Encoders loaded!")
|
32 |
return bi_encoder, cross_encoder
|
33 |
|
34 |
bi_encoder, cross_encoder = load_encoders()
|
|
|
39 |
def load_wikipedia_dataset():
|
40 |
print("Loading wikipedia dataset...")
|
41 |
dataset = load_dataset("abokbot/wikipedia-first-paragraph")["train"]
|
42 |
+
print("Dataset loaded!")
|
43 |
return dataset
|
44 |
|
45 |
dataset = load_wikipedia_dataset()
|
46 |
+
st.success('Loading done')
|
47 |
st_model_load.text("")
|
48 |
|
49 |
if 'text' not in st.session_state:
|
|
|
58 |
def search():
|
59 |
st.session_state.text = st_text_area
|
60 |
query = st_text_area
|
61 |
+
print("Input question:", query)
|
62 |
+
|
63 |
##### Semantic Search #####
|
64 |
+
print("Semantic Search")
|
65 |
# Encode the query using the bi-encoder and find potentially relevant passages
|
66 |
top_k = 32
|
67 |
question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
|
|
|
70 |
|
71 |
##### Re-Ranking #####
|
72 |
# Now, score all retrieved passages with the cross_encoder
|
73 |
+
print("Re-Ranking")
|
74 |
cross_inp = [[query, dataset[hit['corpus_id']]["text"]] for hit in hits]
|
75 |
cross_scores = cross_encoder.predict(cross_inp)
|
76 |
|
|
|
104 |
if len(st.session_state.results) > 0:
|
105 |
with st.container():
|
106 |
st.subheader("Search results")
|
107 |
+
for result in st.session_state.results:
|
108 |
for k,v in result.items():
|
109 |
st.markdown("score: " + results["score"])
|
110 |
st.markdown("title: " + results["title"])
|