Jongmo committed on
Commit
49664ed
1 Parent(s): ded1688

Upload 10 files

Files changed (10)
  1. Prove_lite.py +271 -0
  2. Prove_llm.py +84 -0
  3. SimpleUI_lite.py +122 -0
  4. SimpleUI_llm.py +136 -0
  5. UI_tester.py +52 -0
  6. Wikidata_Text_Parser.py +929 -0
  7. app.py +122 -0
  8. llm_load copy.py +188 -0
  9. llm_load.py +188 -0
  10. requirements.txt +118 -0
Prove_lite.py ADDED
@@ -0,0 +1,271 @@
+ import pandas as pd
+ import numpy as np
+ import sqlite3, torch, json, re, os, itertools, nltk
+ from ast import literal_eval as leval
+ from tqdm.auto import tqdm
+ from utils.verbalisation_module import VerbModule
+ from utils.sentence_retrieval_module import SentenceRetrievalModule
+ from utils.textual_entailment_module import TextualEntailmentModule
+ from importlib import reload
+ from html.parser import HTMLParser
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from tqdm import tqdm
+ import gradio as gr
+ from bs4 import BeautifulSoup
+ from cleantext import clean
+
+
+ def verbalisation(claim_df):
+     # Turn each (subject, predicate, object) claim row into a natural-language sentence.
+     verb_module = VerbModule()
+     triples = []
+     for _, row in claim_df.iterrows():
+         triple = {
+             'subject': row['entity_label'],
+             'predicate': row['property_label'],
+             'object': row['object_label']
+         }
+         triples.append(triple)
+
+     claim_df['verbalisation'] = verb_module.verbalise_triples(triples)
+     claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(verb_module.replace_unks_on_sentence)
+     claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(lambda x: verb_module.replace_unks_on_sentence(x, empty_after=True))
+     return claim_df
+
+ def setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress):
+     # Clean the reference HTML and split it into sentences, plus a sliding window of sentence pairs.
+     join_df = pd.merge(verbalised_claims_df_final, reference_text_df[['reference_id', 'url', 'html']], on='reference_id', how='left')
+     SS_df = join_df[['reference_id', 'url', 'verbalisation', 'html']].copy()
+
+     def clean_html(html_content):
+         soup = BeautifulSoup(html_content, 'html.parser')
+         text = soup.get_text(separator=' ', strip=True)
+         cleaned_text = clean(text,
+                              fix_unicode=True,
+                              to_ascii=True,
+                              lower=False,
+                              no_line_breaks=False,
+                              no_urls=True,
+                              no_emails=True,
+                              no_phone_numbers=True,
+                              no_numbers=False,
+                              no_digits=False,
+                              no_currency_symbols=True,
+                              no_punct=False,
+                              replace_with_url="",
+                              replace_with_email="",
+                              replace_with_phone_number="",
+                              replace_with_number="",
+                              replace_with_digit="",
+                              replace_with_currency_symbol="")
+         return cleaned_text
+
+     def split_into_sentences(text):
+         sentences = nltk.sent_tokenize(text)
+         return sentences
+
+     def slide_sentences(sentences, window_size=2):
+         if len(sentences) < window_size:
+             return [" ".join(sentences)]
+         return [" ".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]
+
+     SS_df['html2text'] = SS_df['html'].apply(clean_html)
+     SS_df['nlp_sentences'] = SS_df['html2text'].apply(split_into_sentences)
+     SS_df['nlp_sentences_slide_2'] = SS_df['nlp_sentences'].apply(slide_sentences)
+
+     return SS_df[['reference_id', 'verbalisation', 'url', 'nlp_sentences', 'nlp_sentences_slide_2']]
+
+ def evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES):
+     # Score every candidate sentence against the verbalised claim and keep the top N per reference.
+     sr_module = SentenceRetrievalModule(max_len=512)
+     sentence_relevance_df = splited_sentences_from_html.copy()
+     sentence_relevance_df.rename(columns={'verbalisation': 'final_verbalisation'}, inplace=True)
+
+     def chunks(l, n):
+         n = max(1, n)
+         return [l[i:i + n] for i in range(0, len(l), n)]
+
+     def compute_scores(column_name):
+         all_outputs = []
+         for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
+             outputs = []
+             for batch in chunks(row[column_name], BATCH_SIZE):
+                 batch_outputs = sr_module.score_sentence_pairs([(row['final_verbalisation'], sentence) for sentence in batch])
+                 outputs += batch_outputs
+             all_outputs.append(outputs)
+         sentence_relevance_df[f'{column_name}_scores'] = pd.Series(all_outputs)
+         assert all(sentence_relevance_df.apply(lambda x: len(x[column_name]) == len(x[f'{column_name}_scores']), axis=1))
+
+     compute_scores('nlp_sentences')
+     compute_scores('nlp_sentences_slide_2')
+
+     def get_top_n_sentences(row, column_name, n):
+         sentences_with_scores = [{'sentence': t[0], 'score': t[1], 'sentence_id': f"{row.name}_{j}"} for j, t in enumerate(zip(row[column_name], row[f'{column_name}_scores']))]
+         return sorted(sentences_with_scores, key=lambda x: x['score'], reverse=True)[:n]
+
+     def filter_overlaps(sentences):
+         filtered = []
+         for evidence in sentences:
+             if ';' in evidence['sentence_id']:
+                 start_id, end_id = evidence['sentence_id'].split(';')
+                 if not any(start_id in e['sentence_id'].split(';') or end_id in e['sentence_id'].split(';') for e in filtered):
+                     filtered.append(evidence)
+             else:
+                 if not any(evidence['sentence_id'] in e['sentence_id'].split(';') for e in filtered):
+                     filtered.append(evidence)
+         return filtered
+
+     def limit_sentence_length(sentence, max_length):
+         if len(sentence) > max_length:
+             return sentence[:max_length] + '...'
+         return sentence
+
+     nlp_sentences_TOP_N, nlp_sentences_slide_2_TOP_N, nlp_sentences_all_TOP_N = [], [], []
+
+     for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
+         top_n = get_top_n_sentences(row, 'nlp_sentences', N_TOP_SENTENCES)
+         top_n = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in top_n]
+         nlp_sentences_TOP_N.append(top_n)
+
+         top_n_slide_2 = get_top_n_sentences(row, 'nlp_sentences_slide_2', N_TOP_SENTENCES)
+         top_n_slide_2 = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in top_n_slide_2]
+         nlp_sentences_slide_2_TOP_N.append(top_n_slide_2)
+
+         all_sentences = top_n + top_n_slide_2
+         all_sentences_sorted = sorted(all_sentences, key=lambda x: x['score'], reverse=True)
+         filtered_sentences = filter_overlaps(all_sentences_sorted)
+         filtered_sentences = [{'sentence': limit_sentence_length(s['sentence'], 1024), 'score': s['score'], 'sentence_id': s['sentence_id']} for s in filtered_sentences]
+         nlp_sentences_all_TOP_N.append(filtered_sentences[:N_TOP_SENTENCES])
+
+     sentence_relevance_df['nlp_sentences_TOP_N'] = pd.Series(nlp_sentences_TOP_N)
+     sentence_relevance_df['nlp_sentences_slide_2_TOP_N'] = pd.Series(nlp_sentences_slide_2_TOP_N)
+     sentence_relevance_df['nlp_sentences_all_TOP_N'] = pd.Series(nlp_sentences_all_TOP_N)
+
+     return sentence_relevance_df
+
+ def textEntailment(evidence_df, SCORE_THRESHOLD):
+     # Run textual entailment between each verbalised claim and its selected evidence sentences.
+     textual_entailment_df = evidence_df.copy()
+     te_module = TextualEntailmentModule()
+
+     keys = ['TOP_N', 'slide_2_TOP_N', 'all_TOP_N']
+     te_columns = {f'evidence_TE_prob_{key}': [] for key in keys}
+     te_columns.update({f'evidence_TE_prob_weighted_{key}': [] for key in keys})
+     te_columns.update({f'evidence_TE_labels_{key}': [] for key in keys})
+     te_columns.update({f'claim_TE_prob_weighted_sum_{key}': [] for key in keys})
+     te_columns.update({f'claim_TE_label_weighted_sum_{key}': [] for key in keys})
+     te_columns.update({f'claim_TE_label_malon_{key}': [] for key in keys})
+
+     def process_row(row):
+         claim = row['final_verbalisation']
+         results = {}
+         for key in keys:
+             evidence = row[f'nlp_sentences_{key}']
+             evidence_size = len(evidence)
+             if evidence_size == 0:
+                 results[key] = {
+                     'evidence_TE_prob': [],
+                     'evidence_TE_labels': [],
+                     'evidence_TE_prob_weighted': [],
+                     'claim_TE_prob_weighted_sum': [0, 0, 0],
+                     'claim_TE_label_weighted_sum': 'NOT ENOUGH INFO',
+                     'claim_TE_label_malon': 'NOT ENOUGH INFO'
+                 }
+                 continue
+
+             evidence_TE_prob = te_module.get_batch_scores(
+                 claims=[claim] * evidence_size,
+                 evidence=[e['sentence'] for e in evidence]
+             )
+
+             evidence_TE_labels = [te_module.get_label_from_scores(s) for s in evidence_TE_prob]
+
+             evidence_TE_prob_weighted = [
+                 probs * ev['score'] for probs, ev in zip(evidence_TE_prob, evidence)
+                 if ev['score'] > SCORE_THRESHOLD
+             ]
+
+             claim_TE_prob_weighted_sum = np.sum(evidence_TE_prob_weighted, axis=0) if evidence_TE_prob_weighted else [0, 0, 0]
+
+             claim_TE_label_weighted_sum = te_module.get_label_from_scores(claim_TE_prob_weighted_sum) if evidence_TE_prob_weighted else 'NOT ENOUGH INFO'
+
+             claim_TE_label_malon = te_module.get_label_malon(
+                 [probs for probs, ev in zip(evidence_TE_prob, evidence) if ev['score'] > SCORE_THRESHOLD]
+             )
+
+             results[key] = {
+                 'evidence_TE_prob': evidence_TE_prob,
+                 'evidence_TE_labels': evidence_TE_labels,
+                 'evidence_TE_prob_weighted': evidence_TE_prob_weighted,
+                 'claim_TE_prob_weighted_sum': claim_TE_prob_weighted_sum,
+                 'claim_TE_label_weighted_sum': claim_TE_label_weighted_sum,
+                 'claim_TE_label_malon': claim_TE_label_malon
+             }
+         return results
+
+     for i, row in tqdm(textual_entailment_df.iterrows(), total=textual_entailment_df.shape[0]):
+         try:
+             result_sets = process_row(row)
+             for key in keys:
+                 for k, v in result_sets[key].items():
+                     te_columns[f'{k}_{key}'].append(v)
+         except Exception as e:
+             print(f"Error processing row {i}: {e}")
+             print(row)
+             raise
+
+     for key in keys:
+         for col in ['evidence_TE_prob', 'evidence_TE_prob_weighted', 'evidence_TE_labels',
+                     'claim_TE_prob_weighted_sum', 'claim_TE_label_weighted_sum', 'claim_TE_label_malon']:
+             textual_entailment_df[f'{col}_{key}'] = pd.Series(te_columns[f'{col}_{key}'])
+
+     return textual_entailment_df
+
+ def TableMaking(verbalised_claims_df_final, result):
+     # Assemble an HTML report: one evidence table per (triple, reference URL) pair.
+     verbalised_claims_df_final.set_index('reference_id', inplace=True)
+     result.set_index('reference_id', inplace=True)
+     results = pd.concat([verbalised_claims_df_final, result], axis=1)
+     results['triple'] = results[['entity_label', 'property_label', 'object_label']].apply(lambda x: ', '.join(x), axis=1)
+     all_result = pd.DataFrame()
+     for idx, row in results.iterrows():
+         aResult = pd.DataFrame(row["nlp_sentences_TOP_N"])[['sentence', 'score']]
+         aResult.rename(columns={'score': 'Relevance_score'}, inplace=True)
+         aResult = pd.concat([aResult, pd.DataFrame(row["evidence_TE_labels_all_TOP_N"], columns=['TextEntailment'])], axis=1)
+         aResult = pd.concat([aResult, pd.DataFrame(np.max(row["evidence_TE_prob_all_TOP_N"], axis=1), columns=['Entailment_score'])], axis=1)
+         aResult = aResult.reindex(columns=['sentence', 'TextEntailment', 'Entailment_score', 'Relevance_score'])
+         aBox = pd.DataFrame({'triple': [row["triple"]], 'url': row['url'], 'Results': [aResult]})
+         all_result = pd.concat([all_result, aBox], axis=0)
+
+     def dataframe_to_html(all_result):
+         html = '<html><head><style>table {border-collapse: collapse; width: 100%;} th, td {border: 1px solid black; padding: 8px; text-align: left;} th {background-color: #f2f2f2;}</style></head><body>'
+         for triple in all_result['triple'].unique():
+             html += f'<h3>Triple: {triple}</h3>'
+             df = all_result[all_result['triple'] == triple].copy()
+             for idx, row in df.iterrows():
+                 url = row['url']
+                 results = row['Results']
+                 html += f'<h3>Reference: {url}</h3>'
+                 html += results.to_html(index=False)
+         html += '</body></html>'
+         return html
+
+     html_result = dataframe_to_html(all_result)
+     return html_result
+
+ if __name__ == '__main__':
+     target_QID = 'Q245247'
+     conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
+     claim_df = pd.read_sql_query(query, conn)
+     query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
+     reference_text_df = pd.read_sql_query(query, conn)
+     verbalised_claims_df_final = verbalisation(claim_df)
+     progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio
+     def update_progress(curr_step, total_steps):
+         progress((curr_step + 1) / total_steps)
+
+     splited_sentences_from_html = setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)
+
+     BATCH_SIZE = 512
+     N_TOP_SENTENCES = 5
+     SCORE_THRESHOLD = 0.6
+     evidence_df = evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
+     result = textEntailment(evidence_df, SCORE_THRESHOLD)
+     conn.commit()
+     conn.close()
+     display_df = TableMaking(verbalised_claims_df_final, result)
Prove_llm.py ADDED
@@ -0,0 +1,84 @@
+ import pandas as pd
+ import numpy as np
+ import sqlite3, torch, json, re, os, itertools, html2text
+ from ast import literal_eval as leval
+ from tqdm.auto import tqdm
+ from utils.verbalisation_module import VerbModule
+ from utils.sentence_retrieval_module import SentenceRetrievalModule
+ from utils.textual_entailment_module import TextualEntailmentModule
+ from importlib import reload
+ import llm_load
+ from html.parser import HTMLParser
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_similarity
+ from tqdm import tqdm
+ import gradio as gr
+
+
+ def verbalisation(claim_df):
+     # Turn each (subject, predicate, object) claim row into a natural-language sentence.
+     verb_module = VerbModule()
+     triples = []
+     for _, row in claim_df.iterrows():
+         triple = {
+             'subject': row['entity_label'],
+             'predicate': row['property_label'],
+             'object': row['object_label']
+         }
+         triples.append(triple)
+
+     claim_df['verbalisation'] = verb_module.verbalise_triples(triples)
+     claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(verb_module.replace_unks_on_sentence)
+     claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(lambda x: verb_module.replace_unks_on_sentence(x, empty_after=True))
+     return claim_df
+
+ def RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress):
+     # Ask the LLM to pick relevant sentences from each reference page, then to verify the claim against them.
+     join_df = pd.merge(verbalised_claims_df_final, reference_text_df[['reference_id', 'url', 'html']], on='reference_id', how='left')
+     tokenizer, model = llm_load.llmLoad(4096)
+     h = html2text.HTML2Text()
+     h.ignore_links = True
+
+     filtered_htmls = []
+     answers = []
+     verifications = []
+     for idx, (html, verb) in enumerate(zip(join_df['html'], join_df['verbalisation'])):
+         try:
+             filtered_html = h.handle(html)
+             filtered_htmls.append(filtered_html)
+             instruct = "Find the most relevant sentences from the filtered HTML document based on the given target sentence. If there are no directly related sentences, try to find sentences that provide context or background information related to the target sentence. Only answer 'nothing' if there is absolutely no relevant information in the document. Do not include any HTML tags or markup in your answer."
+             question = f"target sentence:'{verb}', filtered HTML document:{filtered_html}"
+             answer = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=128)
+             answers.append(answer)
+         except Exception:
+             # Keep the lists aligned even when the HTML cannot be converted.
+             filtered_htmls.append('Malformed html')
+             answers.append('Malformed html')
+         instruct = "Determine whether the target sentence is supported by the given evidence or not. If so, answer 'supportive'. If not, answer 'No supports'. If you cannot determine this from the given evidence, answer 'Not enough information'."
+         question = f"target sentence:'{verb}', evidence:{answers[-1]}"
+         verification = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=64)
+         verifications.append(verification)
+
+         update_progress(idx, len(join_df))  # Update progress
+
+     return pd.DataFrame({'verbalisation': join_df['verbalisation'], 'verification': verifications, 'evidence_set': answers, 'url': join_df['url'], 'filtered_html': filtered_htmls})
+
+
+ if __name__ == '__main__':
+     target_QID = 'Q42'
+     conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
+     claim_df = pd.read_sql_query(query, conn)
+
+     query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
+     reference_text_df = pd.read_sql_query(query, conn)
+
+     verbalised_claims_df_final = verbalisation(claim_df)
+
+     progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar
+     def update_progress(curr_step, total_steps):
+         progress((curr_step + 1) / total_steps)
+
+     result = RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)
+
+     conn.commit()
+     conn.close()
SimpleUI_lite.py ADDED
@@ -0,0 +1,122 @@
+ import gradio as gr
+ import Wikidata_Text_Parser as wtr
+ import sqlite3
+ import Prove_lite as prv
+ import pandas as pd
+ import numpy as np
+ import os
+
+ def wtr_process(qid):
+     try:
+         conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+         target_QID = qid
+         query = "SELECT * FROM claim_text"
+         df = pd.read_sql_query(query, conn)
+         if target_QID in df['entity_id'].unique():
+             pass
+         else:
+             wtr.claimParser(target_QID)  # save results in .db
+             filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
+             url_set = wtr.urlParser(target_QID)  # from ref table in .db
+             html_set = wtr.htmlParser(url_set, target_QID)  # original html docs collection
+             claim_text = wtr.claim2text(html_set)  # claims generation
+             html_text = wtr.html2text(html_set)
+             claim_text = claim_text.astype(str)
+             html_text = html_text.astype(str)
+             claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
+             html_text.to_sql('html_text', conn, if_exists='replace', index=False)
+             conn.commit()
+         query = f"""
+             SELECT
+                 claim_text.entity_label,
+                 claim_text.property_label,
+                 claim_text.object_label,
+                 html_text.url
+             FROM claim_text
+             INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
+             WHERE claim_text.entity_id = '{target_QID}'
+         """
+
+         result_df = pd.read_sql_query(query, conn)
+
+         conn.commit()
+         conn.close()
+
+         return result_df
+
+     except Exception as e:
+         error_df = pd.DataFrame({'Error': [str(e)]})
+         return error_df
+
+
+ def prv_process(qid):
+     target_QID = qid
+     conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
+     claim_df = pd.read_sql_query(query, conn)
+
+     query = f"SELECT * FROM html_text WHERE entity_id = '{target_QID}'"
+     reference_text_df = pd.read_sql_query(query, conn)
+
+     verbalised_claims_df_final = prv.verbalisation(claim_df)
+
+     progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio
+     def update_progress(curr_step, total_steps):
+         progress((curr_step + 1) / total_steps)
+
+     splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)
+
+     BATCH_SIZE = 512
+     N_TOP_SENTENCES = 5
+     SCORE_THRESHOLD = 0
+     evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
+     result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
+     display_df = prv.TableMaking(verbalised_claims_df_final, result)
+     conn.commit()
+     conn.close()
+     return display_df
+
+
+ with gr.Blocks() as demo:
+     print("gradio started!")
+     gr.Markdown(
+         """
+         # Prove
+         This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
+         """
+     )
+     inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q245247)")
+     out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
+     run_button_1 = gr.Button("Start parsing")
+     run_button_1.click(wtr_process, inp, out)
+
+     gr.Markdown(
+         """
+         Pre-trained language model-based textual entailment.
+         """
+     )
+     out_2 = gr.HTML(label="Results")
+     run_button_2 = gr.Button("Start processing")
+     run_button_2.click(prv_process, inp, out_2)
+
+
+ if __name__ == "__main__":
+     # DB initialising
+     if not os.path.isfile('wikidata_claims_refs_parsed.db'):
+         conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+         target_QID = 'Q115305900'
+         wtr.claimParser(target_QID)  # save results in .db
+         filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
+         url_set = wtr.urlParser(target_QID)  # from ref table in .db
+         html_set = wtr.htmlParser(url_set, target_QID)  # original html docs collection
+         claim_text = wtr.claim2text(html_set)  # claims generation
+         html_text = wtr.html2text(html_set)
+         claim_text = claim_text.astype(str)
+         html_text = html_text.astype(str)
+         claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
+         html_text.to_sql('html_text', conn, if_exists='replace', index=False)
+         conn.commit()
+         conn.close()
+     demo.launch(share=True)
SimpleUI_llm.py ADDED
@@ -0,0 +1,136 @@
+ import gradio as gr
+ import Wikidata_Text_Parser as wtr
+ import sqlite3
+ import CodeArchive.Prove_llm as prv
+ import pandas as pd
+
+ def wtr_process(qid):
+     try:
+         conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+         target_QID = qid
+
+         cursor = conn.cursor()
+
+         cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
+         table_exists = cursor.fetchone()
+
+         if table_exists:
+             cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
+             result = cursor.fetchone()
+
+             if result is not None and result[0] == target_QID:
+                 print(result)
+                 print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
+             else:
+                 progress = gr.Progress(0)
+                 progress(0.00, desc="Wikidata claims parsing...")
+                 wtr.claimParser(target_QID)  # save results in .db
+                 filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
+                 progress(0.25, desc="URL and HTML parsing...")
+                 url_set = wtr.urlParser()  # from ref table in .db
+                 html_set = wtr.htmlParser(url_set, qid)  # original html docs collection
+                 progress(0.50, desc="claim2Text...")
+                 claim_text = wtr.claim2text(html_set)  # claims generation
+                 progress(0.74, desc="html2Text...")
+                 html_text = wtr.html2text(html_set)
+                 claim_text = claim_text.astype(str)
+                 html_text = html_text.astype(str)
+                 claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
+                 html_text.to_sql('html_text', conn, if_exists='replace', index=False)
+                 progress(1, desc="completed...")
+         else:
+             progress = gr.Progress(0)
+             progress(0.00, desc="Wikidata claims parsing...")
+             wtr.claimParser(target_QID)  # save results in .db
+             filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
+             progress(0.25, desc="URL and HTML parsing...")
+             url_set = wtr.urlParser()  # from ref table in .db
+             html_set = wtr.htmlParser(url_set)  # original html docs collection
+             progress(0.50, desc="claim2Text...")
+             claim_text = wtr.claim2text(html_set)  # claims generation
+             progress(0.74, desc="html2Text...")
+             html_text = wtr.html2text(html_set)
+             claim_text = claim_text.astype(str)
+             html_text = html_text.astype(str)
+             claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
+             html_text.to_sql('html_text', conn, if_exists='replace', index=False)
+             progress(1, desc="completed...")
+
+         query = f"""
+             SELECT
+                 claim_text.entity_label,
+                 claim_text.property_label,
+                 claim_text.object_label,
+                 html_text.url
+             FROM claim_text
+             INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
+             WHERE claim_text.entity_id = '{target_QID}'
+         """
+
+         result_df = pd.read_sql_query(query, conn)
+
+         conn.commit()
+         conn.close()
+
+         return result_df
+
+     except Exception as e:
+         error_df = pd.DataFrame({'Error': [str(e)]})
+         return error_df
+
+
+ def prv_process(qid):
+     conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+
+     query = f"""
+         SELECT html_text.*
+         FROM html_text
+         INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
+         WHERE claim_text.entity_id = '{qid}'
+     """
+     reference_text_df = pd.read_sql_query(query, conn)
+     query = f"SELECT * FROM claim_text WHERE entity_id = '{qid}'"
+     claim_df = pd.read_sql_query(query, conn)
+
+     verbalised_claims_df_final = prv.verbalisation(claim_df)
+     progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar
+
+     def update_progress(curr_step, total_steps):
+         progress((curr_step + 1) / total_steps)
+
+     result = prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)
+
+     conn.close()
+     return result
+
+
+ with gr.Blocks() as demo:
+     print("gradio started!")
+     gr.Markdown(
+         """
+         # Reference Quality Verification Tool
+         This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
+         Parsing can take 3-5 minutes depending on the number of references.
+         """
+     )
+     inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
+     out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
+     run_button_1 = gr.Button("Start parsing")
+     run_button_1.click(wtr_process, inp, out)
+
+     gr.Markdown(
+         """
+         LLM-based HTML parsing and verification!
+         """
+     )
+     out_2 = gr.DataFrame(label="LLM-based verification result")
+
+     run_button_2 = gr.Button("Start processing")
+     run_button_2.click(prv_process, inp, out_2)
+
+
+ if __name__ == "__main__":
+     demo.launch(share=True)
UI_tester.py ADDED
@@ -0,0 +1,52 @@
+ import gradio as gr
+ import Wikidata_Text_Parser as wtr
+ import sqlite3
+
+ def process_input(qid):
+     progress = gr.Progress(0)
+
+     wtr.claimParser(qid)
+
+     progress(0.20, desc="Filtering properties...")
+     filtered_df = wtr.propertyFiltering(qid)
+
+     progress(0.40, desc="Parsing URLs...")
+     url_set = wtr.urlParser()
+
+     progress(0.60, desc="Parsing HTML...")
+     html_set = wtr.htmlParser(url_set)
+
+     progress(0.80, desc="Generating claim text...")
+     claim_text = wtr.claim2text(html_set)  # claims generation
+
+     progress(1, desc="Generating HTML text...")
+     html_text = wtr.html2text(html_set)
+
+     conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     claim_text = claim_text.astype(str)
+     html_text = html_text.astype(str)
+     claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
+     html_text.to_sql('html_text', conn, if_exists='replace', index=False)
+     conn.commit()
+     conn.close()
+     return f"{html_text.shape[0]} HTML documents collected via references of {qid}"
+
+ with gr.Blocks() as demo:
+     gr.Markdown(
+         """
+         # Reference Quality Verification Tool
+         This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
+
+         Parsing can take 3-5 minutes depending on the number of references.
+         """
+     )
+
+     inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
+     out = gr.Textbox(label="Parsing result")
+     run_button = gr.Button("Start parsing")
+     run_button.click(process_input, inp, out)
+
+
+ if __name__ == "__main__":
+     demo.launch()
Wikidata_Text_Parser.py ADDED
@@ -0,0 +1,929 @@
+ import numpy as np
+ from tqdm import tqdm
+ import pandas as pd
+ import os, sqlite3, traceback, ast, requests, fasttext, re, time, string, spacy, pysbd
+ from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout, InvalidSchema, InvalidURL
+ from qwikidata.linked_data_interface import get_entity_dict_from_api
+ from datetime import datetime
+ import utils.wikidata_utils as wdutils
+ from importlib import reload
+ from urllib.parse import urlparse, unquote
+ from urllib import parse
+ from bs4 import BeautifulSoup
+ from IPython.display import clear_output
+ from os.path import exists
+ from pathlib import Path
+ from nltk.tokenize import sent_tokenize
+ from sentence_splitter import SentenceSplitter, split_text_into_sentences
+
+
+ class DatabaseExtractor():
+     # Stores parsed Wikidata claims and their references in a local SQLite database.
+     def __init__(self, dbname='wikidata_claims_refs_parsed.db'):
+         self.dbname = dbname
+         self.prepare_extraction()
+
+     def finish_extraction(self):
+         self.db.commit()
+
+     def prepare_extraction(self):
+         self.db = sqlite3.connect(self.dbname)
+         self.cursor = self.db.cursor()
+
+         self.cursor.execute('''
+             CREATE TABLE IF NOT EXISTS claims(
+                 entity_id TEXT,
+                 claim_id TEXT,
+                 rank TEXT,
+                 property_id TEXT,
+                 datatype TEXT,
+                 datavalue TEXT,
+                 PRIMARY KEY (
+                     claim_id
+                 )
+             )''')
+
+         self.cursor.execute('''
+             CREATE TABLE IF NOT EXISTS claims_refs(
+                 claim_id TEXT,
+                 reference_id TEXT,
+                 PRIMARY KEY (
+                     claim_id,
+                     reference_id
+                 )
+             )''')
+
+         self.cursor.execute('''
+             CREATE TABLE IF NOT EXISTS refs(
+                 reference_id TEXT,
+                 reference_property_id TEXT,
+                 reference_index TEXT,
+                 reference_datatype TEXT,
+                 reference_value TEXT,
+                 PRIMARY KEY (
+                     reference_id,
+                     reference_property_id,
+                     reference_index
+                 )
+             )''')
+         self.db.commit()
+
+     def extract_claim(self, entity_id, claim):
+         if claim['mainsnak']['snaktype'] == 'value':
+             value = str(claim['mainsnak']['datavalue'])
+         else:
+             value = claim['mainsnak']['snaktype']
+         try:
+             self.cursor.execute('''
+                 INSERT INTO claims(entity_id, claim_id, rank, property_id, datatype, datavalue)
+                 VALUES(?,?,?,?,?,?)''', (
+                 entity_id, claim['id'], claim['rank'],
+                 claim['mainsnak']['property'], claim['mainsnak']['datatype'], value
+             ))
+         except UnicodeEncodeError:
+             print(entity_id, claim['id'], claim['rank'],
+                   claim['mainsnak']['property'], claim['mainsnak']['datatype'], value)
+             raise
+         except sqlite3.IntegrityError as err:
+             # Ignore exact duplicates; re-raise on genuinely conflicting rows.
+             self.cursor.execute(
+                 '''SELECT *
+                    FROM claims
+                    WHERE claim_id=?
+                 ''', (claim['id'],)
+             )
+             conflicted_value = self.cursor.fetchone()
+             if conflicted_value == (entity_id, claim['id'], claim['rank'],
+                                     claim['mainsnak']['property'], claim['mainsnak']['datatype'], value):
+                 pass
+             else:
+                 print(err, claim['id'])
+                 traceback.print_exc()
+                 raise err
+
+     def extract_reference(self, ref):
+         for snaks in ref['snaks'].values():
+             for i, snak in enumerate(snaks):
+                 if snak['snaktype'] == 'value':
+                     value = str(snak['datavalue'])
+                 else:
+                     value = snak['snaktype']
+                 try:
+                     self.cursor.execute('''
+                         INSERT INTO refs(reference_id, reference_property_id, reference_index,
+                                          reference_datatype, reference_value)
+                         VALUES(?,?,?,?,?)''', (
+                         ref['hash'], snak['property'], str(i), snak['datatype'], value
+                     ))
+                 except sqlite3.IntegrityError as err:
+                     # The index is not used here: snaks tend to come shuffled from the API and sorting takes too long.
+                     self.cursor.execute(
+                         '''SELECT reference_id, reference_property_id, reference_datatype, reference_value
+                            FROM refs
+                            WHERE reference_id = ?
+                            AND reference_property_id = ?
+                         ''', (ref['hash'], snak['property'])
+                     )
+                     conflicted_values = self.cursor.fetchall()
+                     if (ref['hash'], snak['property'], snak['datatype'], value) in conflicted_values:
+                         pass
+                     else:
+                         print(err, ref['hash'], snak['property'], i)
+                         print('trying to insert:', (ref['hash'], snak['property'], str(i), snak['datatype'], value))
+                         traceback.print_exc()
+                         raise err
+
+     def extract_claim_reference(self, claim, ref):
+         try:
+             self.cursor.execute('''
+                 INSERT INTO claims_refs(claim_id, reference_id)
+                 VALUES(?,?)''', (
+                 claim['id'], ref['hash']
+             ))
+         except sqlite3.IntegrityError:
+             # The claim-reference link is already recorded.
+             pass
+
+     def extract_entity(self, e):
+         for outgoing_property_id in e['claims'].values():
+             for claim in outgoing_property_id:
+                 self.extract_claim(e['id'], claim)
+                 if 'references' in claim:
+                     for ref in claim['references']:
+                         self.extract_claim_reference(claim, ref)
+                         self.extract_reference(ref)
+
+ def claimParser(QID):
+     entity_id = QID
+     print('Setting up database ...')
+     extractor = DatabaseExtractor()
+
+     print('Fetching entity from API ...')
+     entity = get_entity_dict_from_api(entity_id)
+
+     if entity:
+         print(f'Parsing entity: {entity_id}')
+         extractor.extract_entity(entity)
+     else:
+         print(f'Failed to fetch entity: {entity_id}')
+
+     extractor.finish_extraction()
+
+ def propertyFiltering(QID):
+     reload(wdutils)
+     DB_PATH = 'wikidata_claims_refs_parsed.db'
+     claims_columns = ['entity_id', 'claim_id', 'rank', 'property_id', 'datatype', 'datavalue']
+
+     # Properties that are ontological, Wikimedia-internal, or otherwise unverbalisable.
+     properties_to_remove = {
+         'general': [
+             'P31',    # instance of
+             'P279',   # subclass of
+             'P373',   # Commons category
+             'P910',   # topic's main category
+             'P7561',  # category for the interior of the item
+             'P5008',  # on focus list of Wikimedia project
+             'P2670',  # has parts of the class
+             'P1740',  # category for films shot at this location
+             'P1612',  # Commons Institution page
+             'P8989',  # category for the view of the item
+             'P2959',  # permanent duplicated item
+             'P7867',  # category for maps
+             'P935',   # Commons gallery
+             'P1472',  # Commons Creator page
+             'P8596',  # category for the exterior of the item
+             'P5105',  # Deutsche Bahn station category
+             'P8933',  # category for the view from the item
+             'P642',   # of
+             'P3876',  # category for alumni of educational institution
+             'P1791',  # category of people buried here
+             'P7084',  # related category
+             'P1465',  # category for people who died here
+             'P1687',  # Wikidata property
+             'P6104',  # maintained by WikiProject
+             'P4195',  # category for employees of the organization
+             'P1792',  # category of associated people
+             'P5869',  # model item
+             'P1659',  # see also
+             'P1464',  # category for people born here
+             'P2354',  # has list
+             'P1424',  # topic's main template
+             'P7782',  # category for ship name
+             'P179',   # part of the series
+             'P7888',  # merged into
+             'P6365',  # member category
+             'P8464',  # content partnership category
+             'P360',   # is a list of
+             'P805',   # statement is subject of
+             'P8703',  # entry in abbreviations table
+             'P1456',  # list of monuments
+             'P1012',  # including
+             'P1151',  # topic's main Wikimedia portal
+             'P2490',  # page at OSTIS Belarus Wiki
+             'P593',   # HomoloGene ID
+             'P8744',  # economy of topic
+             'P2614',  # World Heritage criteria
+             'P2184',  # history of topic
+             'P9241',  # demographics of topic
+             'P487',   # Unicode character
+             'P1754',  # category related to list
+             'P2559',  # Wikidata usage instructions
+             'P2517',  # category for recipients of this award
+             'P971',   # category combines topics
+             'P6112',  # category for members of a team
+             'P4224',  # category contains
+             'P301',   # category's main topic
+             'P1753',  # list related to category
+             'P1423',  # template has topic
+             'P1204',  # Wikimedia portal's main topic
+             'P3921',  # Wikidata SPARQL query equivalent
+             'P1963',  # properties for this type
+             'P5125',  # Wikimedia outline
+             'P3176',  # uses property
+             'P8952',  # inappropriate property for this type
+             'P2306',  # property
+             'P5193',  # Wikidata property example for forms
+             'P5977',  # Wikidata property example for senses
+         ],
+         'specific': {}
+     }
+
+     db = sqlite3.connect(DB_PATH)
+     cursor = db.cursor()
+     # To see how many of the stored claims we exclude by removing the general properties
+     sql_query = "select count(*) from claims where property_id in $1;"
+     sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in properties_to_remove['general']]) + ')')
+     cursor.execute(sql_query)
+     print('Removing the', len(properties_to_remove['general']), 'properties deemed ontological or unverbalisable')
+     cursor = db.cursor()
+
+     sql_query = "select * from claims where entity_id in $1;"
+     sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in [QID]]) + ')')
+
+     cursor.execute(sql_query)
+     theme_df = pd.DataFrame(cursor.fetchall())
+     theme_df.columns = claims_columns
+
+     original_theme_df_size = theme_df.shape[0]
+     last_stage_theme_df_size = original_theme_df_size
+
+     print('- Removing deprecated')
+
+     # Remove deprecated
+     theme_df = theme_df[theme_df['rank'] != 'deprecated'].reset_index(drop=True)
+     print(
+         '  - Percentage of deprecated:',
+         round((last_stage_theme_df_size - theme_df.shape[0]) / original_theme_df_size * 100, 2), '%'
+     )
+     last_stage_theme_df_size = theme_df.shape[0]
+
+     print('- Removing bad datatypes')
+
+     # Remove external ids, commonsMedia (e.g. photos), globe coordinates, urls, etc.
+     bad_datatypes = ['commonsMedia', 'external-id', 'globe-coordinate', 'url', 'wikibase-form',
+                      'geo-shape', 'math', 'musical-notation', 'tabular-data', 'wikibase-sense']
+     theme_df = theme_df[
+         theme_df['datatype'].apply(
+             lambda x: x not in bad_datatypes
+         )
+     ].reset_index(drop=True)
+     print(
+         '  - Percentage of bad datatypes:',
+         round((last_stage_theme_df_size - theme_df.shape[0]) / original_theme_df_size * 100, 2), '%'
+     )
+     last_stage_theme_df_size = theme_df.shape[0]
+
+     print('- Removing bad properties')
+
+     # Remove specific properties such as P31 and P279
+     theme_df = theme_df[
+         theme_df['property_id'].apply(
+             lambda x: (x not in properties_to_remove['general']))
+     ].reset_index(drop=True)
+     print(
+         '  - Percentage of ontology (non-domain) properties:',
+         round((last_stage_theme_df_size - theme_df.shape[0]) / original_theme_df_size * 100, 2), '%'
+     )
+     last_stage_theme_df_size = theme_df.shape[0]
+
+     print('- Removing somevalue/novalue')
+
+     # Remove novalue and somevalue
+     theme_df = theme_df[
+         theme_df['datavalue'].apply(
+             lambda x: x not in ['somevalue', 'novalue']
+         )
+     ].reset_index(drop=True)
+     print(
+         '  - Percentage of somevalue/novalue:',
+         round((last_stage_theme_df_size - theme_df.shape[0]) / original_theme_df_size * 100, 2), '%'
+     )
+     last_stage_theme_df_size = theme_df.shape[0]
+
+     print(
+         'After all removals, we keep',
+         round(last_stage_theme_df_size / original_theme_df_size * 100, 2), '%'
+     )
+     theme_df.to_sql('claims', db, if_exists='replace', index=False)
+
+     return theme_df
+
+ def get_object_label_given_datatype(row):
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+     def turn_to_century_or_millennium(y, mode):
+         y = str(y)
+         if mode == 'C':
+             div = 100
+             group = int(y.rjust(3, '0')[:-2])
+             mode_name = 'century'
+         elif mode == 'M':
+             div = 1000
+             group = int(y.rjust(4, '0')[:-3])
+             mode_name = 'millennium'
+         else:
+             raise ValueError('Use mode = C for century and M for millennium')
+
+         if int(y) % div != 0:
+             group += 1
+         group = str(group)
+
+         group_suffix = (
+             'st' if group[-1] == '1' else (
+                 'nd' if group[-1] == '2' else (
+                     'rd' if group[-1] == '3' else 'th'
+                 )
+             )
+         )
+
+         return ' '.join([group + group_suffix, mode_name])
+
+     dt = row['datatype']
+     dv = row['datavalue']
+
+     dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
+     if dt not in dt_types:
+         print(dt)
+         raise ValueError
+     else:
+         try:
+             if dt == dt_types[0]:
+                 return Wd_API.get_label(ast.literal_eval(dv)['value']['id'], True)  # get label here
+             elif dt == dt_types[1]:
+                 dv = ast.literal_eval(dv)
+                 return (dv['value']['text'], dv['value']['language'])
+             elif dt == dt_types[2]:
+                 dv = ast.literal_eval(dv)
+                 amount, unit = dv['value']['amount'], dv['value']['unit']
+                 if amount[0] == '+':
+                     amount = amount[1:]
+                 if str(unit) == '1':
+                     return (str(amount), 'en')
+                 else:
+                     unit_entity_id = unit.split('/')[-1]
+                     unit = Wd_API.get_label(unit_entity_id, True)  # get label here
+                     return (' '.join([amount, unit[0]]), unit[1])
+             elif dt == dt_types[3]:
+                 dv = ast.literal_eval(dv)
+                 time = dv['value']['time']
+                 timezone = dv['value']['timezone']
+                 precision = dv['value']['precision']
+                 assert dv['value']['after'] == 0 and dv['value']['before'] == 0
+
+                 suffix = 'BC' if time[0] == '-' else ''
+                 time = time[1:]
+
+                 if precision == 11:  # date
+                     return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y') + suffix, 'en')
+                 elif precision == 10:  # month
+                     try:
+                         return (datetime.strptime(time, '%Y-%m-00T00:00:%SZ').strftime("%B of %Y") + suffix, 'en')
+                     except ValueError:
+                         return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime("%B of %Y") + suffix, 'en')
+                 elif precision == 9:  # year
+                     try:
+                         return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y') + suffix, 'en')
+                     except ValueError:
+                         return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y') + suffix, 'en')
+                 elif precision == 8:  # decade
+                     try:
+                         return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')[:-1] + '0s' + suffix, 'en')
+                     except ValueError:
+                         return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y')[:-1] + '0s' + suffix, 'en')
+                 elif precision == 7:  # century
+                     try:
+                         parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ')
+                     except ValueError:
+                         parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ')
+                     finally:
+                         return (turn_to_century_or_millennium(
+                             parsed_time.strftime('%Y'), mode='C'
+                         ) + suffix, 'en')
+                 elif precision == 6:  # millennium
+                     try:
+                         parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ')
+                     except ValueError:
+                         parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ')
+                     finally:
+                         return (turn_to_century_or_millennium(
+                             parsed_time.strftime('%Y'), mode='M'
+                         ) + suffix, 'en')
+                 elif precision == 4:  # hundred thousand years
+                     timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
+                     timeint = round(timeint / 1e5, 1)
+                     return (str(timeint) + ' hundred thousand years' + suffix, 'en')
+                 elif precision == 3:  # million years
+                     timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
+                     timeint = round(timeint / 1e6, 1)
+                     return (str(timeint) + ' million years' + suffix, 'en')
+                 elif precision == 0:  # billion years
+                     timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y'))
+                     timeint = round(timeint / 1e9, 1)
+                     return (str(timeint) + ' billion years' + suffix, 'en')
+             elif dt == dt_types[4]:
+                 return (ast.literal_eval(dv)['value'], 'en')
+         except ValueError as e:
+             raise e
+
+ def get_object_desc_given_datatype(row):
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+     dt = row['datatype']
+     dv = row['datavalue']
+
+     dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
+     if dt not in dt_types:
+         print(dt)
+         raise ValueError
+     else:
+         try:
+             if dt == dt_types[0]:
+                 return Wd_API.get_desc(ast.literal_eval(dv)['value']['id'])  # get description here
+             elif dt == dt_types[1]:
+                 return ('no-desc', 'none')
+             elif dt == dt_types[2]:
+                 dv = ast.literal_eval(dv)
+                 amount, unit = dv['value']['amount'], dv['value']['unit']
+                 if amount[0] == '+':
+                     amount = amount[1:]
+                 if str(unit) == '1':
+                     return ('no-desc', 'none')
+                 else:
+                     unit_entity_id = unit.split('/')[-1]
+                     return Wd_API.get_desc(unit_entity_id)
+             elif dt == dt_types[3]:
+                 return ('no-desc', 'none')
+             elif dt == dt_types[4]:
+                 return ('no-desc', 'none')
+         except ValueError as e:
+             raise e
+
+ def get_object_alias_given_datatype(row):
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+     dt = row['datatype']
+     dv = row['datavalue']
+
+     dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
+     if dt not in dt_types:
+         print(dt)
+         raise ValueError
+     else:
+         try:
+             if dt == dt_types[0]:
+                 return Wd_API.get_alias(ast.literal_eval(dv)['value']['id'])  # get alias here
+             elif dt == dt_types[1]:
+                 return ('no-alias', 'none')
+             elif dt == dt_types[2]:
+                 dv = ast.literal_eval(dv)
+                 amount, unit = dv['value']['amount'], dv['value']['unit']
+                 if amount[0] == '+':
+                     amount = amount[1:]
+                 if str(unit) == '1':
+                     return ('no-alias', 'none')
+                 else:
+                     unit_entity_id = unit.split('/')[-1]
+                     return Wd_API.get_alias(unit_entity_id)
+             elif dt == dt_types[3]:
+                 dv = ast.literal_eval(dv)
+                 time = dv['value']['time']
+                 timezone = dv['value']['timezone']
+                 precision = dv['value']['precision']
+                 assert dv['value']['after'] == 0 and dv['value']['before'] == 0
+
+                 suffix = 'BC' if time[0] == '-' else ''
+                 time = time[1:]
+
+                 if precision == 11:  # date
+                     return ([
+                         datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%-d of %B, %Y') + suffix,
+                         datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y (dd/mm/yyyy)') + suffix,
+                         datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%b %-d, %Y') + suffix
+                     ], 'en')
+                 else:  # coarser precisions (month, year, ...)
+                     return ('no-alias', 'none')
+             elif dt == dt_types[4]:
+                 return ('no-alias', 'none')
+         except ValueError as e:
+             raise e
+
+ def textualAugmentation(filtered_df):
+
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+
+     filtered_df['entity_label'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_label(x, True))
+     filtered_df['entity_desc'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_desc(x))
+     filtered_df['entity_alias'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_alias(x))
+
+     print(' - Predicate augmentation...')
+     filtered_df['property_label'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_label(x, True))
+     filtered_df['property_desc'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_desc(x))
+     filtered_df['property_alias'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_alias(x))
+
+     print(' - Object augmentation...')
+     filtered_df['object_label'] = filtered_df.apply(get_object_label_given_datatype, axis=1)
+     filtered_df['object_desc'] = filtered_df.apply(get_object_desc_given_datatype, axis=1)
+     filtered_df['object_alias'] = filtered_df.apply(get_object_alias_given_datatype, axis=1)
+
+     no_subject_label_perc = filtered_df[filtered_df['entity_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
+     print(' - No subject label %:', no_subject_label_perc, '%')
+
+     no_predicate_label_perc = filtered_df[filtered_df['property_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
+     print(' - No predicate label %:', no_predicate_label_perc, '%')
+
+     no_object_label_perc = filtered_df[filtered_df['object_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100
+     print(' - No object label %:', no_object_label_perc, '%')
+     return filtered_df
+
+ def urlParser(target_QID):
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+     db = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     cursor = db.cursor()
+     refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value']
+     cursor.execute('select * from refs where reference_datatype="url";')
+     url_df = pd.DataFrame(cursor.fetchall())
+     url_df.columns = refs_columns
+     def reference_value_to_url(reference_value):
+         if reference_value in ['novalue', 'somevalue']:
+             return reference_value
+         reference_value = ast.literal_eval(reference_value)
+         assert reference_value['type'] == 'string'
+         return reference_value['value']
+     def reference_value_to_external_id(reference_value):
+         if reference_value in ['novalue', 'somevalue']:
+             return reference_value
+         reference_value = ast.literal_eval(reference_value)
+         assert reference_value['type'] == 'string'
+         return reference_value['value']
+     def get_formatter_url(entity_id):
+         try:
+             sparql_query = '''
+                 SELECT ?item ?itemLabel
+                 WHERE
+                 {
+                     wd:$1 wdt:P1630 ?item.
+                     SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
+                 }
+             '''.replace('$1', entity_id)
+             sparql_results = Wd_API.query_sparql_endpoint(sparql_query)
+             if len(sparql_results['results']['bindings']) > 0:
+                 return sparql_results['results']['bindings'][0]['item']['value']
+             else:
+                 return 'no_formatter_url'
+         except Exception:
+             print(entity_id)
+             print(sparql_results)
+             raise
+     url_df['url'] = url_df.reference_value.apply(reference_value_to_url)
+     # External-id references are expanded into URLs via their property's formatter URL (P1630).
+     cursor.execute('select * from refs where reference_datatype="external-id";')
+     ext_id_df = pd.DataFrame(cursor.fetchall())
+     ext_id_df.columns = refs_columns
+     ext_id_df['ext_id'] = ext_id_df.reference_value.apply(reference_value_to_external_id)
+     ext_id_df['formatter_url'] = ext_id_df['reference_property_id'].apply(get_formatter_url)
+     ext_id_df['url'] = ext_id_df.apply(lambda x: x['formatter_url'].replace('$1', x['ext_id']), axis=1)
+     columns_for_join = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'url']
+     url_df_pre_join = url_df[columns_for_join]
+     ext_id_df_pre_join = ext_id_df[columns_for_join]
+     all_url_df = pd.concat([url_df_pre_join, ext_id_df_pre_join])
+     all_url_df = all_url_df.sort_values(['reference_id', 'reference_index'])
+     # drop those with url = 'no_formatter_url'
+     all_url_df = all_url_df[all_url_df['url'] != 'no_formatter_url'].reset_index(drop=True)
+     # drop those with url = somevalue or novalue
+     all_url_df = all_url_df[~all_url_df['url'].isin(['somevalue', 'novalue'])]
+     reference_id_counts = all_url_df.reference_id.value_counts().reset_index()
+     reference_id_counts.columns = ['reference_id', 'counts']
+     reference_id_counts_equal_1 = reference_id_counts[reference_id_counts['counts'] == 1].reference_id.tolist()
+     all_url_df_eq1 = all_url_df[all_url_df.reference_id.isin(reference_id_counts_equal_1)]
+     all_url_df_eq1 = all_url_df_eq1.reset_index(drop=True).drop('reference_index', axis=1)
+     return all_url_df_eq1
+
+ def htmlParser(url_set, qid):
+     text_reference_sampled_df = url_set
+     _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
+     text_reference_sampled_df['html'] = None
+     for i, row in text_reference_sampled_df.iterrows():
+         print(i, row.url)
+         try:
+             response = requests.get(row.url, timeout=10)
+             if response.status_code == 200:
+                 html = response.text
+                 text_reference_sampled_df.loc[i, 'html'] = html
+             else:
+                 print(f"no response, status code {response.status_code}")
+                 text_reference_sampled_df.loc[i, 'html'] = response.status_code
+         except requests.exceptions.Timeout:
+             print("Timeout occurred while fetching the URL:", row.url)
+             text_reference_sampled_df.loc[i, 'html'] = 'TimeOut'
+         except Exception as e:
+             print("An error occurred:", str(e))
+     text_reference_sampled_df_html = text_reference_sampled_df.copy()
+     text_reference_sampled_df_html['entity_id'] = qid
+     return text_reference_sampled_df_html
+
+ def claim2text(html_set):
+     text_reference_sampled_df_html = html_set
+     Wd_API = wdutils.CachedWikidataAPI()
+     Wd_API.languages = ['en']
+     db = sqlite3.connect('wikidata_claims_refs_parsed.db')
+     cursor = db.cursor()
+     claims_columns = ['entity_id', 'claim_id', 'rank', 'property_id', 'datatype', 'datavalue']
+     refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value']
+
+     def reference_id_to_claim_id(reference_id):
+         cursor.execute(f'select claim_id from claims_refs where reference_id="{reference_id}"')
+         sql_result = cursor.fetchall()
+         randomly_chosen_claim_id = np.array(sql_result).reshape(-1)
+         return randomly_chosen_claim_id
+
+     def reference_id_to_claim_data(reference_id):
+         claim_ids = reference_id_to_claim_id(reference_id)
+         r = []
+         for claim_id in claim_ids:
+             cursor.execute(f'select * from claims where claim_id="{claim_id}";')
+             d = cursor.fetchall()
+             r = r + d
+         return r
+
+     claim_data = []
+     for reference_id in text_reference_sampled_df_html.reference_id:
+         data = reference_id_to_claim_data(reference_id)
+         data = [(reference_id,) + t for t in data]
+         claim_data = claim_data + data
+
+     claim_df = pd.DataFrame(claim_data, columns=['reference_id'] + claims_columns)
+
+     def claim_id_to_claim_url(claim_id):
+         claim_id_parts = claim_id.split('$')
+         return f'https://www.wikidata.org/wiki/{claim_id_parts[0]}#{claim_id}'
+
+     BAD_DATATYPES = ['external-id', 'commonsMedia', 'url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property']
+
+     assert claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reference_id.unique().shape \
+         == claim_df.reference_id.unique().shape
+
+     print(claim_df.reference_id.unique().shape[0])
+     claim_df = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reset_index(drop=True)
+
+     from tqdm.auto import tqdm
+     tqdm.pandas()
+
+     claim_df[['entity_label', 'entity_label_lan']] = pd.DataFrame(
+         claim_df.entity_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist()
+     )
+     claim_df[['property_label', 'property_label_lan']] = pd.DataFrame(
+         claim_df.property_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist()
+     )
+
+     claim_df[['entity_alias', 'entity_alias_lan']] = pd.DataFrame(
+         claim_df.entity_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist()
+     )
+     claim_df[['property_alias', 'property_alias_lan']] = pd.DataFrame(
+         claim_df.property_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist()
+     )
+
+     claim_df[['entity_desc', 'entity_desc_lan']] = pd.DataFrame(
+         claim_df.entity_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist()
+     )
+     claim_df[['property_desc', 'property_desc_lan']] = pd.DataFrame(
+         claim_df.property_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist()
+     )
+
+     claim_df['object_label'] = claim_df.apply(get_object_label_given_datatype, axis=1)
+     claim_df['object_alias'] = claim_df.apply(get_object_alias_given_datatype, axis=1)
+     claim_df['object_desc'] = claim_df.apply(get_object_desc_given_datatype, axis=1)
+
+     claim_df['object_label'], claim_df['object_label_lan'] = zip(*claim_df['object_label'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
739
+ claim_df['object_alias'], claim_df['object_alias_lan'] = zip(*claim_df['object_alias'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
740
+ claim_df['object_desc'], claim_df['object_desc_lan'] = zip(*claim_df['object_desc'].apply(lambda x: x if isinstance(x, tuple) else (x, '')))
741
+
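The three `zip(*...)` assignments above split the (value, language) tuples returned by the label/alias/description lookups into two separate columns; the idiom in isolation looks like this (toy data, not real Wikidata output):

import pandas as pd

# Toy frame where each cell is a (label, language) tuple, as returned by the lookups above.
toy = pd.DataFrame({'object_label': [('Douglas Adams', 'en'), ('somevalue', 'none')]})
toy['object_label'], toy['object_label_lan'] = zip(
    *toy['object_label'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))
)
print(toy)  # first column keeps the labels, second column the language codes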
742
+ # Removing bad object labels
743
+ claim_df = claim_df[claim_df['object_label_lan'] != 'none'].reset_index(drop=True)
744
+ return claim_df
745
+
746
+ def html2text(html_set):
747
+ reference_html_df = html_set
748
+ _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
749
+ ft_model = fasttext.load_model('base/lid.176.ftz')
750
+ def predict_language(text, k=20):
751
+ ls, scores = ft_model.predict(text, k=k) # top 20 matching languages
752
+ ls = [l.replace('__label__','') for l in ls]
753
+ return list(zip(ls,scores))
754
+ def get_url_language(html):
755
+ try:
756
+ soup = BeautifulSoup(html, "lxml")
757
+ [s.decompose() for s in soup("script")] # remove <script> elements
758
+ if soup.body == None:
759
+ return ('no body', None)
760
+ body_text = _RE_COMBINE_WHITESPACE.sub(" ", soup.body.get_text(' ')).strip()
761
+ return predict_language(body_text, k=1)[0]
762
+ except Exception:
763
+ raise
764
+ def get_text_p_tags(soup):
765
+ p_tags = soup.find_all('p')
766
+ text = [p.getText().strip() for p in p_tags if p.getText()]
767
+ return '\n'.join(text)
768
+ def clean_text_line_by_line(text, join=True, ch_join = ' ', verb=True):
769
+ # text = soup.body.get_text()
770
+ # break into lines and remove leading and trailing space on each
771
+ lines = list(text.splitlines())
772
+ lines = (line.strip() for line in lines)
773
+ # for each line, lets correct double spaces into single space
774
+ lines = (re.sub(r' {2,}', ' ', line) for line in lines)
775
+ # for each line, lets correct punctuation spaced to the left
776
+ lines = (re.sub(r' ([.,:;!?\\-])', r'\1', line) for line in lines)
777
+ # put periods if missing
778
+ lines = [line+'.' if line and line[-1] not in string.punctuation else line for i, line in enumerate(lines)]
779
+
780
+ if verb:
781
+ for i, line in enumerate(lines):
782
+ print(i,line)
783
+ # drop blank lines
784
+ if join:
785
+ return ch_join.join([line for line in lines if line])
786
+ else:
787
+ return [line for line in lines if line]
788
+
789
+ def apply_manual_rules(text):
790
+ # RULE: A line ending with a ':' followed by whitespaces and a newline is likely a continuing line and should be joined
791
+ #text = re.sub(
792
+ # r':\s*\n',
793
+ # r': ',
794
+ # text
795
+ #)
796
+ # RULE: Remove [1] reference numbers
797
+ text = re.sub(r'\[[0-9]+\]', '', text)
798
+ return text
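For clarity, the one rule that is currently active strips bracketed citation markers such as `[1]`; in isolation:

import re
print(re.sub(r'\[[0-9]+\]', '', 'Adams wrote the novel[1] and several sequels[12].'))
# -> 'Adams wrote the novel and several sequels.'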
799
+ def retrieve_text_from_html(html, soup_parser = 'lxml', verb=True, join=True):
800
+ if not isinstance(html, str) or 'DOCTYPE html' not in html:
801
+ return 'No body'
802
+ soup = BeautifulSoup(html, soup_parser)
803
+ for script in soup(["script", "style"]):
804
+ script.decompose()
805
+ if soup.body == None:
806
+ return 'No body'
807
+ [s.unwrap() for s in soup.body.find_all('strong')]
808
+
809
+ for p in soup.body.find_all('p'):
810
+ p.string = _RE_COMBINE_WHITESPACE.sub(" ", p.get_text('')).strip()
811
+
812
+ #DECOMPOSE ALL BAD TAGS
813
+ #--------------
814
+ #for c in ['warningbox', 'metadata', 'references', 'navbox', 'toc', 'catlinks']:
815
+ # for e in soup.body.find_all(class_=c):
816
+ # print('decomposed',e)
817
+ # e.decompose()
818
+
819
+ # DECOMPOSE INVISIBLE ELEMENTS
820
+ #for e in soup.body.find_all():
821
+ # if e.hidden:
822
+ # print('decomposed',e)
823
+ # e.decompose()
824
+ # else:
825
+ # if e.attrs is not None:
826
+ # #print(e)
827
+ # #print('-')
828
+ # style = e.get('style')
829
+ # if style and 'display' in style and 'none' in style:
830
+ # print('decomposed',e)
831
+ # e.decompose()
832
+ # #print(e, style)
833
+ #--------------
834
+
835
+ #print(soup.body)
836
+
837
+ # BOILERPLATE REMOVAL OPTIONS
838
+ #1. jusText
839
+ #text = justext.justext(html, justext.get_stoplist("English"))
840
+ #text = '\n'.join([paragraph.text for paragraph in text if not paragraph.is_boilerplate])
841
+
842
+ #2. boilerpy3
843
+ #html = soup.body
844
+ #text = extractor.get_content(soup.prettify())
845
+
846
+ #3. Just extracting from 'text tags' like p
847
+ #simple rules (does not work depending on website, like on artgallery.yale, anything without clear paragraphic style)
848
+ #text = get_text_p_tags(soup)
849
+
850
+ #4. NONE
851
+ text = soup.body.get_text(' ').strip() # NOT GETTING FROM THE WHOLE SOUP, JUST BODY TO AVOID TITLES
852
+
853
+ #POST PROCESSING
854
+ text = apply_manual_rules(text)
855
+ text = clean_text_line_by_line(text, ch_join = ' ', verb=verb, join=join)
856
+
857
+ if not text:
858
+ return 'No extractable text' if join else ['No extractable text']
859
+ else:
860
+ return text
861
+ i=0
862
+ print(i)
863
+ print(reference_html_df.url.iloc[i])
864
+
865
+ reference_html_df['extracted_sentences'] = reference_html_df.html.progress_apply(retrieve_text_from_html, join=False, verb=False)
866
+
867
+ join_ch = ' '
868
+ reference_html_df['extracted_text'] = reference_html_df.extracted_sentences.apply(lambda x : join_ch.join(x))
869
+
870
+ splitter = SentenceSplitter(language='en')
871
+
872
+ seg = pysbd.Segmenter(language="en", clean=False)
873
+
874
+ nlp = spacy.load("en_core_web_lg")
875
+
876
+ text = reference_html_df.loc[0,'extracted_text']
877
+
878
+ # OPTION 1
879
+ # This gets some things wrong, such as Smt.=Shrimati ending a sentence, or any
880
+ # initials like P. N. Nampoothiri or Lt. Col.
881
+ #sents = sent_tokenize(text)
882
+
883
+ # OPTION 2
884
+ # Also breaks titles and initials like above, but additionally gets parentheses wrong, like
885
+ # Amma Maharani [break](queen mother) [break] of Travancore.
886
+ #sents = seg.segment(text)
887
+
888
+ # OPTION 3
889
+ # Same as above plus new ones, like breaking contractions (like m. for married)
890
+ #sents = splitter.split(text)
891
+
892
+ # OPTION 4
893
+ # By far the best option: it makes far fewer of the mistakes above, but not none, so let's adopt a strategy to ease this.
894
+ sents = [s for s in nlp(text).sents]
895
+
896
+
897
+ reference_html_df['nlp_sentences'] = reference_html_df.extracted_text.progress_apply(lambda x : [str(s) for s in nlp(x).sents])
898
+ reference_html_df['nlp_sentences_slide_2'] = reference_html_df['nlp_sentences'].progress_apply(
899
+ lambda x : [' '.join([a,b]) for a,b in list(zip(x,x[1:]+['']))]
900
+ )
901
+
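The `nlp_sentences_slide_2` column pairs every sentence with its successor so that later evidence retrieval can also score two-sentence windows; the lambda on a toy input behaves like this:

x = ['He was born in Cambridge.', 'He wrote a novel.', 'It sold well.']
print([' '.join([a, b]) for a, b in zip(x, x[1:] + [''])])
# -> ['He was born in Cambridge. He wrote a novel.',
#     'He wrote a novel. It sold well.',
#     'It sold well. ']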
902
+ assert type(reference_html_df.loc[0,'nlp_sentences']) == list
903
+ assert type(reference_html_df.loc[0,'nlp_sentences'][0]) == str
904
+ assert type(reference_html_df.loc[0,'nlp_sentences_slide_2']) == list
905
+ assert type(reference_html_df.loc[0,'nlp_sentences_slide_2'][0]) == str
906
+ return reference_html_df
907
+
908
+ if __name__ == '__main__':
909
+ conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
910
+ target_QID = 'Q3621696'
911
+ claimParser(target_QID) #save results in .db
912
+ filtered_df = propertyFiltering(target_QID) #update db and return dataframe after filtering
913
+ url_set = urlParser(target_QID) #from ref table in .db
914
+ html_set = htmlParser(url_set, target_QID) #Original html docs collection
915
+ try:
916
+ claim_text = claim2text(html_set) #Claims generation
917
+ html_text = html2text(html_set)
918
+ claim_text = claim_text.astype(str)
919
+ html_text = html_text.astype(str)
920
+ claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
921
+ html_text.to_sql('html_text', conn, if_exists='replace', index=False)
922
+ except Exception as e:
923
+ print(f"No accessible html documents")
924
+
925
+
926
+ conn.commit()
927
+ conn.close()
928
+ #augmented_df = textualAugmentation(filtered_df) #textual information augmentation including label, desc, and alias
929
+
app.py ADDED
@@ -0,0 +1,122 @@
1
+ import gradio as gr
2
+ import Wikidata_Text_Parser as wtr
3
+ import sqlite3
4
+ import Prove_lite as prv
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
8
+
9
+ def wtr_process(qid):
10
+ try:
11
+ conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
12
+ target_QID = qid
13
+ query = f"SELECT * FROM {'claim_text'}"
14
+ df = pd.read_sql_query(query, conn)
15
+ if target_QID in df['entity_id'].unique():
16
+ pass
17
+ else:
18
+ wtr.claimParser(target_QID) #save results in .db
19
+ filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
20
+ url_set = wtr.urlParser(target_QID) #from ref table in .db
21
+ html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
22
+ claim_text = wtr.claim2text(html_set) #Claims generation
23
+ html_text = wtr.html2text(html_set)
24
+ claim_text = claim_text.astype(str)
25
+ html_text = html_text.astype(str)
26
+ claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
27
+ html_text.to_sql('html_text', conn, if_exists='replace', index=False)
28
+ conn.commit()
29
+ query = f"""
30
+ SELECT
31
+ claim_text.entity_label,
32
+ claim_text.property_label,
33
+ claim_text.object_label,
34
+ html_text.url
35
+ FROM claim_text
36
+ INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
37
+ WHERE claim_text.entity_id = '{target_QID}'
38
+ """
39
+
40
+ result_df = pd.read_sql_query(query, conn)
41
+
42
+ conn.commit()
43
+ conn.close()
44
+
45
+ return result_df
46
+
47
+ except Exception as e:
48
+ error_df = pd.DataFrame({'Error': [str(e)]})
49
+ return error_df
50
+
51
+
52
+ def prv_process(qid):
53
+ target_QID = qid
54
+ conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
55
+ query = f"SELECT * FROM claim_text WHERE entity_id = '{target_QID}'"
56
+ claim_df = pd.read_sql_query(query, conn)
57
+
58
+ query = f"SELECT * FROM html_text Where entity_id = '{target_QID}'"
59
+ reference_text_df = pd.read_sql_query(query, conn)
60
+
61
+ verbalised_claims_df_final = prv.verbalisation(claim_df)
62
+
63
+ progress = gr.Progress(len(verbalised_claims_df_final)) # Create progress bar for Gradio
64
+ def update_progress(curr_step, total_steps):
65
+ progress((curr_step + 1) / total_steps)
66
+
67
+ splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)
68
+
69
+ BATCH_SIZE = 512
70
+ N_TOP_SENTENCES = 5
71
+ SCORE_THRESHOLD = 0
72
+ evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
73
+ result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
74
+ display_df = prv.TableMaking(verbalised_claims_df_final, result)
75
+ conn.commit()
76
+ conn.close()
77
+ return display_df
78
+
79
+
80
+
81
+ with gr.Blocks() as demo:
82
+ print("gradio started!")
83
+ gr.Markdown(
84
+ """
85
+ # Prove
86
+ This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
87
+ """
88
+ )
89
+ inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q245247)")
90
+ out = gr.Dataframe(label="Parsing result (not presenting parsed HTMLs)", headers=["entity_label", "property_label", "object_label", "url"])
91
+ run_button_1 = gr.Button("Start parsing")
92
+ run_button_1.click(wtr_process, inp, out)
93
+
94
+
95
+ gr.Markdown(
96
+ """
97
+ Pre-trained language models-based text entailment.
98
+ """
99
+ )
100
+ out_2 = gr.HTML(label="Results")
101
+ run_button_2 = gr.Button("Start processing")
102
+ run_button_2.click(prv_process, inp, out_2)
103
+
104
+
105
+ if __name__ == "__main__":
106
+ #DB initialising
107
+ if os.path.isfile('wikidata_claims_refs_parsed.db') != True:
108
+ conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
109
+ target_QID = 'Q115305900'
110
+ wtr.claimParser(target_QID) #save results in .db
111
+ filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
112
+ url_set = wtr.urlParser(target_QID) #from ref table in .db
113
+ html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
114
+ claim_text = wtr.claim2text(html_set) #Claims generation
115
+ html_text = wtr.html2text(html_set)
116
+ claim_text = claim_text.astype(str)
117
+ html_text = html_text.astype(str)
118
+ claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
119
+ html_text.to_sql('html_text', conn, if_exists='replace', index=False)
120
+ conn.commit()
121
+ conn.close()
122
+ demo.launch(share=True)
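The two button callbacks can also be driven directly from Python, which is convenient for debugging; a minimal sketch, assuming the script is importable from the working directory and the QID exists on Wikidata (the QID below is illustrative):

import app

parsed = app.wtr_process('Q42')  # parses claims, references and referenced HTML into SQLite
print(parsed.head())

# app.prv_process('Q42') runs verbalisation, evidence selection and entailment, but it
# instantiates gr.Progress internally, so it is intended to be triggered from the Gradio
# button rather than called outside a running Gradio event.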
llm_load copy.py ADDED
@@ -0,0 +1,188 @@
1
+ from huggingface_hub import login
2
+ from unsloth import FastLanguageModel
3
+ import torch
4
+ from transformers import TextStreamer
5
+
6
+ def llmLoad(max_seq_length):
7
+ with open('API_key.txt', 'r') as file:
8
+ token = file.read().strip()
9
+ login(token=token)
10
+
11
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
12
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
13
+
14
+ # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
15
+ fourbit_models = [
16
+ "unsloth/mistral-7b-bnb-4bit",
17
+ "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
18
+ "unsloth/llama-2-7b-bnb-4bit",
19
+ "unsloth/gemma-7b-bnb-4bit",
20
+ "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
21
+ "unsloth/gemma-2b-bnb-4bit",
22
+ "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
23
+ "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
24
+ ] # More models at https://huggingface.co/unsloth
25
+
26
+ model, tokenizer = FastLanguageModel.from_pretrained(
27
+ model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
28
+ max_seq_length = max_seq_length,
29
+ dtype = dtype,
30
+ load_in_4bit = load_in_4bit,
31
+ )
32
+ return tokenizer, model
33
+
34
+ def llmQuestion(tokenizer, model, instruct, question, output_size):
35
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
36
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
37
+
38
+ ### Instruction:
39
+ {}
40
+
41
+ ### Input:
42
+ {}
43
+
44
+ ### Response:
45
+ {}"""
46
+
47
+ # alpaca_prompt = Copied from above
48
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
49
+ inputs = tokenizer(
50
+ [
51
+ alpaca_prompt.format(
52
+ instruct, # instruction
53
+ question, # input
54
+ "", # output - leave this blank for generation!
55
+ )
56
+ ], return_tensors = "pt").to("cuda")
57
+
58
+
59
+ outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
60
+ output_text = tokenizer.batch_decode(outputs)[0].split('### Response:')[1]
61
+
62
+ return output_text
63
+
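`llmQuestion` simply slots the instruction and input into an Alpaca-style template and keeps whatever the model emits after the `### Response:` marker; the prompt assembly on its own (no model needed):

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

prompt = alpaca_prompt.format(
    "Find relevant sentences from text_dump given the target sentence",  # instruction
    "target sentence: '...', text_dump: [...]",                          # input (placeholders)
    "",                                                                   # left blank for generation
)
print(prompt)  # the generated answer is later recovered with .split('### Response:')[1]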
64
+ if __name__ == "__main__":
65
+ tokenizer, model = llmLoad(8192)
66
+ sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
67
+ '\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
68
+ 'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
69
+ " Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
70
+ '\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
71
+ '\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
72
+ ' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
73
+ ' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
74
+ ' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
75
+ ' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
76
+ ' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
77
+ 'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
78
+ " \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
79
+ 'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
80
+ ' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
81
+ 'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
82
+ ' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
83
+ 'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
84
+ ' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
85
+ ' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
86
+ ' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
87
+ ' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
88
+ 'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
89
+ ' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
90
+ ' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
91
+ 'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
92
+ ' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
93
+ ' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
94
+ ' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
95
+ ' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
96
+ ' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
97
+ ' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
98
+ ' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
99
+ ' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
100
+ 'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
101
+ ' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
102
+ 'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
103
+ " SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
104
+ 'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
105
+ 'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
106
+ " \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
107
+ "\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
108
+ '9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
109
+ 'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
110
+ 'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
111
+ 'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
112
+ ' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
113
+ 'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
114
+ 'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
115
+ 'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
116
+ 'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
117
+ 'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
118
+ "adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
119
+ 'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
120
+ ' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
121
+ 'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
122
+ 'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
123
+ ' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
124
+ 'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
125
+ 'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
126
+ ' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
127
+ ' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
128
+ ' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
129
+ ' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
130
+ "ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
131
+ " writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
132
+ ' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
133
+ 'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
134
+ ' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
135
+ 'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
136
+ 'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
137
+ "cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
138
+ "\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
139
+ "other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
140
+ ' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
141
+ ' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
142
+ '\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
143
+ 'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
144
+ 'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
145
+ '\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
146
+ '\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
147
+ 'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
148
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
149
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
150
+ '\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
151
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
152
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
153
+ '\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
154
+ 'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
155
+ '\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
156
+ '\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
157
+ '\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
158
+ '\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
159
+ '\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
160
+ 'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
161
+ '\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
162
+ '\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
163
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
164
+ '\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
165
+ '\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
166
+ '\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
167
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
168
+ '\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
169
+ '\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
170
+ '\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
171
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
172
+ '\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
173
+ '\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
174
+ '\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
175
+ '\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
176
+ '\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
177
+ '\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
178
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
179
+ '\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
180
+ ' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
181
+ ' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
182
+ ' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
183
+ ' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
184
+ " \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
185
+ "\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
186
+ instruct = "Find relevant sentences from text_dump with given the target sentence"
187
+ question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
188
+ answer = llmQuestion(tokenizer, model, instruct, question, 8192)
llm_load.py ADDED
@@ -0,0 +1,188 @@
1
+ from huggingface_hub import login
2
+ from unsloth import FastLanguageModel
3
+ import torch
4
+ from transformers import TextStreamer
5
+
6
+ def llmLoad(max_seq_length):
7
+ with open('API_key.txt', 'r') as file:
8
+ token = file.read().strip()
9
+ login(token=token)
10
+
11
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
12
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
13
+
14
+ # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
15
+ fourbit_models = [
16
+ "unsloth/mistral-7b-bnb-4bit",
17
+ "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
18
+ "unsloth/llama-2-7b-bnb-4bit",
19
+ "unsloth/gemma-7b-bnb-4bit",
20
+ "unsloth/gemma-7b-it-bnb-4bit", # Instruct version of Gemma 7b
21
+ "unsloth/gemma-2b-bnb-4bit",
22
+ "unsloth/gemma-2b-it-bnb-4bit", # Instruct version of Gemma 2b
23
+ "unsloth/llama-3-8b-bnb-4bit", # [NEW] 15 Trillion token Llama-3
24
+ ] # More models at https://huggingface.co/unsloth
25
+
26
+ model, tokenizer = FastLanguageModel.from_pretrained(
27
+ model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
28
+ max_seq_length = max_seq_length,
29
+ dtype = dtype,
30
+ load_in_4bit = load_in_4bit,
31
+ )
32
+ return tokenizer, model
33
+
34
+ def llmQuestion(tokenizer, model, instruct, question, output_size):
35
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
36
+ alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
37
+
38
+ ### Instruction:
39
+ {}
40
+
41
+ ### Input:
42
+ {}
43
+
44
+ ### Response:
45
+ {}"""
46
+
47
+ # alpaca_prompt = Copied from above
48
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
49
+ inputs = tokenizer(
50
+ [
51
+ alpaca_prompt.format(
52
+ instruct, # instruction
53
+ question, # input
54
+ "", # output - leave this blank for generation!
55
+ )
56
+ ], return_tensors = "pt").to("cuda")
57
+
58
+
59
+ outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
60
+ output_text = tokenizer.batch_decode(outputs)[0].split('### Response:')[1]
61
+
62
+ return output_text
63
+
64
+ if __name__ == "__main__":
65
+ tokenizer, model = llmLoad(8192)
66
+ sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
67
+ '\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
68
+ 'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
69
+ " Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
70
+ '\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
71
+ '\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
72
+ ' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
73
+ ' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
74
+ ' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
75
+ ' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
76
+ ' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
77
+ 'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
78
+ " \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
79
+ 'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
80
+ ' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
81
+ 'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
82
+ ' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
83
+ 'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
84
+ ' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
85
+ ' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
86
+ ' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
87
+ ' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
88
+ 'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
89
+ ' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
90
+ ' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
91
+ 'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
92
+ ' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
93
+ ' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
94
+ ' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
95
+ ' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
96
+ ' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
97
+ ' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
98
+ ' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
99
+ ' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
100
+ 'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
101
+ ' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
102
+ 'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
103
+ " SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
104
+ 'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
105
+ 'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
106
+ " \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
107
+ "\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
108
+ '9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
109
+ 'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
110
+ 'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
111
+ 'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
112
+ ' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
113
+ 'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
114
+ 'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
115
+ 'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
116
+ 'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
117
+ 'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
118
+ "adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
119
+ 'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
120
+ ' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
121
+ 'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
122
+ 'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
123
+ ' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
124
+ 'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
125
+ 'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
126
+ ' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
127
+ ' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
128
+ ' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
129
+ ' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
130
+ "ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
131
+ " writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
132
+ ' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
133
+ 'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
134
+ ' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
135
+ 'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
136
+ 'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
137
+ "cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
138
+ "\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
139
+ "other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
140
+ ' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
141
+ ' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
142
+ '\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
143
+ 'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
144
+ 'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
145
+ '\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
146
+ '\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
147
+ 'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
148
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
149
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
150
+ '\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
151
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
152
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
153
+ '\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
154
+ 'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
155
+ '\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
156
+ '\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
157
+ '\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
158
+ '\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
159
+ '\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
160
+ 'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
161
+ '\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
162
+ '\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
163
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
164
+ '\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
165
+ '\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
166
+ '\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
167
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
168
+ '\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
169
+ '\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
170
+ '\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
171
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
172
+ '\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
173
+ '\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
174
+ '\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
175
+ '\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
176
+ '\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
177
+ '\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
178
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
179
+ '\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
180
+ ' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
181
+ ' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
182
+ ' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
183
+ ' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
184
+ " \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
185
+ "\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
186
+ instruct = "Find relevant sentences from text_dump with given the target sentence"
187
+ question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
188
+ answer = llmQuestion(tokenizer, model, instruct, question, 8192, 8192)
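Note on the three lines above: question interpolates the entire scraped Telegraph page into the prompt, so it can easily exceed the 8192-token budget that the call passes to llmQuestion. A minimal sketch of a pre-trimming guard, assuming tokenizer is the Hugging Face tokenizer loaded earlier in llm_load.py; the helper name trim_to_token_budget is hypothetical and not part of the repository:

# Hypothetical helper (illustrative only): keep the scraped text_dump within
# the token budget that llmQuestion is called with above.
def trim_to_token_budget(tokenizer, text, max_tokens=8192):
    ids = tokenizer.encode(text, add_special_tokens=False)
    if len(ids) <= max_tokens:
        return text
    # Drop the tail first; the page chrome (nav links, footer) sits at the end
    # of the dump, while the article body comes first.
    return tokenizer.decode(ids[:max_tokens], skip_special_tokens=True)

question = f"target sentence:'Douglas Adams was born in Cambridge', text_dump:{trim_to_token_budget(tokenizer, sentences)}"

Trimming before formatting keeps the prompt inside the context window without changing how llmQuestion is invoked.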
requirements.txt ADDED
@@ -0,0 +1,118 @@
+ absl-py==2.1.0
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ annotated-types==0.6.0
+ appnope==0.1.4
+ asttokens==2.4.1
+ async-timeout==4.0.3
+ attrs==23.2.0
+ beautifulsoup4==4.12.3
+ blis==0.7.11
+ boto3==1.34.95
+ botocore==1.34.95
+ bs4==0.0.2
+ callbacks==0.3.0
+ catalogue==2.0.10
+ certifi==2024.2.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpathlib==0.16.0
+ colorama==0.4.6
+ comm==0.2.2
+ confection==0.1.4
+ cymem==2.0.8
+ debugpy==1.8.1
+ decorator==5.1.1
+ exceptiongroup==1.2.1
+ executing==2.0.1
+ fasttext==0.9.2
+ filelock==3.14.0
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ html5lib==1.1
+ huggingface-hub==0.22.2
+ idna==3.7
+ importlib_metadata==7.1.0
+ ipykernel==6.29.4
+ ipython==8.18.1
+ jedi==0.19.1
+ Jinja2==3.1.3
+ jmespath==1.0.1
+ joblib==1.4.0
+ jupyter_client==8.6.1
+ jupyter_core==5.7.2
+ langcodes==3.4.0
+ language_data==1.2.0
+ Levenshtein==0.25.1
+ lightning-utilities==0.11.2
+ lxml==5.2.1
+ marisa-trie==1.1.0
+ MarkupSafe==2.1.5
+ matplotlib-inline==0.1.7
+ mpmath==1.3.0
+ multidict==6.0.5
+ murmurhash==1.0.10
+ mypy-extensions==1.0.0
+ nest-asyncio==1.6.0
+ networkx==3.2.1
+ nltk==3.8.1
+ numpy==1.26.4
+ packaging==24.0
+ pandas==2.2.2
+ parso==0.8.4
+ pexpect==4.9.0
+ platformdirs==4.2.1
+ portalocker==2.8.2
+ preshed==3.0.9
+ prompt-toolkit==3.0.43
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pybind11==2.12.0
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ Pygments==2.17.2
+ pysbd==0.3.4
+ python-dateutil==2.9.0.post0
+ pytorch-lightning==2.2.3
+ pytz==2024.1
+ PyYAML==6.0.1
+ pyzmq==26.0.2
+ qwikidata==0.4.2
+ rapidfuzz==3.8.1
+ regex==2024.4.28
+ requests==2.31.0
+ rouge_score==0.1.2
+ s3transfer==0.10.1
+ sacrebleu==2.4.2
+ safetensors==0.4.3
+ sentence-splitter==1.4
+ six==1.16.0
+ smart-open==6.4.0
+ soupsieve==2.5
+ spacy==3.7.4
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.4.8
+ stack-data==0.6.3
+ sympy==1.12
+ tabulate==0.9.0
+ thinc==8.2.3
+ tokenizers==0.19.1
+ torch==2.2.2
+ torchmetrics==1.3.2
+ tornado==6.4
+ tqdm==4.66.2
+ traitlets==5.14.3
+ transformers==4.40.1
+ typer==0.9.4
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ urllib3==1.26.18
+ wasabi==1.1.2
+ wcwidth==0.2.13
+ weasel==0.3.4
+ webencodings==0.5.1
+ yarl==1.9.4
+ zipp==3.18.1
+ sentence_transformers
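The pins above freeze the stack the scripts were developed against (for example transformers 4.40.1, torch 2.2.2 and nltk 3.8.1, with sentence_transformers left unpinned on the last line); they are installed with pip install -r requirements.txt. A small illustrative check, not part of the repository, for confirming that a few key distributions resolved to the expected versions after installation:

# Illustrative sanity check (not part of the repo): compare installed versions
# against a few of the pins from requirements.txt.
from importlib.metadata import version

for pkg, expected in [("transformers", "4.40.1"), ("torch", "2.2.2"), ("nltk", "3.8.1")]:
    print(f"{pkg}: installed {version(pkg)}, pinned {expected}")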