import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import Prove_lite as prv
import pandas as pd
import numpy as np
import os
import spaces
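
# Gradio front-end for the Prove tool: Wikidata_Text_Parser collects a target
# item's claims and the web pages cited as their references, and Prove_lite
# checks whether those pages entail the claims. Intermediate results are cached
# in a local SQLite database (wikidata_claims_refs_parsed.db).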

def wtr_process(qid):
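    """Parse the claims and references of `qid` and return a summary dataframe.

    If the QID is already present in the claim_text table, the cached rows are
    reused; otherwise the full Wikidata_Text_Parser pipeline is run and its
    output is written to the claim_text and html_text tables. Returns rows of
    (entity_label, property_label, object_label, url), or a one-column error
    dataframe if anything fails.
    """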
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid
        query = "SELECT * FROM claim_text"
        df = pd.read_sql_query(query, conn)
        if target_QID not in df['entity_id'].unique():
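            # Cache miss: run the full parsing pipeline and store the results in SQLite.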
            wtr.claimParser(target_QID) #save results in .db
            filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
            url_set = wtr.urlParser(target_QID) #from ref table in .db
            html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
            claim_text = wtr.claim2text(html_set) #Claims generation
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            conn.commit()
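
        # Join each claim with the URL of the reference document it was extracted from.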
        query = """
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = ?
        """

        result_df = pd.read_sql_query(query, conn, params=(target_QID,))

        conn.commit()
        conn.close()

        return result_df
    
    except Exception as e:
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df
        

@spaces.GPU(duration=120)
def prv_process(qid, progress=gr.Progress()):
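    """Verify the cached claims of `qid` against their reference texts.

    Verbalises the claims, splits the reference pages into sentences, selects
    the most relevant sentences as evidence, runs text entailment, and returns
    an HTML table of the results. The @spaces.GPU decorator requests a GPU
    worker (up to 120 s per call) on Hugging Face Spaces; `progress` is
    injected by Gradio to drive the progress bar.
    """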
    target_QID = qid
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    query = "SELECT * FROM claim_text WHERE entity_id = ?"
    claim_df = pd.read_sql_query(query, conn, params=(target_QID,))

    query = "SELECT * FROM html_text WHERE entity_id = ?"
    reference_text_df = pd.read_sql_query(query, conn, params=(target_QID,))
    
    verbalised_claims_df_final = prv.verbalisation(claim_df)

    # `progress` is injected by Gradio through the function signature; report the fraction done.
    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

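    # Hyper-parameters for Prove_lite's evidence selection and entailment steps
    # (their exact semantics are defined inside prv): the batch size used when
    # scoring candidate sentences, the number of top-ranked evidence sentences
    # kept per claim, and the minimum entailment score (0 keeps all results).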
    BATCH_SIZE = 512
    N_TOP_SENTENCES = 5
    SCORE_THRESHOLD = 0
    evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
    result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
    display_df = prv.TableMaking(verbalised_claims_df_final, result)
    conn.commit()
    conn.close()
    return display_df



with gr.Blocks() as demo:
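    # Two-step UI sharing one QID textbox: "Start parsing" fills the dataframe with
    # parsed claims/references, "Start processing" renders the entailment results as HTML.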
    print("gradio started!")
    gr.Markdown(
        """
        # Prove
        A tool for verifying the quality of the references attached to the claims of a target Wikidata entity item.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q245247)")
    out = gr.Dataframe(label="Parsing result (parsed HTML pages are not shown)", headers=["entity_label", "property_label", "object_label", "url"])
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)


    gr.Markdown(
        """
        Text entailment based on pre-trained language models.
        """
    )
    out_2 = gr.HTML(label="Results")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)

    
if __name__ == "__main__":
    # DB initialisation
    if not os.path.isfile('wikidata_claims_refs_parsed.db'):
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
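        # First launch: run the parsing pipeline once for an initial QID so that the
        # claim_text and html_text tables exist before the Gradio callbacks query them.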
        target_QID = 'Q115305900'
        wtr.claimParser(target_QID) #save results in .db
        filtered_df = wtr.propertyFiltering(target_QID) #update db and return dataframe after filtering
        url_set = wtr.urlParser(target_QID) #from ref table in .db
        html_set = wtr.htmlParser(url_set, target_QID) #Original html docs collection
        claim_text = wtr.claim2text(html_set) #Claims generation
        html_text = wtr.html2text(html_set)
        claim_text = claim_text.astype(str)
        html_text = html_text.astype(str)
        claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
        html_text.to_sql('html_text', conn, if_exists='replace', index=False)
        conn.commit()
        conn.close()
    demo.launch(share=True)