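"""Gradio app for verifying the reference quality of Wikidata claims.

Given a target QID, the app parses the entity's claims and references into a local
SQLite database (wikidata_claims_refs_parsed.db), extracts text from the referenced
HTML pages, and then runs LLM-based verification of each claim against its references.
"""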
import gradio as gr
import Wikidata_Text_Parser as wtr
import sqlite3
import CodeArchive.Prove_llm as prv
import pandas as pd

def wtr_process(qid, progress=gr.Progress()):
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid

        cursor = conn.cursor()

        # Skip parsing if this QID has already been processed into the 'claims' table.
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        table_exists = cursor.fetchone()

        already_parsed = False
        if table_exists:
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
            result = cursor.fetchone()

            if result is not None and result[0] == target_QID:
                print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
                already_parsed = True

        if not already_parsed:
            progress(0.00, desc="Wikidata claims parsing...")
            wtr.claimParser(target_QID)              # save claims into the .db
            wtr.propertyFiltering(target_QID)        # update the .db with filtered properties
            progress(0.25, desc="URL and HTML parsing...")
            url_set = wtr.urlParser()                # URLs from the references table in the .db
            html_set = wtr.htmlParser(url_set, qid)  # collect the original HTML documents
            progress(0.50, desc="claim2Text...")
            claim_text = wtr.claim2text(html_set)    # claim text generation
            progress(0.75, desc="html2Text...")
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            progress(1, desc="completed...")

        # Join each claim with the URL of the reference it was extracted from.
        query = """
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = ?
        """

        result_df = pd.read_sql_query(query, conn, params=(target_QID,))

        conn.commit()
        conn.close()

        return result_df

    except Exception as e:
        error_df = pd.DataFrame({'Error': [str(e)]})
        return error_df


def prv_process(qid, progress=gr.Progress()):
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')

    # Reference HTML text for all references attached to this entity's claims.
    query = """
        SELECT html_text.*
        FROM html_text
        INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
        WHERE claim_text.entity_id = ?
    """
    reference_text_df = pd.read_sql_query(query, conn, params=(qid,))
    claim_df = pd.read_sql_query("SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(qid,))

    verbalised_claims_df_final = prv.verbalisation(claim_df)

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    result = prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)

    conn.close()
    return result



with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This tool verifies the quality of the references cited in Wikidata claims about a target entity item.
        Parsing can take 3-5 minutes, depending on the number of references.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (e.g. Q42)")
    out = gr.Dataframe(label="Parsing result (parsed HTML not shown)", headers=["entity_label", "property_label", "object_label", "url"])
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(wtr_process, inp, out)


    gr.Markdown(
        """
        LLM-based HTML parsing and claim verification.
        """
    )
    out_2 = gr.DataFrame(label="LLM-based verification result")

    run_button_2 = gr.Button("Start processing")
    run_button_2.click(prv_process, inp, out_2)

    
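# share=True serves the app on a temporary public Gradio link in addition to localhost.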
if __name__ == "__main__":
    demo.launch(share=True)