File size: 5,811 Bytes
eca4d65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import gradio as gr
import json
from nltk.tokenize import sent_tokenize
import torch
# NOTE(review): this re-binds `json` — the stdlib `import json` above is
# shadowed and dead; every json.dumps/loads below actually uses ujson.
import ujson as json
from transformers import AutoModelForCausalLM,LlamaTokenizer
from peft import PeftModel
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
import nltk
# Punkt sentence-tokenizer data, required by sent_tokenize below.
nltk.download('punkt')

# loads Guanaco 7B model - takes around 2-3 minutes - can do this separately
# Both paths are local directories relative to the working directory:
# the base LLaMA-7B weights plus the Guanaco QLoRA adapter.
model_name = "llama-7b-hf"
adapters_name = 'guanaco-7b'
# print(f"Starting to load the model {model_name} into memory")
m = AutoModelForCausalLM.from_pretrained(
    model_name,
    #load_in_4bit=True,
    torch_dtype=torch.bfloat16,   # half-precision weights; requires bf16-capable hardware
    device_map='auto'             # lets accelerate place/shard the model across available GPUs
)
# Load the Guanaco adapter on top of the base model, then fold the LoRA
# weights into the base weights so generation runs on a plain model.
m = PeftModel.from_pretrained(m, adapters_name)
m = m.merge_and_unload()
tok = LlamaTokenizer.from_pretrained(model_name)
# Force BOS to token id 1 (LLaMA convention); some llama-7b-hf exports
# ship a tokenizer config with the wrong id.
tok.bos_token_id = 1
# NOTE(review): not referenced anywhere in this file's visible code —
# presumably intended as stopping criteria for generate(); confirm before removing.
stop_token_ids = [0]
# print(f"Successfully loaded the model {model_name} into memory")
print('Guanaco model loaded into memory.')


def _strip_abstract_prefix(text):
    """Drop a leading "abstract." / "abstract" label (any case) from *text*."""
    lowered = text.lower()
    if lowered.startswith("abstract."):
        return text[9:]
    if lowered.startswith("abstract"):
        return text[8:]
    return text


def _query_guanaco(user_message):
    """Run one chat turn through the Guanaco model.

    Wraps *user_message* in the Guanaco chat template, generates up to 300
    new tokens, and returns the assistant reply truncated at its last
    complete sentence (everything up to the final period).
    """
    formatted_prompt = (
        f"A chat between a curious human and an artificial intelligence assistant."
        f"The assistant gives helpful, detailed, and polite answers to the user's questions.\n"
        f"### Human: {user_message} \n"
        f"### Assistant:"
    )
    # device_map='auto' may place the model anywhere; m.device is where the
    # input embeddings live. (The original hard-coded cuda:1 / cuda:2, which
    # breaks on single-GPU machines and was inconsistent between the two calls.)
    inputs = tok(formatted_prompt, return_tensors="pt").to(m.device)
    outputs = m.generate(inputs=inputs.input_ids, max_new_tokens=300)
    output = tok.decode(outputs[0], skip_special_tokens=True)
    marker = "### Assistant:"
    start = output.find(marker)
    # Guard the not-found case (find() == -1 would otherwise slice garbage);
    # +1 skips the space the model emits after the marker.
    start = 0 if start == -1 else start + len(marker) + 1
    end = output.rfind('.') + 1
    return output[start:end]


def _extract_keyphrases(doc, top_n=2):
    """Extract up to *top_n* keyphrases from *doc* with KeyBERT.

    Phrases that are substrings of another extracted phrase are dropped so
    the result contains no redundant nested phrases. Each kept phrase is
    printed, matching the original behavior.
    """
    kw_model = KeyBERT(model="all-MiniLM-L6-v2")
    vectorizer = KeyphraseCountVectorizer()
    keywords = kw_model.extract_keywords(
        doc, stop_words="english", top_n=top_n, vectorizer=vectorizer, use_mmr=True
    )
    # Iterate over what was actually returned (the original indexed
    # range(top_n) and crashed when KeyBERT returned fewer phrases).
    my_keywords = []
    for i, (phrase, _score) in enumerate(keywords):
        contained = any(
            phrase in other for j, (other, _s) in enumerate(keywords) if i != j
        )
        if not contained:
            my_keywords.append(phrase)
    for entry in my_keywords:
        print(entry)
    return my_keywords


def generate(title, abstract):
    """Gradio callback: summarize a paper from its title and abstract.

    Pipeline:
      1. Strip any leading "abstract" label, sentence-tokenize the abstract,
         and persist {"target": sentences, "title": title} to
         data/sample-data.jsonl (the format classifier.py consumes).
      2. Ask Guanaco for a plain-language summary of the title plus the
         first two sentences; write it to data/guanacoSummaryOutput.txt.
      3. If at least two sentences exist, extract keyphrases and ask
         Guanaco to elaborate on them; write the reply to
         data/guanacoElaborationOutput.txt.

    Returns (keyword_string, responseTwo, response) matching the three
    output textboxes: keyphrases, keyphrase elaboration, summary.
    """
    print("Started running.")
    sentences = sent_tokenize(_strip_abstract_prefix(abstract))
    newline = {"target": sentences, "title": title}
    # Side effect other tooling depends on — keep writing the file, but
    # build the model input from memory instead of re-reading it from disk.
    with open("data/sample-data.jsonl", "w") as first_file:
        first_file.write(json.dumps(newline))
    print(newline)
    print("Tokenized abstract to sentences.")

    # The model "document" is the title plus up to the first two abstract
    # sentences; with fewer than two there is not enough text for
    # meaningful keyphrase extraction.
    tooShortForKeyword = len(sentences) < 2
    if len(sentences) > 1:
        doc = title + ". " + sentences[0] + " " + sentences[1]
    elif len(sentences) == 1:
        doc = title + ". " + sentences[0]
    else:
        doc = title

    # --- Plain-language summary ---
    prompt = """
    Can you explain the main idea of what is being studied in the following paragraph for someone who is not familiar with the topic. Comment on areas of application.:
    """
    response = _query_guanaco(prompt + doc)
    # Drop the model's "Certainly!" filler opener when present.
    if response[:10] == "Certainly!":
        response = response[10:]
    with open("data/guanacoSummaryOutput.txt", "w") as f2:
        f2.write(response)
    print('Plain Language Summary Created.')

    # --- Keyphrase extraction + elaboration ---
    responseTwo = ""
    keyword_string = ""
    if not tooShortForKeyword:
        my_keywords = _extract_keyphrases(doc)
        keyword_string = ', '.join(my_keywords)
        prompt = "What is the purpose of studying " + keyword_string + "? Comment on areas of application."
        responseTwo = _query_guanaco(prompt)
        with open("data/guanacoElaborationOutput.txt", "w") as f2:
            f2.write(responseTwo)
    print('Keyphrase elaboration ran.')
    return keyword_string, responseTwo, response

# Build the gradio UI: two text inputs (title, abstract) mapped onto
# generate()'s three outputs. Bind the Interface itself to `demo` — the
# original bound the return value of .launch(), so `demo` never held the
# Interface — then launch with a public share link. launch() blocks until
# the server stops, after which the final print runs.
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Textbox(label="Title"), gr.Textbox(label="Abstract")],
    outputs=[gr.Textbox(label="Keyphrases"), gr.Textbox(label="Keyphrase Elaboration"), gr.Textbox(label="Plain Language Summary")],
)
demo.launch(share=True)

print('after launch')  # now executes