File size: 4,738 Bytes
2ebb1f3
 
4b75840
 
67caa78
4b75840
 
 
a00f9ba
 
620af8b
 
59fcc9f
620af8b
 
4b75840
67caa78
4b75840
 
 
620af8b
a00f9ba
620af8b
 
a00f9ba
620af8b
 
a00f9ba
620af8b
 
4b75840
 
 
 
 
 
a00f9ba
620af8b
a00f9ba
620af8b
 
a00f9ba
 
620af8b
 
a00f9ba
620af8b
 
a00f9ba
 
 
 
4b75840
a00f9ba
4b75840
a00f9ba
620af8b
a00f9ba
620af8b
 
a00f9ba
620af8b
 
a00f9ba
620af8b
 
4b75840
a00f9ba
 
4b75840
 
a00f9ba
 
4b75840
 
 
 
 
 
 
 
a00f9ba
620af8b
a00f9ba
620af8b
 
a00f9ba
620af8b
 
a00f9ba
620af8b
 
a00f9ba
4b75840
2919f24
a00f9ba
2919f24
 
a00f9ba
 
 
4b75840
a00f9ba
4b75840
a00f9ba
620af8b
a00f9ba
620af8b
 
a00f9ba
4b75840
620af8b
a00f9ba
620af8b
4b75840
2919f24
4b75840
2919f24
 
a00f9ba
4b75840
2919f24
 
 
a00f9ba
2919f24
 
a00f9ba
4b75840
2919f24
 
4b75840
a00f9ba
4b75840
 
620af8b
a00f9ba
620af8b
 
a00f9ba
 
620af8b
 
a00f9ba
 
620af8b
4b75840
 
 
a00f9ba
 
 
aa75cb7
4b75840
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import nltk
import pytextrank
import re
from operator import itemgetter
import en_core_web_sm


class KeywordExtractor:
    """
    Keyword extraction and annotation on text data.

    Keywords come from the "textrank" component added to a spaCy English
    pipeline. The remaining methods locate those keywords in the original
    text, merge their character spans and build an annotation list.

    Attributes:
        nlp: The pipeline returned by ``en_core_web_sm.load()`` with the
            "textrank" component added.
        KEYWORD_LABEL (str): Label attached to every annotated keyword.
        KEYWORD_COLOR (str): Colour attached to every annotated keyword.
    """

    KEYWORD_LABEL = "KEY"
    KEYWORD_COLOR = "#26aaef"

    def __init__(self):
        self.nlp = en_core_web_sm.load()
        # NOTE(review): the "textrank" factory is presumably registered as a
        # side effect of the module-level `import pytextrank` -- keep that
        # import even though the name is never referenced directly.
        self.nlp.add_pipe("textrank")

    def get_keywords(self, text, max_keywords):
        """
        Extract keywords from text.

        Parameters:
            text (str): The user input string to extract keywords from
            max_keywords (int): Limit on number of keywords to return; the
                first ``max_keywords`` entries of ``doc._.phrases`` are kept
                (pytextrank documents that list as ordered by rank)

        Returns:
            kws (list): list of extracted keywords (str)
        """

        doc = self.nlp(text)

        return [phrase.text for phrase in doc._.phrases[:max_keywords]]

    def get_keyword_indices(self, kws, text):
        """
        Locate every occurrence of each keyword in text.

        Matching is literal (the keyword is regex-escaped) and case
        sensitive. Empty keywords are skipped: an empty pattern matches at
        every position and would otherwise yield zero-width spans that merge
        into one span covering the whole text.

        Parameters:
            kws (list): list of extracted keywords
            text (str): The user input string to extract keywords from

        Returns:
            keyword_indices (list): list of [start, end] pairs (end
                exclusive), grouped by keyword in the order of kws; pairs of
                different keywords may overlap
        """

        keyword_indices = []
        for keyword in kws:
            if not keyword:
                continue
            pattern = re.escape(keyword)
            keyword_indices.extend(
                [match.start(), match.end()]
                for match in re.finditer(pattern, text)
            )

        return keyword_indices

    def merge_overlapping_indices(self, keyword_indices):
        """
        Merge overlapping keyword indices.

        Two spans are combined when they overlap, touch, or are separated by
        exactly one character (so keywords split by a single space become one
        span). The input list and its inner pairs are left unmodified.

        Parameters:
            keyword_indices (list): list of [start, end] pairs for keyword
                boundaries in text

        Returns:
            merged (list): new list of [start, end] pairs sorted by start
                with overlapping pairs combined; empty if the input is empty
        """

        # Work on copies so the caller's list and its inner pairs stay intact.
        spans = sorted(list(span) for span in keyword_indices)

        merged = []
        for span in spans:
            # Spans are sorted by start, so only the end of the last merged
            # span matters; "+ 1" bridges a one-character gap.
            if merged and span[0] <= merged[-1][-1] + 1:
                merged[-1][-1] = max(merged[-1][-1], span[-1])
            else:
                merged.append(span)
        return merged

    def merge_until_finished(self, keyword_indices):
        """
        Loop until no overlapping keyword indices left.

        Each pass merges the result of the previous pass; the loop stops as
        soon as a pass no longer reduces the number of spans.

        Parameters:
            keyword_indices (list): list of [start, end] pairs for keyword
                boundaries in text

        Returns:
            merged (list): list of [start, end] pairs sorted by start with
                overlapping pairs combined; empty if the input is empty
        """

        merged = self.merge_overlapping_indices(keyword_indices)
        previous_count = None
        while previous_count != len(merged):
            previous_count = len(merged)
            merged = self.merge_overlapping_indices(merged)
        return merged

    def get_annotation(self, text, keyword_indices):
        """
        Create text annotation for extracted keywords.

        The text is cut by slicing at the given spans, so the content of the
        text itself (including anything that looks like markup) never
        influences where it is split.

        Parameters:
            text (str): The user input string the indices refer to
            keyword_indices (list): list of [start, end] pairs (end
                exclusive); expected to be non-overlapping, e.g. the output
                of merge_until_finished

        Returns:
            annotation (list): plain text pieces (str, possibly empty)
                alternating with (keyword, label, colour) tuples, in text
                order
        """

        annotation = []
        cursor = 0
        for span in sorted(keyword_indices):
            # Clamp to the cursor so an unexpected overlap never repeats text.
            start = max(span[0], cursor)
            end = span[1]
            if end <= start:
                # Zero-width span or span already covered: nothing to mark.
                continue
            annotation.append(text[cursor:start])
            annotation.append(
                (text[start:end], self.KEYWORD_LABEL, self.KEYWORD_COLOR)
            )
            cursor = end

        # Trailing plain text after the last keyword (may be empty).
        annotation.append(text[cursor:])

        return annotation

    def generate(self, text, max_keywords):
        """
        Extract keywords and create the text annotation for them.

        Parameters:
            text (str): The user input string to extract keywords from
            max_keywords (int): Limit on number of keywords to generate

        Returns:
            annotation (list): list of strings and tuples for generating
                html, or None when no keyword occurrence is found in text
            kws (list): list of extracted keywords
        """

        kws = self.get_keywords(text, max_keywords)

        indices = self.get_keyword_indices(kws, text)
        if not indices:
            return None, kws

        indices_merged = self.merge_until_finished(indices)
        return self.get_annotation(text, indices_merged), kws