minskiter committed
Commit 9f86c43
Parent: 66fa922

feat(models): update models and deploy app.py
.gitignore ADDED
@@ -0,0 +1 @@
+ **/__pycache__
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
- title: NER
+ title: Resume Basic
 emoji: 📉
 colorFrom: indigo
 colorTo: red
@@ -7,7 +7,7 @@ sdk: gradio
 sdk_version: 3.36.1
 app_file: app.py
 pinned: false
- license: mit
+ license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,22 @@
+ from transformers import BertTokenizer, AutoModel
+ from transformers.pipelines import pipeline
+ from register import register
+ import gradio as gr
+ from huggingface_hub import login
+ import os
+ register()
+ login(os.environ["HF_Token"])
+ tokenizer = BertTokenizer.from_pretrained("minskiter/resume_token_classification", use_auth_token=True)
+ model = AutoModel.from_pretrained("minskiter/resume_token_classification", use_auth_token=True)
+ ner_predictor = pipeline(
+     "ner_predictor",
+     model=model,
+     tokenizer=tokenizer,
+     device="cpu"
+ )
+
+ def ner_predictor_gradio(input):
+     return ner_predictor(input)
+
+ demo = gr.Interface(fn=ner_predictor_gradio, inputs="text", outputs="text")
+ demo.launch()
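For reference, a minimal sketch of calling the pipeline directly, outside Gradio. The sample sentence and entity labels are illustrative only; real labels depend on the model's `id2tag` mapping.

```python
# Hypothetical invocation; assumes the objects defined in app.py above.
result = ner_predictor("张三毕业于北京大学")
# postprocess() returns one list of slices per input, each slice a dict, e.g.:
# [[{"text": "张三", "start": 0, "end": 1, "label": "NAME"},
#   {"text": "北京大学", "start": 5, "end": 8, "label": "ORG"}]]
print(result)
```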
models/bert/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model_bert import BertCrfModel, BertCrfConfig
models/bert/configuration_bert.py ADDED
@@ -0,0 +1,51 @@
+ from transformers import PretrainedConfig
+
+ class BertCrfConfig(PretrainedConfig):
+
+     model_type = "bert_crf"
+
+     def __init__(
+         self,
+         vocab_size=30522,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         hidden_act="gelu",
+         hidden_dropout_prob=0.1,
+         attention_probs_dropout_prob=0.1,
+         max_position_embeddings=512,
+         type_vocab_size=2,
+         initializer_range=0.02,
+         layer_norm_eps=1e-12,
+         pad_token_id=0,
+         position_embedding_type="absolute",
+         use_cache=True,
+         classifier_dropout=None,
+         lstm_hidden_state=300,
+         num_tags=2,
+         tag2id={"O": 0, "I": 1},
+         id2tag={"0": "O", "1": "I"},
+         **kwargs
+     ):
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.intermediate_size = intermediate_size
+         self.hidden_act = hidden_act
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.max_position_embeddings = max_position_embeddings
+         self.type_vocab_size = type_vocab_size
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.position_embedding_type = position_embedding_type
+         self.use_cache = use_cache
+         self.classifier_dropout = classifier_dropout
+         self.lstm_hidden_state = lstm_hidden_state
+         self.num_tags = num_tags
+         self.tag2id = tag2id
+         self.id2tag = id2tag
+
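As a quick illustration of the config above, a minimal sketch of the standard `PretrainedConfig` save/load round trip. The BIO tag set and the `./tmp_bert_crf` scratch directory are made up for the example.

```python
from models.bert import BertCrfConfig

# illustrative tag set; any consistent tag2id/id2tag pair works
config = BertCrfConfig(
    num_tags=3,
    tag2id={"O": 0, "B-NAME": 1, "I-NAME": 2},
    id2tag={"0": "O", "1": "B-NAME", "2": "I-NAME"},
)
config.save_pretrained("./tmp_bert_crf")  # writes config.json with model_type "bert_crf"
reloaded = BertCrfConfig.from_pretrained("./tmp_bert_crf")
assert reloaded.tag2id == {"O": 0, "B-NAME": 1, "I-NAME": 2}
```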
models/bert/model_bert.py ADDED
@@ -0,0 +1,41 @@
+ from transformers import PreTrainedModel, BertModel
+ from torch import nn
+ from transformers.configuration_utils import PretrainedConfig
+ from ..crf import CRF
+ from .configuration_bert import BertCrfConfig
+
+ class BertCrfModel(PreTrainedModel):
+     """BERT + BiLSTM + CRF token classifier.
+
+     Args:
+         config (BertCrfConfig): model configuration
+
+     Returns:
+         loss: (torch.Tensor) batch loss, or None when no labels are given
+         (best_path, labels): CRF best paths (with [CLS]/[SEP] positions stripped) and the true labels
+     """
+     config_class = BertCrfConfig
+
+     def __init__(self, config, num_tags=None):
+         super().__init__(config)
+         if num_tags is not None:
+             config.num_tags = num_tags
+         self.bert = BertModel(config=config, add_pooling_layer=False)
+         self.lstm = nn.LSTM(config.hidden_size, config.lstm_hidden_state, 1, batch_first=True, bidirectional=True)
+         self.crf = CRF(config.num_tags)
+         self.fc = nn.Linear(config.lstm_hidden_state * 2, config.num_tags)
+
+     def forward(self, input_ids, attention_mask, token_type_ids, input_mask, labels=None):
+         outputs = self.bert(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             token_type_ids=token_type_ids
+         )
+         hidden_states = outputs[0]
+         lstm_hidden_states = self.lstm(hidden_states)[0]
+         emission_scores = self.fc(lstm_hidden_states)
+         loss = None
+         if labels is not None:
+             loss = self.crf.loss(emission_scores, labels, input_mask == 0)  # input_mask == 0 marks real tokens
+         _, best_path = self.crf(emission_scores, input_mask == 0)
+         return loss, (list(i[1:-1] for i in best_path), labels.cpu() if labels is not None else None)
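A minimal smoke test for the model, assuming the package layout above is importable. The tiny hyperparameters are arbitrary, chosen only so the test runs quickly on CPU; they are not the Space's real configuration.

```python
import torch
from models.bert import BertCrfModel, BertCrfConfig

# tiny, made-up configuration for a fast CPU smoke test
config = BertCrfConfig(
    vocab_size=100, hidden_size=32, num_hidden_layers=1,
    num_attention_heads=2, intermediate_size=64,
    max_position_embeddings=64, lstm_hidden_state=16, num_tags=2,
)
model = BertCrfModel(config)

B, L = 2, 8
input_ids = torch.randint(1, 100, (B, L))            # avoid pad id 0
attention_mask = torch.ones(B, L, dtype=torch.long)
token_type_ids = torch.zeros(B, L, dtype=torch.long)
input_mask = torch.zeros(B, L, dtype=torch.long)     # 0 = real token, 1 = padding
labels = torch.randint(0, 2, (B, L))

loss, (best_path, _) = model(input_ids, attention_mask, token_type_ids, input_mask, labels)
print(loss.item(), len(best_path))  # scalar loss, one decoded path per sequence
```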
models/crf/__init__.py ADDED
@@ -0,0 +1 @@
+ from .model_crf import CRF
models/crf/model_crf.py ADDED
@@ -0,0 +1,166 @@
+ import torch
+ import torch.nn as nn
+
+
+ def log_sum_exp(x):
+     """calculate log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x))))
+     """
+     max_score = x.max(-1)[0]
+     return max_score + (x - max_score.unsqueeze(-1)).exp().sum(-1).log()
+
+
+ IMPOSSIBLE = -1e4
+
+
+ class CRF(nn.Module):
+     """General CRF module.
+     Expects emission scores that are already projected into tag space.
+     :param num_tags: number of tags. DO NOT include START and STOP tags; they are added internally.
+     """
+
+     def __init__(self, num_tags):
+         super(CRF, self).__init__()
+
+         self.num_tags = num_tags + 2
+         self.start_idx = self.num_tags - 2
+         self.stop_idx = self.num_tags - 1
+
+         # transition factor: T[i, j] means the score of transitioning from tag j to tag i
+         self.transitions = nn.Parameter(torch.randn(self.num_tags, self.num_tags), requires_grad=True)
+         self.transitions.data[self.start_idx, :] = IMPOSSIBLE
+         self.transitions.data[:, self.stop_idx] = IMPOSSIBLE
+
+     def __get_emission_score(self, features):
+         # append IMPOSSIBLE emission columns for the internal START/STOP tags
+         b, seq, _ = features.size()
+         start_score = torch.full((b, seq, 1), IMPOSSIBLE).to(features.device)
+         end_score = torch.full((b, seq, 1), IMPOSSIBLE).to(features.device)
+         return torch.cat([features, start_score, end_score], dim=-1)
+
+     def forward(self, features, masks):
+         """decode tags
+         :param features: [B, L, C], batch of unary scores
+         :param masks: [B, L] masks
+         :return: (best_score, best_paths)
+             best_score: [B]
+             best_paths: [B, L]
+         """
+         features = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]
+         return self.__viterbi_decode(features, masks[:, :features.size(1)].float())
+
+     def loss(self, features, ys, masks):
+         """negative log likelihood loss
+         B: batch size, L: sequence length, D: dimension
+         :param features: [B, L, D]
+         :param ys: tags, [B, L]
+         :param masks: masks for padding, [B, L]
+         :return: loss
+         """
+         features = self.__get_emission_score(features)  # [B, L, C] => [B, L, T]
+
+         L = features.size(1)
+         masks_ = masks[:, :L].float()
+         forward_score = self.__forward_algorithm(features, masks_)
+         ys = ys.clone().detach()
+         ys[ys < 0] = 0
+         gold_score = self.__score_sentence(features, ys[:, :L].long(), masks_)
+         loss = (forward_score - gold_score).mean()
+         return loss
+
+     def __score_sentence(self, features, tags, masks):
+         """Gives the score of a provided tag sequence
+         :param features: [B, L, C]
+         :param tags: [B, L]
+         :param masks: [B, L]
+         :return: [B] score in the log space
+         """
+         B, L, C = features.shape
+
+         # emission score
+         emit_scores = features.gather(dim=2, index=tags.unsqueeze(-1)).squeeze(-1)
+
+         # transition score
+         start_tag = torch.full((B, 1), self.start_idx, dtype=torch.long, device=tags.device)
+         tags = torch.cat([start_tag, tags], dim=1)  # [B, L+1]
+         trans_scores = self.transitions[tags[:, 1:], tags[:, :-1]]
+
+         # last transition score to STOP tag
+         last_tag = tags.gather(dim=1, index=masks.sum(1).long().unsqueeze(1)).squeeze(1)  # [B]
+         last_score = self.transitions[self.stop_idx, last_tag]
+
+         score = ((trans_scores + emit_scores) * masks).sum(1) + last_score
+         return score
+
+     def __viterbi_decode(self, features, masks):
+         """decode to tags using viterbi algorithm
+         :param features: [B, L, C], batch of unary scores
+         :param masks: [B, L] masks
+         :return: (best_score, best_paths)
+             best_score: [B]
+             best_paths: [B, L]
+         """
+         B, L, C = features.shape
+
+         bps = torch.zeros(B, L, C, dtype=torch.long, device=features.device)  # back pointers
+
+         # Initialize the viterbi variables in log space
+         max_score = torch.full((B, C), IMPOSSIBLE, device=features.device)  # [B, C]
+         max_score[:, self.start_idx] = 0
+
+         for t in range(L):
+             mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
+             emit_score_t = features[:, t]  # [B, C]
+
+             # [B, 1, C] + [C, C]
+             acc_score_t = max_score.unsqueeze(1) + self.transitions  # [B, C, C]
+             acc_score_t, bps[:, t, :] = acc_score_t.max(dim=-1)
+             acc_score_t += emit_score_t
+             max_score = acc_score_t * mask_t + max_score * (1 - mask_t)  # max_score or acc_score_t
+
+         # Transition to STOP_TAG
+         max_score += self.transitions[self.stop_idx]
+         best_score, best_tag = max_score.max(dim=-1)
+
+         # Follow the back pointers to decode the best path.
+         best_paths = []
+         bps = bps.cpu().numpy()
+         for b in range(B):
+             best_tag_b = best_tag[b].item()
+             seq_len = int(masks[b, :].sum().item())
+
+             best_path = [best_tag_b]
+             for bps_t in reversed(bps[b, :seq_len]):
+                 best_tag_b = bps_t[best_tag_b]
+                 best_path.append(best_tag_b)
+             # drop the last tag (START) and reverse the rest
+             best_paths.append(best_path[-2::-1])
+
+         return best_score, best_paths
+
+     def __forward_algorithm(self, features, masks):
+         """calculate the partition function with forward algorithm.
+         TRICK: log_sum_exp([x1, x2, x3, x4, ...]) = log_sum_exp([log_sum_exp([x1, x2]), log_sum_exp([x3, x4]), ...])
+         :param features: features. [B, L, C]
+         :param masks: [B, L] masks
+         :return: [B], score in the log space
+         """
+         B, L, C = features.shape
+
+         scores = torch.full((B, C), IMPOSSIBLE, device=features.device)  # [B, C]
+         scores[:, self.start_idx] = 0.
+         trans = self.transitions.unsqueeze(0)  # [1, C, C]
+
+         # Iterate through the sentence
+         for t in range(L):
+             emit_score_t = features[:, t].unsqueeze(2)  # [B, C, 1]
+             score_t = scores.unsqueeze(1) + trans + emit_score_t  # [B, 1, C] + [1, C, C] + [B, C, 1] => [B, C, C]
+             score_t = log_sum_exp(score_t)  # [B, C]
+
+             mask_t = masks[:, t].unsqueeze(1)  # [B, 1]
+             scores = score_t * mask_t + scores * (1 - mask_t)
+         scores = log_sum_exp(scores + self.transitions[self.stop_idx])
+         return scores
+
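Two quick sanity checks for the module above, assuming the package layout is importable: `log_sum_exp` should agree with `torch.logsumexp`, and a toy batch should decode to one tag path per sequence.

```python
import torch
from models.crf import CRF
from models.crf.model_crf import log_sum_exp

# the hand-rolled log-sum-exp should match torch's built-in up to float error
x = torch.randn(4, 5)
assert torch.allclose(log_sum_exp(x), torch.logsumexp(x, dim=-1), atol=1e-5)

# toy decode: 3 real tags, random emissions [B, L, C], mask True on real tokens
crf = CRF(num_tags=3)
emissions = torch.randn(2, 6, 3)
masks = torch.ones(2, 6, dtype=torch.bool)
best_score, best_paths = crf(emissions, masks)
print(best_score.shape, best_paths)  # [2]; two lists of 6 tag ids in [0, 3)
```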
pipelines/__init__.py ADDED
@@ -0,0 +1 @@
+ from .ner_pipeline import NERPredictorPipe
pipelines/ner_pipeline.py ADDED
@@ -0,0 +1,114 @@
+ from transformers import Pipeline
+ from typing import Dict, Any, Union
+ from transformers.pipelines.base import GenericTensor
+ from transformers.modeling_outputs import ModelOutput
+ import torch
+
+ class NERPredictorPipe(Pipeline):
+
+     def _sanitize_parameters(self, **kwargs):
+         return {}, {}, {}
+
+     def __token_preprocess(self, input, tokenizer, max_length=512):
+         tokenized = tokenizer(input,
+             padding="max_length",
+             max_length=max_length,
+             truncation=True,
+             return_tensors="pt"
+         )
+         return tokenized
+
+     def preprocess(self, sentence: Union[str, list], max_length=512) -> Dict[str, GenericTensor]:
+         input_tensors = self.__token_preprocess(
+             sentence,
+             self.tokenizer,
+             max_length=max_length
+         )
+         input_tensors["input_mask"] = (~(input_tensors["input_ids"] > 0)).long()  # 1 on padding, 0 on real tokens
+         for key in input_tensors:
+             if input_tensors[key] is not None:
+                 input_tensors[key] = input_tensors[key].to(self.device)
+         return input_tensors
+
+     def _forward(self, input_tensors: Dict[str, GenericTensor]) -> ModelOutput:
+         self.model.eval()
+         with torch.no_grad():
+             _, (best_path, _) = self.model(**input_tensors)
+         return (input_tensors["input_ids"].tolist(), best_path)
+
+     def __format_output(self, start, end, text, label):
+         return {
+             "text": text,
+             "start": start,
+             "end": end,
+             "label": label
+         }
+
+     def postprocess(self, model_outputs: ModelOutput) -> Any:
+         batch_slices = []
+         input_ids_list = model_outputs[0]
+         label_ids_list = model_outputs[1]
+         for input_ids, label_ids in zip(input_ids_list, label_ids_list):
+             slices = []
+             labels = list(self.model.config.id2tag[str(id)] for id in label_ids)
+             # collect entity slices from the BIO/BIOES label sequence
+             past = "O"
+             start = -1
+             end = -1
+             for i, label in enumerate(labels):
+                 if label.startswith("B-"):
+                     if start != -1 and end != -1:
+                         slices.append(
+                             self.__format_output(
+                                 start, end,
+                                 ''.join(self.tokenizer.convert_ids_to_tokens(
+                                     input_ids[start + 1:end + 2])), past
+                             )
+                         )
+                     start = i
+                     end = i
+                     past = "-".join(label.split("-")[1:])
+                 elif label.startswith("I-") or label.startswith("M-") or label.startswith("E-"):
+                     cur = "-".join(label.split("-")[1:])
+                     if cur != past:
+                         # cut and skip to the next entity
+                         if start != -1 and end != -1:
+                             slices.append(
+                                 self.__format_output(
+                                     start, end,
+                                     ''.join(self.tokenizer.convert_ids_to_tokens(
+                                         input_ids[start + 1:end + 2])), past
+                                 )
+                             )
+                         start = i
+                         past = cur
+                     end = i
+                 elif label.startswith("S-"):
+                     if start != -1 and end != -1:
+                         slices.append(
+                             self.__format_output(
+                                 start, end,
+                                 ''.join(self.tokenizer.convert_ids_to_tokens(
+                                     input_ids[start + 1:end + 2])), past
+                             )
+                         )
+                     slices.append(
+                         self.__format_output(
+                             i, i,
+                             ''.join(self.tokenizer.convert_ids_to_tokens(
+                                 input_ids[i + 1:i + 2])), "-".join(label.split("-")[1:])
+                         )
+                     )
+                     start = -1
+                     end = -1
+                     past = "O"
+             if start != -1 and end != -1:
+                 slices.append(
+                     self.__format_output(
+                         start, end,
+                         ''.join(self.tokenizer.convert_ids_to_tokens(
+                             input_ids[start + 1:end + 2])), past
+                     )
+                 )
+             batch_slices.append(slices)
+         return batch_slices
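The slicing loop in `postprocess` is easiest to see in isolation. Below is a standalone re-implementation of the same BIOES-to-slices logic on plain strings; the function name and sample data are made up for illustration, and it omits the `[CLS]` index shift the pipeline applies to `input_ids`.

```python
def bio_to_slices(tokens, labels):
    """Collapse a BIOES label sequence into entity slices (illustrative sketch)."""
    slices, past, start, end = [], "O", -1, -1
    for i, label in enumerate(labels):
        if label.startswith("B-"):                    # new entity begins; flush any pending one
            if start != -1:
                slices.append({"text": "".join(tokens[start:end + 1]),
                               "start": start, "end": end, "label": past})
            start = end = i
            past = label[2:]
        elif label[:2] in ("I-", "M-", "E-"):         # continuation of an entity
            if label[2:] != past:                     # type changed: cut and start a new span
                if start != -1:
                    slices.append({"text": "".join(tokens[start:end + 1]),
                                   "start": start, "end": end, "label": past})
                start = i
                past = label[2:]
            end = i
        elif label.startswith("S-"):                  # single-token entity
            if start != -1:
                slices.append({"text": "".join(tokens[start:end + 1]),
                               "start": start, "end": end, "label": past})
            slices.append({"text": tokens[i], "start": i, "end": i, "label": label[2:]})
            start, end, past = -1, -1, "O"
    if start != -1:                                   # flush the trailing entity, if any
        slices.append({"text": "".join(tokens[start:end + 1]),
                       "start": start, "end": end, "label": past})
    return slices

print(bio_to_slices(list("张三在北大"), ["B-NAME", "I-NAME", "O", "B-ORG", "I-ORG"]))
# [{'text': '张三', 'start': 0, 'end': 1, 'label': 'NAME'},
#  {'text': '北大', 'start': 3, 'end': 4, 'label': 'ORG'}]
```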
register.py ADDED
@@ -0,0 +1,8 @@
+ from transformers.pipelines import PIPELINE_REGISTRY, AutoModel, AutoConfig
+ from models.bert import BertCrfModel, BertCrfConfig
+ from pipelines import NERPredictorPipe
+
+ def register():
+     PIPELINE_REGISTRY.register_pipeline("ner_predictor", pipeline_class=NERPredictorPipe)
+     AutoConfig.register("bert_crf", BertCrfConfig)
+     AutoModel.register(BertCrfConfig, BertCrfModel)
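A minimal sketch of what this registration enables, assuming the Space's working directory is on `sys.path`: once `register()` has run, a config saved with `model_type: "bert_crf"` resolves through `AutoConfig`, and the `"ner_predictor"` task name becomes valid for `pipeline(...)` as used in app.py. The `./tmp_bert_crf` scratch directory is made up for the example.

```python
from register import register
from transformers import AutoConfig
from models.bert import BertCrfConfig

register()

# a config saved with model_type "bert_crf" now round-trips through AutoConfig
BertCrfConfig(num_tags=2).save_pretrained("./tmp_bert_crf")
config = AutoConfig.from_pretrained("./tmp_bert_crf")
assert isinstance(config, BertCrfConfig)
```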