Update README.md
Browse files
README.md
CHANGED
@@ -28,30 +28,29 @@ This work was funded by the Spanish Government, the Spanish Ministry of Economy
|
|
28 |
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
29 |
import torch
|
30 |
|
31 |
-
def get_result_text_es_pt (list_entity, text, lang):
|
32 |
result_words = []
|
33 |
-
|
34 |
-
punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
|
35 |
-
else:
|
36 |
-
punc_tags = ['?', '!', ',', '.', ':']
|
37 |
|
38 |
for entity in list_entity:
|
39 |
-
tag = entity["entity"]
|
40 |
-
word = entity["word"]
|
41 |
start = entity["start"]
|
42 |
end = entity["end"]
|
|
|
|
|
43 |
|
44 |
# check punctuation
|
45 |
punc_in = next((p for p in punc_tags if p in tag), "")
|
46 |
|
47 |
subword = False
|
48 |
# check subwords
|
49 |
-
if word[0]
|
50 |
subword = True
|
51 |
if punc_in != "":
|
52 |
word = result_words[-1].replace(punc_in, "") + text[start:end]
|
53 |
else:
|
54 |
word = result_words[-1] + text[start:end]
|
|
|
|
|
55 |
|
56 |
if tag == "l":
|
57 |
word = word
|
@@ -71,18 +70,20 @@ def get_result_text_es_pt (list_entity, text, lang):
|
|
71 |
|
72 |
return " ".join(result_words)
|
73 |
|
74 |
-
|
75 |
-
|
|
|
76 |
|
77 |
model = AutoModelForTokenClassification.from_pretrained(model_path)
|
78 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
79 |
|
80 |
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
|
81 |
-
|
|
|
82 |
result = pipe(text)
|
83 |
|
84 |
print("Source text: "+ text)
|
85 |
-
result_text = get_result_text_es_pt(result, text, lang)
|
86 |
print("Restored text: " +result_text)
|
87 |
```
|
88 |
|
|
|
28 |
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
|
29 |
import torch
|
30 |
|
31 |
+
def get_result_text_ca (list_entity, text):
|
32 |
result_words = []
|
33 |
+
punc_tags = ['?', '!', ',', '.', ':']
|
|
|
|
|
|
|
34 |
|
35 |
for entity in list_entity:
|
|
|
|
|
36 |
start = entity["start"]
|
37 |
end = entity["end"]
|
38 |
+
tag = entity["entity"]
|
39 |
+
word = entity["word"]
|
40 |
|
41 |
# check punctuation
|
42 |
punc_in = next((p for p in punc_tags if p in tag), "")
|
43 |
|
44 |
subword = False
|
45 |
# check subwords
|
46 |
+
if word[0] != "Ġ":
|
47 |
subword = True
|
48 |
if punc_in != "":
|
49 |
word = result_words[-1].replace(punc_in, "") + text[start:end]
|
50 |
else:
|
51 |
word = result_words[-1] + text[start:end]
|
52 |
+
else:
|
53 |
+
word = text[start:end]
|
54 |
|
55 |
if tag == "l":
|
56 |
word = word
|
|
|
70 |
|
71 |
return " ".join(result_words)
|
72 |
|
73 |
+
|
74 |
+
lang = "ca"
|
75 |
+
model_path = "VOCALINLP/catalan_capitalization_punctuation_restoration_sanivert"
|
76 |
|
77 |
model = AutoModelForTokenClassification.from_pretrained(model_path)
|
78 |
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
79 |
|
80 |
pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
|
81 |
+
|
82 |
+
text = "el pacient presenta els símptomes següents febre dispnea nàusees i vòmits"
|
83 |
result = pipe(text)
|
84 |
|
85 |
print("Source text: "+ text)
|
86 |
+
result_text = get_result_text_ca(result, text)
|
87 |
print("Restored text: " +result_text)
|
88 |
```
|
89 |
|