jcg00v committed on
Commit
2be935d
1 Parent(s): 8ced499

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +79 -1
README.md CHANGED
@@ -8,4 +8,82 @@ widget:
8
  example_title: Example 3
9
  language:
10
  - pt
11
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  example_title: Example 3
9
  language:
10
  - pt
11
+ ---
12
+ # Portuguese punctuation and capitalisation restoration model
13
+ ## Details of the model
14
+ This is a reduced version of the Portuguese capitalisation and punctuation restoration model developed by [VÓCALI](https://www.vocali.net) as part of the SANIVERT project.
15
+
16
+ You can try the model in the following [SPACE](https://huggingface.co/spaces/VOCALINLP/punctuation_and_capitalization_restoration_sanivert)
17
+ ## Details of the dataset
18
+
19
+
20
+ ## Evaluation Metrics
21
+
22
+ ## Funding
23
+ This work was funded by the Spanish Government, the Spanish Ministry of Economy and Digital Transformation through the "Recovery, Transformation and Resilience Plan" and also funded by the European Union NextGenerationEU/PRTR through the research project 2021/C005/0015007.
24
+
25
+ ## How to use the model
26
+
27
+ ```py
28
+ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
29
+ import torch
30
+
31
def get_result_text_es_pt(list_entity, text, lang):
    """Rebuild *text* with the capitalisation and punctuation predicted per token.

    Args:
        list_entity: Iterable of dicts, one per token, each providing the keys
            "entity" (predicted tag), "word" (token string) and "start"/"end"
            (character offsets of the token inside *text*).
        text: The string the offsets in *list_entity* refer to.
        lang: Language code. "es" additionally recognises the opening marks
            "¿" and "¡"; any other value recognises only "?", "!", ",", "."
            and ":".

    Returns:
        The restored words joined by single spaces ("" for no entities).

    Tag handling: a tag ending in "u" capitalises the word, a tag ending in
    "l" leaves its casing alone, and a recognised punctuation character found
    anywhere in such a tag is attached to the word (before it for "¿"/"¡",
    after it otherwise). Tags that end in neither letter leave the word
    untouched.
    """
    opening_marks = ("¿", "¡")
    if lang == "es":
        punc_tags = ("¿", "?", "¡", "!", ",", ".", ":")
    else:
        punc_tags = ("?", "!", ",", ".", ":")

    result_words = []
    for entity in list_entity:
        tag = entity["entity"]
        word = entity["word"]
        start = entity["start"]
        end = entity["end"]

        # First recognised punctuation character contained in the tag, if any.
        punc_in = next((p for p in punc_tags if p in tag), "")

        # A token starting with "#" continues the previous word. Without a
        # previous word there is nothing to merge into, so it is kept as a
        # word of its own instead of failing on result_words[-1].
        subword = word.startswith("#") and bool(result_words)
        if subword:
            previous = result_words[-1]
            if punc_in:
                # Drop the mark already attached for the earlier piece so it
                # is not duplicated when re-attached below.
                # NOTE(review): only the mark predicted for the *current*
                # piece is removed; if the earlier piece carried a different
                # mark it stays inside the merged word - confirm intended.
                previous = previous.replace(punc_in, "")
            word = previous + text[start:end]

        # endswith() is safe on an empty tag, unlike tag[-1].
        if tag.endswith("u"):
            word = word.capitalize()
        if punc_in and tag.endswith(("l", "u")):
            word = punc_in + word if punc_in in opening_marks else word + punc_in

        if subword:
            result_words[-1] = word
        else:
            result_words.append(word)

    return " ".join(result_words)
73
+
74
# Language code passed to get_result_text_es_pt; any value other than "es"
# uses the punctuation set without the inverted marks.
lang = "pt"
model_path = "VOCALINLP/portuguese_capitalization_punctuation_restoration_sanivert"

# Load the tokenizer and the token-classification model from the same path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)

# Lower-case, unpunctuated input to be restored.
text = "é preciso fazer análises ao sangue à urina e aos ouvidos"
result = pipe(text)

print(f"Source text: {text}")
result_text = get_result_text_es_pt(result, text, lang)
print(f"Restored text: {result_text}")
87
+ ```
88
+
89
+ > Created by [VOCALI SISTEMAS INTELIGENTES S.L.](https://www.vocali.net)