jcg00v committed on
Commit
6623419
1 Parent(s): 2be935d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +29 -16
README.md CHANGED
@@ -15,7 +15,7 @@ This is a reduced version of the Portuguese capitalisation and punctuation resto
15
 
16
  You can try the model in the following [SPACE](https://huggingface.co/spaces/VOCALINLP/punctuation_and_capitalization_restoration_sanivert)
17
  ## Details of the dataset
18
-
19
 
20
  ## Evaluation Metrics
21
 
@@ -23,6 +23,9 @@ You can try the model in the following [SPACE](https://huggingface.co/spaces/VOC
23
  This work was funded by the Spanish Government, the Spanish Ministry of Economy and Digital Transformation through the "Recovery, Transformation and Resilience Plan" and also funded by the European Union NextGenerationEU/PRTR through the research project 2021/C005/0015007.
24
 
25
  ## How to use the model
 
 
 
26
 
27
  ```py
28
  from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
@@ -30,31 +33,38 @@ import torch
30
 
31
  def get_result_text_es_pt (list_entity, text, lang):
32
  result_words = []
 
33
  if lang == "es":
34
  punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
35
  else:
36
  punc_tags = ['?', '!', ',', '.', ':']
37
-
38
- for entity in list_entity:
39
  tag = entity["entity"]
40
  word = entity["word"]
41
  start = entity["start"]
42
  end = entity["end"]
43
-
44
  # check punctuation
45
  punc_in = next((p for p in punc_tags if p in tag), "")
46
-
47
  subword = False
48
  # check subwords
49
- if word[0] == "#":
50
  subword = True
51
- if punc_in != "":
52
- word = result_words[-1].replace(punc_in, "") + text[start:end]
53
- else:
54
- word = result_words[-1] + text[start:end]
55
-
56
- if tag == "l":
57
- word = word
 
 
 
 
 
 
58
  elif tag == "u":
59
  word = word.capitalize()
60
  # case with punctuation
@@ -62,9 +72,12 @@ def get_result_text_es_pt (list_entity, text, lang):
62
  if tag[-1] == "l":
63
  word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
64
  elif tag[-1] == "u":
65
- word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
66
-
67
- if subword == True:
 
 
 
68
  result_words[-1] = word
69
  else:
70
  result_words.append(word)
 
15
 
16
  You can try the model in the following [SPACE](https://huggingface.co/spaces/VOCALINLP/punctuation_and_capitalization_restoration_sanivert)
17
  ## Details of the dataset
18
+ The model was fine-tuned for punctuation restoration using clinical reports and the OpusParaCrawl dataset.
19
 
20
  ## Evaluation Metrics
21
 
 
23
  This work was funded by the Spanish Government, the Spanish Ministry of Economy and Digital Transformation through the "Recovery, Transformation and Resilience Plan" and also funded by the European Union NextGenerationEU/PRTR through the research project 2021/C005/0015007.
24
 
25
  ## How to use the model
26
+ The metrics used to evaluate the model are the macro and weighted F1 scores.
27
+
28
+
29
 
30
  ```py
31
  from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
 
33
 
34
  def get_result_text_es_pt (list_entity, text, lang):
35
  result_words = []
36
+ tmp_word = ""
37
  if lang == "es":
38
  punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
39
  else:
40
  punc_tags = ['?', '!', ',', '.', ':']
41
+
42
+ for idx, entity in enumerate(list_entity):
43
  tag = entity["entity"]
44
  word = entity["word"]
45
  start = entity["start"]
46
  end = entity["end"]
47
+
48
  # check punctuation
49
  punc_in = next((p for p in punc_tags if p in tag), "")
50
+
51
  subword = False
52
  # check subwords
53
+ if word[0] == "#":
54
  subword = True
55
+ if tmp_word == "":
56
+ p_s = list_entity[idx-1]["start"]
57
+ p_e = list_entity[idx-1]["end"]
58
+ tmp_word = text[p_s:p_e] + text[start:end]
59
+ else:
60
+ tmp_word = tmp_word + text[start:end]
61
+ word = tmp_word
62
+ else:
63
+ tmp_word = ""
64
+ word = text[start:end]
65
+
66
+ if tag == "l":
67
+ word = word
68
  elif tag == "u":
69
  word = word.capitalize()
70
  # case with punctuation
 
72
  if tag[-1] == "l":
73
  word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
74
  elif tag[-1] == "u":
75
+ word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
76
+
77
+ if tag != "l":
78
+ word = '<span style="font-weight:bold; color:rgb(142, 208, 129);">' + word + '</span>'
79
+
80
+ if subword == True:
81
  result_words[-1] = word
82
  else:
83
  result_words.append(word)