jcg00v committed on
Commit
9d8ed09
1 Parent(s): 1534ac8

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +75 -1
README.md CHANGED
@@ -10,4 +10,78 @@ language:
10
  - ca
11
  ---
12
  # Catalan punctuation and capitalization restoration model
13
- This model is for
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  - ca
11
  ---
12
  # Catalan punctuation and capitalization restoration model
13
+ ## Details of the model
14
+
15
+ TODO: explain the model details here.
16
+
17
+ ## Details of the dataset
18
+ The dataset used for training the model has been XXXXXXX
19
+
20
+ ## Evaluation Metrics
21
+
22
+ ## Funding
23
+
24
+ ## How to use the model
25
+
26
+ ```py
27
+ from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
28
+ import torch
29
+
30
+ def get_result_text_es_pt (list_entity, text, lang):
31
+ result_words = []
32
+ if lang == "es":
33
+ punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
34
+ else:
35
+ punc_tags = ['?', '!', ',', '.', ':']
36
+
37
+ for entity in list_entity:
38
+ tag = entity["entity"]
39
+ word = entity["word"]
40
+ start = entity["start"]
41
+ end = entity["end"]
42
+
43
+ # check punctuation
44
+ punc_in = next((p for p in punc_tags if p in tag), "")
45
+
46
+ subword = False
47
+ # check subwords
48
+ if word[0] == "#":
49
+ subword = True
50
+ if punc_in != "":
51
+ word = result_words[-1].replace(punc_in, "") + text[start:end]
52
+ else:
53
+ word = result_words[-1] + text[start:end]
54
+
55
+ if tag == "l":
56
+ word = word
57
+ elif tag == "u":
58
+ word = word.capitalize()
59
+ # case with punctuation
60
+ else:
61
+ if tag[-1] == "l":
62
+ word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
63
+ elif tag[-1] == "u":
64
+ word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
65
+
66
+ if subword == True:
67
+ result_words[-1] = word
68
+ else:
69
+ result_words.append(word)
70
+
71
+ return " ".join(result_words)
72
+
73
+ lang = "es"
74
+ model_path = "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert"
75
+
76
+ model = AutoModelForTokenClassification.from_pretrained(model_path)
77
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
78
+
79
+ pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
80
+ text = "el paciente presenta los siguientes síntomas náuseas vértigo disnea fiebre y dolor abdominal"
81
+ result = pipe(text)
82
+
83
+ print("Source text: "+ text)
84
+ result_text = get_result_text_es_pt(result, text, lang)
85
+ print("Restored text: " +result_text)
+ ```
86
+
87
+ > Created by [VOCALI SISTEMAS INTELIGENTES/@VOCALINLP](https://twitter.com/vocalinet)