VOCALINLP
/

catalan_capitalization_punctuation_restoration_sanivert

Token Classification

Inference Endpoints

Model card Files Files and versions Community

jcg00v committed on Mar 4

Commit

9d8ed09

•

1 Parent(s): 1534ac8

Update README.md

Files changed (1) hide show

README.md +75 -1

README.md CHANGED Viewed

@@ -10,4 +10,78 @@ language:
   - ca
 ---
 # Catalan punctuation and capitalization restoration model
-This model is for

   - ca
 ---
 # Catalan punctuation and capitalization restoration model
+## Details of the model
+To be completed: description of the model.
+## Details of the dataset
+The dataset used for training the model has been XXXXXXX
+## Evaluation Metrics
+## Funding
+## How to use the model
+```py
+from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+import torch
+def get_result_text_es_pt (list_entity, text, lang):
+    result_words = []
+    if lang == "es":
+        punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
+    else:
+        punc_tags = ['?', '!', ',', '.', ':']
+    for entity in list_entity:
+        tag = entity["entity"]
+        word = entity["word"]
+        start = entity["start"]
+        end = entity["end"]
+        # check punctuation
+        punc_in = next((p for p in punc_tags if p in tag), "")
+        subword = False
+        # check subwords
+        if word[0] == "#":
+            subword = True
+            if punc_in != "":
+                word = result_words[-1].replace(punc_in, "") + text[start:end]
+            else:
+                word = result_words[-1] + text[start:end]
+        if tag == "l":
+            word = word
+        elif tag == "u":
+            word = word.capitalize()
+        # case with punctuation
+        else:
+            if tag[-1] == "l":
+                word = (punc_in + word) if punc_in in ["¿", "¡"] else (word + punc_in)
+            elif tag[-1] == "u":
+                word = (punc_in + word.capitalize()) if punc_in in ["¿", "¡"] else (word.capitalize() + punc_in)
+        if subword == True:
+            result_words[-1] = word
+        else:
+            result_words.append(word)
+    return " ".join(result_words)
+lang = "es"
+model_path = "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert"
+model = AutoModelForTokenClassification.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
+text = "el paciente presenta los siguientes síntomas náuseas vértigo disnea fiebre y dolor abdominal"
+result = pipe(text)
+print("Source text: "+ text)
+result_text = get_result_text_es_pt(result, text, lang)
+print("Restored text: " +result_text)
+```
+> Created by [VOCALI SISTEMAS INTELIGENTES/@VOCALINLP](https://twitter.com/vocalinet)