VOCALINLP
/

catalan_capitalization_punctuation_restoration_sanivert

Token Classification

Inference Endpoints

Model card · Files and versions · Community

jcg00v committed on Mar 4

Commit

f0bef02

•

1 Parent(s): 8e0b485

Update README.md

Files changed (1) hide show

README.md +13 -12

README.md CHANGED Viewed

@@ -28,30 +28,29 @@ This work was funded by the Spanish Government, the Spanish Ministry of Economy
 from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
 import torch
-def get_result_text_es_pt (list_entity, text, lang):
     result_words = []
-    if lang == "es":
-        punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
-    else:
-        punc_tags = ['?', '!', ',', '.', ':']
     for entity in list_entity:
-        tag = entity["entity"]
-        word = entity["word"]
         start = entity["start"]
         end = entity["end"]
         # check punctuation
         punc_in = next((p for p in punc_tags if p in tag), "")
         subword = False
         # check subwords
-        if word[0] == "#":
             subword = True
             if punc_in != "":
                 word = result_words[-1].replace(punc_in, "") + text[start:end]
             else:
                 word = result_words[-1] + text[start:end]
         if tag == "l":
             word = word
@@ -71,18 +70,20 @@ def get_result_text_es_pt (list_entity, text, lang):
     return " ".join(result_words)
-lang = "es"
-model_path = "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert"
 model = AutoModelForTokenClassification.from_pretrained(model_path)
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
-text = "el paciente presenta los siguientes síntomas náuseas vértigo disnea fiebre y dolor abdominal"
 result = pipe(text)
 print("Source text: "+ text)
-result_text = get_result_text_es_pt(result, text, lang)
 print("Restored text: " +result_text)
 ```

 from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
 import torch
+def get_result_text_ca (list_entity, text):
     result_words = []
+    punc_tags = ['?', '!', ',', '.', ':']
     for entity in list_entity:
         start = entity["start"]
         end = entity["end"]
+        tag = entity["entity"]
+        word = entity["word"]
         # check punctuation
         punc_in = next((p for p in punc_tags if p in tag), "")
         subword = False
         # check subwords
+        if word[0] != "Ġ":
             subword = True
             if punc_in != "":
                 word = result_words[-1].replace(punc_in, "") + text[start:end]
             else:
                 word = result_words[-1] + text[start:end]
+        else:
+            word = text[start:end]
         if tag == "l":
             word = word
     return " ".join(result_words)
+lang = "ca"
+model_path = "VOCALINLP/catalan_capitalization_punctuation_restoration_sanivert"
 model = AutoModelForTokenClassification.from_pretrained(model_path)
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
+text = "el pacient presenta els símptomes següents febre dispnea nàusees i vòmits"
 result = pipe(text)
 print("Source text: "+ text)
+result_text = get_result_text_ca(result, text)
 print("Restored text: " +result_text)
 ```