jcg00v committed on
Commit
f0bef02
1 Parent(s): 8e0b485

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +13 -12
README.md CHANGED
@@ -28,30 +28,29 @@ This work was funded by the Spanish Government, the Spanish Ministry of Economy
28
  from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
29
  import torch
30
 
31
- def get_result_text_es_pt (list_entity, text, lang):
32
  result_words = []
33
- if lang == "es":
34
- punc_tags = ['¿', '?', '¡', '!', ',', '.', ':']
35
- else:
36
- punc_tags = ['?', '!', ',', '.', ':']
37
 
38
  for entity in list_entity:
39
- tag = entity["entity"]
40
- word = entity["word"]
41
  start = entity["start"]
42
  end = entity["end"]
 
 
43
 
44
  # check punctuation
45
  punc_in = next((p for p in punc_tags if p in tag), "")
46
 
47
  subword = False
48
  # check subwords
49
- if word[0] == "#":
50
  subword = True
51
  if punc_in != "":
52
  word = result_words[-1].replace(punc_in, "") + text[start:end]
53
  else:
54
  word = result_words[-1] + text[start:end]
 
 
55
 
56
  if tag == "l":
57
  word = word
@@ -71,18 +70,20 @@ def get_result_text_es_pt (list_entity, text, lang):
71
 
72
  return " ".join(result_words)
73
 
74
- lang = "es"
75
- model_path = "VOCALINLP/spanish_capitalization_punctuation_restoration_sanivert"
 
76
 
77
  model = AutoModelForTokenClassification.from_pretrained(model_path)
78
  tokenizer = AutoTokenizer.from_pretrained(model_path)
79
 
80
  pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
81
- text = "el paciente presenta los siguientes síntomas náuseas vértigo disnea fiebre y dolor abdominal"
 
82
  result = pipe(text)
83
 
84
  print("Source text: "+ text)
85
- result_text = get_result_text_es_pt(result, text, lang)
86
  print("Restored text: " +result_text)
87
  ```
88
 
 
28
  from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
29
  import torch
30
 
31
+ def get_result_text_ca (list_entity, text):
32
  result_words = []
33
+ punc_tags = ['?', '!', ',', '.', ':']
 
 
 
34
 
35
  for entity in list_entity:
 
 
36
  start = entity["start"]
37
  end = entity["end"]
38
+ tag = entity["entity"]
39
+ word = entity["word"]
40
 
41
  # check punctuation
42
  punc_in = next((p for p in punc_tags if p in tag), "")
43
 
44
  subword = False
45
  # check subwords
46
+ if word[0] != "Ġ":
47
  subword = True
48
  if punc_in != "":
49
  word = result_words[-1].replace(punc_in, "") + text[start:end]
50
  else:
51
  word = result_words[-1] + text[start:end]
52
+ else:
53
+ word = text[start:end]
54
 
55
  if tag == "l":
56
  word = word
 
70
 
71
  return " ".join(result_words)
72
 
73
+
74
+ lang = "ca"
75
+ model_path = "VOCALINLP/catalan_capitalization_punctuation_restoration_sanivert"
76
 
77
  model = AutoModelForTokenClassification.from_pretrained(model_path)
78
  tokenizer = AutoTokenizer.from_pretrained(model_path)
79
 
80
  pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
81
+
82
+ text = "el pacient presenta els símptomes següents febre dispnea nàusees i vòmits"
83
  result = pipe(text)
84
 
85
  print("Source text: "+ text)
86
+ result_text = get_result_text_ca(result, text)
87
  print("Restored text: " +result_text)
88
  ```
89