TwentyNine
committed on
Commit
•
88c2b0e
1
Parent(s):
6d0cacf
Update model reference
Browse files
README.md
CHANGED
@@ -20,7 +20,7 @@ The following is adapted from [slone/nllb-rus-tyv-v1](https://huggingface.co/slo
|
|
20 |
import torch
|
21 |
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
|
22 |
|
23 |
-
def fix_tokenizer(tokenizer, new_lang='ain_Latn'):
|
24 |
""" Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """
|
25 |
old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
|
26 |
tokenizer.lang_code_to_id[new_lang] = old_len-1
|
@@ -36,10 +36,11 @@ def fix_tokenizer(tokenizer, new_lang='ain_Latn'):
|
|
36 |
tokenizer.added_tokens_encoder = {}
|
37 |
tokenizer.added_tokens_decoder = {}
|
38 |
|
39 |
-
MODEL_URL = "TwentyNine/nllb-
|
40 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
|
41 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
|
42 |
-
fix_tokenizer(tokenizer)
|
|
|
43 |
|
44 |
def translate(
|
45 |
text,
|
|
|
20 |
import torch
|
21 |
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
|
22 |
|
23 |
+
def fix_tokenizer(tokenizer, new_lang):
|
24 |
""" Add a new language token to the tokenizer vocabulary (this should be done each time after its initialization) """
|
25 |
old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
|
26 |
tokenizer.lang_code_to_id[new_lang] = old_len-1
|
|
|
36 |
tokenizer.added_tokens_encoder = {}
|
37 |
tokenizer.added_tokens_decoder = {}
|
38 |
|
39 |
+
MODEL_URL = "TwentyNine/nllb-ain-kana-latin-converter-v1"
|
40 |
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
|
41 |
tokenizer = NllbTokenizer.from_pretrained(MODEL_URL)
|
42 |
+
fix_tokenizer(tokenizer, 'ain_Jpan')
|
43 |
+
fix_tokenizer(tokenizer, 'ain_Latn')
|
44 |
|
45 |
def translate(
|
46 |
text,
|