# template_maker / languages.py
# (Hugging Face Hub page metadata removed: uploaded by TheDarkLord69696969,
# revision 12f6a20, 2.65 kB)
from docx import Document
import os
import sys
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
# import zipfile
# with zipfile.ZipFile(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training\data.zip") as zip_ref:
# zip_ref.extractall(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training")
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider an env var or a relative path. TODO confirm why a chdir
# is needed at all for loading a Hub-hosted model.
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")
# Load the NLLB-200 distilled 600M multilingual translation model and its
# tokenizer from the Hugging Face Hub (downloads on first run).
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# Run inference on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# FLORES-200 language codes (script-qualified, e.g. 'hin_Deva') used by the
# NLLB-200 model, keyed by the human-readable names this module accepts.
lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab',
}
def translate_sentence(article, target):
    """Translate *article* into the *target* language with NLLB-200.

    Args:
        article: Source text (a sentence or short passage).
        target: Human-readable target language name; must be a key of
            ``lang_dict`` (e.g. ``'hindi'``).

    Returns:
        The translated string (first beam, special tokens stripped).

    Raises:
        KeyError: If *target* is not a supported language name.
    """
    inputs = tokenizer(article, return_tensors="pt").to(device)
    # Compat fix: `tokenizer.lang_code_to_id` was removed from the NLLB
    # tokenizer in transformers v4.34; `convert_tokens_to_ids` resolves the
    # same FLORES-200 code token on both old and new versions.
    bos_token_id = tokenizer.convert_tokens_to_ids(lang_dict[target])
    # max_length=100 caps the output; longer inputs should be pre-split by
    # the caller (see `languages`).
    translated_tokens = model.generate(
        **inputs, forced_bos_token_id=bos_token_id, max_length=100)
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
# ISO 639-1 codes for the languages supported by the Indic NLP sentence
# splitter, keyed by the same human-readable names used in `lang_dict`.
INDIC_DICT = {"assamese" :"as", 'bengali' : 'bn', 'gujarati' : 'gu',
              'hindi' : 'hi',
              'kannada' : 'kn',
              'malayalam' : 'ml',
              'marathi' : 'mr',
              'odia' : 'or',
              'punjabi' : 'pa',
              'tamil' : 'ta' ,
              'telugu' : 'te'}
def split_sentences(paragraph, language):
    """Split *paragraph* into a list of sentences.

    Uses the Indic NLP splitter for supported Indic languages, the Moses
    splitter for English, and a naive period split as a last resort.

    Args:
        paragraph: Text to split.
        language: Human-readable language name (e.g. 'hindi', 'english').

    Returns:
        List of sentence strings.
    """
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    # Bug fix: the caller (`languages`) passes full names such as 'english',
    # so the original `language == 'en'` test never matched and English text
    # silently fell through to the naive period split. Accept both spellings.
    elif language in ('en', 'english'):
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        # Fallback: naive split — drops the periods and may produce empty
        # trailing strings; acceptable as a best-effort for unknown languages.
        return paragraph.split(".")
def languages(paragraph, source, target):
    """Translate *paragraph* from *source* to *target* language.

    Short texts (fewer than 100 whitespace-separated words) are translated
    in one call; longer texts are split into sentences first and the
    per-sentence translations joined with spaces.

    Args:
        paragraph: Text to translate.
        source: Human-readable source language name (used for splitting).
        target: Human-readable target language name (a `lang_dict` key).

    Returns:
        The translated text as a single string.
    """
    # Short input fits in one model call — translate it directly.
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    pieces = split_sentences(paragraph, source)
    return " ".join(translate_sentence(piece, target) for piece in pieces)
# Module-replacement trick: substitute this module object in sys.modules with
# the `languages` function itself, so `import languages` yields a callable
# (`languages(sent, src, trg)`) instead of a module. Note this discards every
# other name defined above from the importer's point of view.
sys.modules[__name__] = languages
# Example usage:
# sent = "I am hungry now"
# src = "english"
# trg = "hindi"
# print(languages(sent, src, trg))