TheDarkLord69696969 commited on
Commit
12f6a20
1 Parent(s): a872010

Upload languages.py

Browse files
Files changed (1) hide show
  1. languages.py +85 -0
languages.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from docx import Document
2
+ import os
3
+ import sys
4
+ import transformers
5
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
6
+ import torch
7
+ from mosestokenizer import *
8
+ from indicnlp.tokenize import sentence_tokenize
9
+
10
+
11
+ # import zipfile
12
+ # with zipfile.ZipFile(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training\data.zip") as zip_ref:
13
+ # zip_ref.extractall(r"C:\Users\Prince Raj\Desktop\BOT\cuad-training")
14
# NOTE(review): hard-coded absolute Windows path — this breaks on any other
# machine; consider making it configurable or removing the chdir entirely.
os.chdir(r"C:\Users\Prince Raj\Desktop\BOT\transformers")
# Load the NLLB-200 (distilled, 600M-parameter) multilingual translation model
# and its tokenizer from the Hugging Face hub (downloads on first run).
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# Prefer GPU when available; fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
+
20
# Supported target languages mapped to their NLLB-200 FLORES language codes
# (used as the forced BOS token when generating translations).
lang_dict = dict(
    english='eng_Latn',
    assamese='asm_Beng',
    awadhi='awa_Deva',
    bengali='ben_Beng',
    bhojpuri='bho_Deva',
    gujarati='guj_Gujr',
    hindi='hin_Deva',
    kannada='kan_Knda',
    kashmiri='kas_Deva',
    maithili='mai_Deva',
    malayalam='mal_Mlym',
    marathi='mar_Deva',
    odia='ory_Orya',
    punjabi='pan_Guru',
    sanskrit='san_Deva',
    sindhi='snd_Arab',
    tamil='tam_Taml',
    telugu='tel_Telu',
    urdu='urd_Arab',
)
41
+
42
def translate_sentence(article, target, max_length=100):
    """Translate a single piece of text into the target language.

    Args:
        article: Source-language text to translate (one sentence or a
            short passage).
        target: Target language name; must be a key of ``lang_dict``.
        max_length: Maximum number of tokens to generate. Defaults to 100,
            preserving the previously hard-coded limit.

    Returns:
        The translated text as a single string.

    Raises:
        KeyError: If ``target`` is not a supported language name.
    """
    inputs = tokenizer(article, return_tensors="pt").to(device)
    # NOTE(review): `tokenizer.lang_code_to_id` was removed in newer
    # transformers releases; `tokenizer.convert_tokens_to_ids(code)` is the
    # forward-compatible equivalent — confirm against the pinned version.
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[lang_dict[target]],
        max_length=max_length,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
49
+
50
+
51
# Indic languages supported by indicnlp's sentence splitter, mapped from the
# full language name used in this module to indicnlp's two-letter code.
INDIC_DICT = dict(
    assamese="as",
    bengali="bn",
    gujarati="gu",
    hindi="hi",
    kannada="kn",
    malayalam="ml",
    marathi="mr",
    odia="or",
    punjabi="pa",
    tamil="ta",
    telugu="te",
)
60
+
61
def split_sentences(paragraph, language):
    """Split a paragraph into sentences with a language-aware tokenizer.

    Args:
        paragraph: Text to split.
        language: Full language name (e.g. ``'english'``, ``'hindi'``) as
            used elsewhere in this module; ``'en'`` is also accepted.

    Returns:
        A list of sentence strings.
    """
    if language in INDIC_DICT:
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    # BUG FIX: callers in this module pass full language names ('english'),
    # but the original only matched the ISO code 'en', so English text
    # silently fell through to the naive '.' split. Accept both spellings.
    if language in ('en', 'english'):
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    # Fallback for unsupported languages: naive split on '.'; drop empty
    # fragments (e.g. from a trailing period) so nothing empty is translated.
    return [part for part in paragraph.split(".") if part.strip()]
69
+
70
def languages(paragraph, source, target):
    """Translate ``paragraph`` from ``source`` into ``target`` language.

    Short texts (fewer than 100 whitespace-separated words) go through the
    model in a single call; longer texts are split into sentences, translated
    one by one, and joined back with spaces.

    Args:
        paragraph: Text to translate.
        source: Source language name (used only to pick a sentence splitter).
        target: Target language name; must be a key of ``lang_dict``.

    Returns:
        The translated text as a single string.
    """
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    pieces = split_sentences(paragraph, source)
    return " ".join(translate_sentence(piece, target) for piece in pieces)
79
+
80
# Module-as-callable hack: replace this module's entry in sys.modules with the
# `languages` function itself, so client code can do
#     import languages
#     languages(sent, src, trg)
# and call the imported name directly.
sys.modules[__name__] = languages

# sent = "I am hungry now"
# src = "english"
# trg = "hindi"
# print(languages(sent, src, trg))