hadiqa123 committed on
Commit
a5d55b8
1 Parent(s): a5c56c1

add tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +0 -16
  2. tokenizer_config.json +0 -2
  3. vocab.json +44 -44
special_tokens_map.json CHANGED
@@ -1,20 +1,4 @@
1
  {
2
- "additional_special_tokens": [
3
- {
4
- "content": "<s>",
5
- "lstrip": false,
6
- "normalized": true,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "</s>",
12
- "lstrip": false,
13
- "normalized": true,
14
- "rstrip": false,
15
- "single_word": false
16
- }
17
- ],
18
  "bos_token": "<s>",
19
  "eos_token": "</s>",
20
  "pad_token": "[PAD]",
 
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "bos_token": "<s>",
3
  "eos_token": "</s>",
4
  "pad_token": "[PAD]",
tokenizer_config.json CHANGED
@@ -2,10 +2,8 @@
2
  "bos_token": "<s>",
3
  "do_lower_case": false,
4
  "eos_token": "</s>",
5
- "name_or_path": "./",
6
  "pad_token": "[PAD]",
7
  "replace_word_delimiter_char": " ",
8
- "special_tokens_map_file": null,
9
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
10
  "unk_token": "[UNK]",
11
  "word_delimiter_token": "|"
 
2
  "bos_token": "<s>",
3
  "do_lower_case": false,
4
  "eos_token": "</s>",
 
5
  "pad_token": "[PAD]",
6
  "replace_word_delimiter_char": " ",
 
7
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
8
  "unk_token": "[UNK]",
9
  "word_delimiter_token": "|"
vocab.json CHANGED
@@ -1,47 +1,47 @@
1
  {
2
- " ": 6,
3
- "[PAD]": 44,
4
- "[UNK]": 43,
5
- "آ": 35,
6
- "أ": 0,
7
- "ؤ": 24,
8
- "ئ": 11,
9
- "ا": 9,
10
- "ب": 31,
11
- "ت": 39,
12
- "ث": 12,
13
- "ج": 10,
14
  "ح": 16,
15
- "خ": 28,
16
- "د": 7,
17
- "ذ": 19,
18
- "ر": 42,
19
- "ز": 5,
20
- "س": 8,
21
- "ش": 18,
22
- "ص": 4,
23
- "ض": 14,
24
- "ط": 37,
25
- "ظ": 13,
26
- "ع": 25,
27
- "غ": 38,
28
- "ف": 3,
29
- "ق": 21,
30
- "ل": 33,
31
- "م": 26,
32
- "ن": 1,
33
- "و": 40,
34
- "ً": 29,
35
- "ٹ": 15,
36
- "پ": 20,
37
- "چ": 27,
38
- "ڈ": 41,
39
- "ڑ": 36,
40
- "ک": 32,
41
- "گ": 34,
42
- "ں": 30,
43
- "ھ": 22,
44
- "ہ": 2,
45
- "ی": 23,
46
- "ے": 17
47
  }
 
1
  {
2
+ " ": 26,
3
+ "[PAD]": 43,
4
+ "[UNK]": 44,
5
+ "آ": 33,
6
+ "أ": 28,
7
+ "ؤ": 27,
8
+ "ئ": 35,
9
+ "ا": 29,
10
+ "ب": 4,
11
+ "ت": 42,
12
+ "ث": 20,
13
+ "ج": 41,
14
  "ح": 16,
15
+ "خ": 21,
16
+ "د": 0,
17
+ "ذ": 5,
18
+ "ر": 32,
19
+ "ز": 12,
20
+ "س": 1,
21
+ "ش": 31,
22
+ "ص": 17,
23
+ "ض": 8,
24
+ "ط": 34,
25
+ "ظ": 19,
26
+ "ع": 22,
27
+ "غ": 10,
28
+ "ف": 23,
29
+ "ق": 25,
30
+ "ل": 30,
31
+ "م": 15,
32
+ "ن": 9,
33
+ "و": 3,
34
+ "ً": 14,
35
+ "ٹ": 37,
36
+ "پ": 39,
37
+ "چ": 6,
38
+ "ڈ": 38,
39
+ "ڑ": 7,
40
+ "ک": 24,
41
+ "گ": 18,
42
+ "ں": 2,
43
+ "ھ": 36,
44
+ "ہ": 40,
45
+ "ی": 11,
46
+ "ے": 13
47
  }