benjamin committed on
Commit
9bf5852
1 Parent(s): a5b6bdf

update tokenizer

Browse files
Files changed (1) hide show
  1. tokenizer.json +21 -5
tokenizer.json CHANGED
@@ -31,12 +31,28 @@
31
  "special": true
32
  }
33
  ],
34
- "normalizer": null,
 
 
 
35
  "pre_tokenizer": {
36
- "type": "ByteLevel",
37
- "add_prefix_space": true,
38
- "trim_offsets": true,
39
- "use_regex": true
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  },
41
  "post_processor": {
42
  "type": "TemplateProcessing",
 
31
  "special": true
32
  }
33
  ],
34
+ "normalizer": {
35
+ "type": "Prepend",
36
+ "prepend": " "
37
+ },
38
  "pre_tokenizer": {
39
+ "type": "Sequence",
40
+ "pretokenizers": [
41
+ {
42
+ "type": "Split",
43
+ "pattern": {
44
+ "Regex": "'s|'t|'re|'ve|'m|'ll|'d| ?[\\p{L}\\p{M}]+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+"
45
+ },
46
+ "behavior": "Removed",
47
+ "invert": true
48
+ },
49
+ {
50
+ "type": "ByteLevel",
51
+ "add_prefix_space": false,
52
+ "trim_offsets": true,
53
+ "use_regex": false
54
+ }
55
+ ]
56
  },
57
  "post_processor": {
58
  "type": "TemplateProcessing",