paulhindemith committed
Commit 301bdd5
1 Parent(s): 5cfa3e2

commit files to HF hub

Files changed (3)
  1. config.json +1 -1
  2. fasttext_fsc.py +1 -3
  3. mecab_tokenizer.py +28 -15
config.json CHANGED
@@ -18,7 +18,7 @@
     "neutral": 1
   },
   "max_length": 128,
-  "model_type": "fasttext_jp",
+  "model_type": "fasttext_classification",
   "ngram": 2,
   "tokenizerI_class": "FastTextJpTokenizer",
   "tokenizer_class": "FastTextJpTokenizer",
fasttext_fsc.py CHANGED
@@ -1,6 +1,4 @@
 from __future__ import annotations
-from transformers import PretrainedConfig
-from torch import nn
 import torch
 from torchtyping import TensorType
 from .fasttext_jp_embedding import FastTextJpModel, FastTextJpConfig
@@ -10,7 +8,7 @@ from transformers.modeling_outputs import SequenceClassifierOutput
 class FastTextForSeuqenceClassificationConfig(FastTextJpConfig):
     """Config for FastTextJpModel.
     """
-    model_type = "fasttext_jp"
+    model_type = "fasttext_classification"
 
     def __init__(self,
                  ngram: int = 2,
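
With the class attribute and config.json now in agreement, the checkpoint should load through the auto classes. A hedged usage sketch: the repo id below is an assumption based on the commit author, and trust_remote_code is needed on the further assumption that the repo's auto_map wires these classes up.

    # Usage sketch; substitute the actual repository id.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    repo = "paulhindemith/fasttext-classification"  # assumed repo id
    tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        repo, trust_remote_code=True)
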
mecab_tokenizer.py CHANGED
@@ -14,9 +14,9 @@ class MeCabResult(NamedTuple):
     hinshi_saibunrui_3: str
     katsuyokei_1: str
     katsuyokei_2: str
-    genkei: str
-    yomi: str
-    hatsuon: str
+    genkei: str = ""
+    yomi: str = ""
+    hatsuon: str = ""
 
 
 class MeCabTokenizer(PreTrainedTokenizer):
@@ -34,9 +34,9 @@ class MeCabTokenizer(PreTrainedTokenizer):
 
         self.target_hinshi = hinshi
         if mecab_dicdir is not None:
-            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir}")
+            self.mecab = MeCab.Tagger(f"-d {mecab_dicdir} -O '' -F '%m,%H\n'")
         else:
-            self.mecab = MeCab.Tagger()
+            self.mecab = MeCab.Tagger("-O '' -F '%m,%H\n'")
 
         super().__init__(**kwargs)
 
@@ -76,17 +76,30 @@ class MeCabTokenizer(PreTrainedTokenizer):
         Returns:
             list[MeCabResult]: MeCab analysis results
         """
-        node = self.mecab.parseToNode(text)
+        nodes = self.mecab.parse(text).split("\n")
         # process each morpheme
         out = []
-        while node:
-            args = []
-            args.append(node.surface)
-            feature = node.feature.split(",")
-            args.extend(feature)
-            mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
-                                       args[4], args[5], args[6], args[7],
-                                       args[8], args[9])
+        for node in nodes:
+            args = node.split(",")
+            if args[0] in ["EOS", ""]:
+                continue
+            # the number of fields differs by dictionary
+            if len(args) == 10:
+                mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                           args[4], args[5], args[6], args[7],
+                                           args[8], args[9])
+            elif len(args) == 7:
+                # English words
+                mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                           args[4], args[5], args[6], "*", "*",
+                                           "*")
+            elif len(args) == 27:
+                # supplementary symbols
+                mecab_result = MeCabResult(args[0], args[1], args[2], args[3],
+                                           args[4], args[5], args[6], "*", "*",
+                                           "*")
+            else:
+                raise NotImplementedError(
+                    f"unsupported parse args_length: {len(args)}, args: {args}")
             out.append(mecab_result)
-            node = node.next  # drop the trailing EOS
         return out
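
The tokenizer change swaps the node-walking parseToNode API for plain parse() with an explicit output format: -F '%m,%H\n' prints each morpheme as one line, where %m is the surface form and %H the feature CSV, and -O '' clears any preset output style so the -F format applies. A small sketch of what the loop then consumes; the feature columns shown are illustrative and depend on the installed dictionary.

    # Illustrative only: actual feature columns depend on the dictionary.
    import MeCab

    tagger = MeCab.Tagger("-O '' -F '%m,%H\n'")
    for line in tagger.parse("走る").split("\n"):
        print(repr(line))
    # e.g. '走る,動詞,一般,*,*,五段-ラ行,終止形-一般,...'
    # followed by 'EOS' and '', which the loop skips via args[0] in ["EOS", ""]
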
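Condensed, the new per-line handling reduces to the helper below. The helper name is hypothetical; the field counts (10 for the full feature set, 7 for English entries, 27 for supplementary symbols) are taken from the diff.

    # Hypothetical helper mirroring the dispatch in the updated loop.
    from __future__ import annotations
    from mecab_tokenizer import MeCabResult  # assuming the module is importable

    def to_result(line: str) -> MeCabResult | None:
        args = line.split(",")
        if args[0] in ["EOS", ""]:
            return None                   # EOS marker or trailing blank line
        if len(args) == 10:               # full feature set
            return MeCabResult(*args)
        if len(args) in (7, 27):          # English / supplementary symbols: pad
            return MeCabResult(*args[:7], "*", "*", "*")
        raise NotImplementedError(f"unsupported field count: {len(args)}")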