Update modeling_aquilamoe.py
modeling_aquilamoe.py (+2 -2)
@@ -677,7 +677,7 @@ class AquilaMoeBLockSparseTop2MLP(nn.Module):
         return routing_weights * current_hidden_states
 
 
-
+AQUILAMOE_ATTENTION_CLASSES = {
     "eager": AquilaMoeAttention,
     "flash_attention_2": AquilaMoeFlashAttention2,
 }
@@ -758,7 +758,7 @@ class AquilaMoeDecoderLayer(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn =
+        self.self_attn = AQUILAMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
 
         self.block_sparse_moe = AquilaMoeSparseMoeBlock(config)
         self.input_layernorm = AquilaMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
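In short, the commit appears to repair two lines that had been truncated in a previous edit: the module-level AQUILAMOE_ATTENTION_CLASSES dict lost its opening line (leaving a dangling dict body), and the decoder layer's self_attn assignment lost its right-hand side. The restored code follows the registry-dispatch pattern used by Mixtral-style models in transformers: a dict maps config._attn_implementation to an attention class, and each decoder layer instantiates the selected class at construction time. Below is a minimal, self-contained sketch of that pattern; DemoConfig, EagerAttention, FlashAttention2, and ATTENTION_CLASSES are illustrative stand-ins for the real config object and the AquilaMoeAttention / AquilaMoeFlashAttention2 classes in modeling_aquilamoe.py, not the model's actual code.

import torch.nn as nn

class DemoConfig:
    """Stand-in for the model config; only the fields used below."""
    hidden_size = 16
    _attn_implementation = "eager"  # set to "flash_attention_2" to switch backends

class EagerAttention(nn.Module):        # stands in for AquilaMoeAttention
    def __init__(self, config, layer_idx):
        super().__init__()
        self.layer_idx = layer_idx

class FlashAttention2(EagerAttention):  # stands in for AquilaMoeFlashAttention2
    pass

# Module-level registry, as restored by the commit: the key is the
# attention implementation requested in the config.
ATTENTION_CLASSES = {
    "eager": EagerAttention,
    "flash_attention_2": FlashAttention2,
}

class DecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        # One dict lookup picks the attention class at construction time;
        # an unsupported key fails fast with a KeyError.
        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

layer = DecoderLayer(DemoConfig(), layer_idx=0)
print(type(layer.self_attn).__name__)  # EagerAttention

Keeping the mapping in one module-level dict keeps backend selection in a single place, so an unsupported _attn_implementation raises immediately instead of silently falling back to the wrong attention kernel.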