Update modeling_aquilamoe.py
modeling_aquilamoe.py (+2 -2)
@@ -677,7 +677,7 @@ class AquilaMoeBLockSparseTop2MLP(nn.Module):
         return routing_weights * current_hidden_states
 
 
-
+AQUILAMOE_ATTENTION_CLASSES = {
     "eager": AquilaMoeAttention,
     "flash_attention_2": AquilaMoeFlashAttention2,
 }
@@ -758,7 +758,7 @@ class AquilaMoeDecoderLayer(nn.Module):
         super().__init__()
         self.hidden_size = config.hidden_size
 
-        self.self_attn =
+        self.self_attn = AQUILAMOE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
 
         self.block_sparse_moe = AquilaMoeSparseMoeBlock(config)
         self.input_layernorm = AquilaMoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
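In short, the commit appears to repair two lines that had been truncated in a previous edit: the module-level AQUILAMOE_ATTENTION_CLASSES dict lost its opening line (leaving a dangling dict body), and the decoder layer's self_attn assignment lost its right-hand side. The restored code follows the registry-dispatch pattern used by Mixtral-style models in transformers: a dict maps config._attn_implementation to an attention class, and each decoder layer instantiates the selected class at construction time. Below is a minimal, self-contained sketch of that pattern; DemoConfig, EagerAttention, FlashAttention2, and ATTENTION_CLASSES are illustrative stand-ins for the real config object and the AquilaMoeAttention / AquilaMoeFlashAttention2 classes in modeling_aquilamoe.py, not the model's actual code.

import torch.nn as nn

class DemoConfig:
    """Stand-in for the model config; only the fields used below."""
    hidden_size = 16
    _attn_implementation = "eager"  # set to "flash_attention_2" to switch backends

class EagerAttention(nn.Module):        # stands in for AquilaMoeAttention
    def __init__(self, config, layer_idx):
        super().__init__()
        self.layer_idx = layer_idx

class FlashAttention2(EagerAttention):  # stands in for AquilaMoeFlashAttention2
    pass

# Module-level registry, as restored by the commit: the key is the
# attention implementation requested in the config.
ATTENTION_CLASSES = {
    "eager": EagerAttention,
    "flash_attention_2": FlashAttention2,
}

class DecoderLayer(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        # One dict lookup picks the attention class at construction time;
        # an unsupported key fails fast with a KeyError.
        self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

layer = DecoderLayer(DemoConfig(), layer_idx=0)
print(type(layer.self_attn).__name__)  # EagerAttention

Keeping the mapping in one module-level dict keeps backend selection in a single place, so an unsupported _attn_implementation raises immediately instead of silently falling back to the wrong attention kernel.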