davda54 committed
Commit fb5127b
1 Parent(s): aeafbba
Files changed (1)
modeling_deberta.py +0 -34
modeling_deberta.py CHANGED
@@ -1376,11 +1376,6 @@ class DebertaV2LMPredictionHead(nn.Module):
         # an output-only bias for each token.
         self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=True)

-        #self.bias = nn.Parameter(torch.zeros(config.vocab_size))
-
-        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
-        #self.decoder.bias = self.bias
-
     def forward(self, hidden_states):
         hidden_states = self.transform(hidden_states)
         hidden_states = self.decoder(hidden_states)
@@ -1398,13 +1393,6 @@ class DebertaV2OnlyMLMHead(nn.Module):
         return prediction_scores


-@add_start_docstrings(
-    """
-    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
-    pooled output) e.g. for GLUE tasks.
-    """,
-    DEBERTA_START_DOCSTRING,
-)
 class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -1517,14 +1505,6 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
         )


-@add_start_docstrings(
-    """
-    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
-    Named-Entity-Recognition (NER) tasks.
-    """,
-    DEBERTA_START_DOCSTRING,
-)
-# Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2
 class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -1591,13 +1571,6 @@ class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
         )


-@add_start_docstrings(
-    """
-    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
-    """,
-    DEBERTA_START_DOCSTRING,
-)
 class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
@@ -1691,13 +1664,6 @@ class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
        )


-@add_start_docstrings(
-    """
-    DeBERTa Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
-    softmax) e.g. for RocStories/SWAG tasks.
-    """,
-    DEBERTA_START_DOCSTRING,
-)
 class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
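Two notes on the diff, with hedged sketches. First, the comments removed in the first hunk refer to the usual Hugging Face pattern of registering the LM-head output bias as a standalone nn.Parameter and aliasing it onto the decoder, so that utilities such as resize_token_embeddings resize both together. Since this file's decoder is built with bias=True, the commented-out lines were dead code, which is presumably why the commit drops them. A minimal sketch of the pattern those comments described (the class and argument names below are illustrative, not from this file):

import torch
import torch.nn as nn

class TiedBiasHead(nn.Module):
    """Illustrative sketch of the bias-tying pattern the removed comments described."""

    def __init__(self, embedding_size, vocab_size):
        super().__init__()
        # Build the decoder without its own bias...
        self.decoder = nn.Linear(embedding_size, vocab_size, bias=False)
        # ...and register a standalone bias parameter instead.
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # Aliasing the parameter onto the decoder links the two tensors, so code
        # that rebuilds or resizes the decoder (e.g. `resize_token_embeddings`)
        # keeps the bias in sync with the vocabulary size.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        return self.decoder(hidden_states)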
 
1376
  # an output-only bias for each token.
1377
  self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=True)
1378
 
 
 
 
 
 
1379
  def forward(self, hidden_states):
1380
  hidden_states = self.transform(hidden_states)
1381
  hidden_states = self.decoder(hidden_states)
 
1393
  return prediction_scores
1394
 
1395
 
 
 
 
 
 
 
 
1396
  class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel):
1397
  def __init__(self, config):
1398
  super().__init__(config)
 
1505
  )
1506
 
1507
 
 
 
 
 
 
 
 
 
1508
  class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel):
1509
  def __init__(self, config):
1510
  super().__init__(config)
 
1571
  )
1572
 
1573
 
 
 
 
 
 
 
 
1574
  class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
1575
  def __init__(self, config):
1576
  super().__init__(config)
 
1664
  )
1665
 
1666
 
 
 
 
 
 
 
 
1667
  class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
1668
  def __init__(self, config):
1669
  super().__init__(config)
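Second, the remaining hunks only drop @add_start_docstrings(...) decorators (plus one "Copied from" marker comment). In transformers, this decorator prepends shared text to the wrapped class's __doc__ and returns the class otherwise unchanged, so the removal affects generated documentation rather than runtime behavior. A simplified sketch approximating the transformers utility, not a verbatim copy of it:

def add_start_docstrings(*docstr):
    # Prepend the given strings to the decorated object's docstring.
    def docstring_decorator(fn):
        fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
        return fn
    return docstring_decorator

@add_start_docstrings("Shared model preamble. ")
class Example:
    """Class-specific details."""

print(Example.__doc__)  # "Shared model preamble. Class-specific details."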