Upload ModularStarEncoder

modularStarEncoder.py  CHANGED  (+5 -11)
@@ -1,4 +1,4 @@
-from transformers import
+from transformers import Starcoder2Model
 import sys
 from config import ModularStarEncoderConfig
 import os
@@ -13,7 +13,6 @@ from transformers.activations import ACT2FN
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     ModelOutput,
-
     logging,
 
 )
@@ -34,9 +33,6 @@ class StarEncoder2PreTrainedModel(PreTrainedModel):
     _supports_sdpa = True
     _supports_cache_class = True
 
-    # def __init__(self):
-    #     self._supports_flash_attn_2 = True
-    #     super().__init__()
 
 
     def _init_weights(self, module):
@@ -81,7 +77,7 @@ class ModularStarEncoderOutput(ModelOutput):
     prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
         Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
     seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
-        Prediction scores of the
+        Prediction scores of the in-context classification head (scores of True/False continuation
         before SoftMax).
     hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
         Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
@@ -249,11 +245,9 @@ class ModularStarEncoder(StarEncoder2PreTrainedModel):
         config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
         the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
     next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-
-
-
-        - 0 indicates sequence B is a continuation of sequence A,
-        - 1 indicates sequence B is a random sequence.
+        This label is assigned to the in-context loss:
+        - 0 indicates sequence B belongs to the same repository as A,
+        - 1 indicates sequence B comes from a random repository.
     kwargs (`Dict[str, any]`, optional, defaults to *{}*):
         Used to hide legacy arguments that have been deprecated.
 
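
The updated docstrings define two label conventions: `labels` uses `-100` to mark positions excluded from the masked-language-modeling loss, and `next_sentence_label` is `0` when sequence B comes from the same repository as sequence A and `1` when B comes from a random repository, scored against `seq_relationship_logits` of shape `(batch_size, 2)`. The sketch below only illustrates those conventions; it is not code from this commit, the helper names are hypothetical, and how modularStarEncoder.py actually builds its pairs and combines its losses is not shown in this diff.

# Hypothetical illustration of the label conventions documented above;
# not part of modularStarEncoder.py.
import torch
import torch.nn.functional as F

def mlm_labels(input_ids: torch.Tensor, masked_positions: torch.Tensor) -> torch.Tensor:
    """Copy input_ids and set non-masked positions to -100 so the MLM loss
    is only computed on masked tokens (labels in [0, config.vocab_size])."""
    labels = input_ids.clone()
    labels[~masked_positions] = -100
    return labels

def in_context_label(same_repository: bool) -> int:
    """0 = sequence B belongs to the same repository as sequence A,
    1 = sequence B comes from a random repository."""
    return 0 if same_repository else 1

def in_context_loss(seq_relationship_logits: torch.Tensor,
                    next_sentence_label: torch.Tensor) -> torch.Tensor:
    """Cross-entropy over the (batch_size, 2) in-context classification scores."""
    return F.cross_entropy(seq_relationship_logits.view(-1, 2),
                           next_sentence_label.view(-1))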
|