paultltc committed (verified)
Commit 1ed03e7 · Parent(s): 12cba74

Update modeling_modernvbert.py

Files changed (1):
    modeling_modernvbert.py (+214 -60)
modeling_modernvbert.py CHANGED
@@ -1,18 +1,26 @@
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# This file was automatically generated from src/transformers/models/modernvbert/modular_modernvbert.py.
+# Do NOT edit this file manually as any edits will be overwritten by the generation of
+# the file from the modular. If any change should be done, please apply the change to the
+# modular_modernvbert.py file directly. One of our CI enforces this.
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Union
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
-from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM, PreTrainedModel, logging
-from transformers.modeling_outputs import BaseModelOutput
-from transformers.models.bert.modeling_bert import BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput
 
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple
+from ..modernbert import ModernBertConfig, ModernBertForMaskedLM, ModernBertModel
+from ..siglip import SiglipVisionConfig, SiglipVisionModel
 from .configuration_modernvbert import ModernVBertConfig
 
-logger = logging.get_logger(__name__)
-
 
 class DecoupledEmbedding(nn.Embedding):
     # Derived from https://pytorch.org/docs/stable/_modules/torch/nn/modules/sparse.html#Embedding
@@ -97,7 +105,7 @@ class DecoupledEmbedding(nn.Embedding):
         # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
         input_ids[additional_vocab_indices] = 0
         full_vector = F.embedding(input_ids, self.weight)
-        full_vector[additional_vocab_indices] = additional_embeddings # overwrite the records with high indices
+        full_vector[additional_vocab_indices] = additional_embeddings  # overwrite the records with high indices
         return full_vector
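
Note: the decoupled lookup above can be exercised standalone. A minimal sketch with hypothetical sizes (ids at or above num_embeddings are routed to a second, separately trainable table, while the main table can stay frozen):

    import torch
    import torch.nn.functional as F

    num_embeddings, num_additional, dim = 8, 2, 4
    weight = torch.randn(num_embeddings, dim)             # main (optionally frozen) table
    additional_weight = torch.randn(num_additional, dim)  # additional trainable table

    input_ids = torch.tensor([[1, 7, 8, 9]])              # 8 and 9 are "additional" ids
    additional_mask = input_ids >= num_embeddings
    additional_embeds = F.embedding(input_ids[additional_mask] - num_embeddings, additional_weight)

    ids = input_ids.clone()
    ids[additional_mask] = 0                              # any valid index; overwritten below
    full_vector = F.embedding(ids, weight)
    full_vector[additional_mask] = additional_embeds      # overwrite the records with high indices
    print(full_vector.shape)                              # torch.Size([1, 4, 4])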
@@ -124,10 +132,11 @@ class ModernVBertBaseModelOutput(BaseModelOutput):
             sequence_length, hidden_size)`.
         image_hidden_states of the model produced by the vision encoder
     """
+
     last_hidden_state: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
-    attentions: Optional[Tuple[torch.FloatTensor]] = None
-    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
 
 
 @dataclass
@@ -137,7 +146,7 @@ class ModernVBertMaskedLMOutput(MaskedLMOutput):
     Args:
         loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
             Masked language modeling (MLM) loss.
-        logits (`torch.FloatTensor`):
+        logits (`torch.FloatTensor`):
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
@@ -153,15 +162,17 @@ class ModernVBertMaskedLMOutput(MaskedLMOutput):
             sequence_length, hidden_size)`.
         image_hidden_states of the model produced by the vision encoder
     """
+
     loss: Optional[torch.FloatTensor] = None
     logits: torch.FloatTensor = None
-    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
-    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
 
 
 class ModernVBertSimpleMLP(nn.Module):
     """A simple linear projection layer to project the vision hidden states to the text hidden states."""
+
     def __init__(self, input_size, output_size):
         super().__init__()
         self.proj = nn.Linear(input_size, output_size, bias=False)
@@ -175,26 +186,32 @@ class ModernVBertConnector(nn.Module):
     Connector module for ModernVBERT. It performs a pixel shuffle operation followed by a linear projection to match the text model's hidden size.
     Based on https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
     """
+
     def __init__(self, config):
         super().__init__()
-        self.scale_factor = config.pixel_shuffle_factor
+        self.pixel_shuffle_factor = config.pixel_shuffle_factor
         self.modality_projection = ModernVBertSimpleMLP(
-            input_size=config.vision_config.hidden_size * (config.scale_factor**2),
+            input_size=config.vision_config.hidden_size * (config.pixel_shuffle_factor**2),
             output_size=config.text_config.hidden_size,
         )
 
-    def pixel_shuffle(self, x, scale_factor):
+    def pixel_shuffle(self, x, pixel_shuffle_factor):
         bsz, seq, embed_dim = x.size()
         height = width = int(seq**0.5)
         x = x.view(bsz, height, width, embed_dim)
-        x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
+        x = x.view(bsz, height, int(width / pixel_shuffle_factor), embed_dim * pixel_shuffle_factor)
         x = x.permute(0, 2, 1, 3)
-        x = x.reshape(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
+        x = x.reshape(
+            bsz,
+            int(width / pixel_shuffle_factor),
+            int(height / pixel_shuffle_factor),
+            embed_dim * (pixel_shuffle_factor**2),
+        )
         x = x.permute(0, 2, 1, 3)
-        return x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
+        return x.reshape(bsz, int(seq / (pixel_shuffle_factor**2)), embed_dim * (pixel_shuffle_factor**2))
 
     def forward(self, image_hidden_states):
-        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
+        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.pixel_shuffle_factor)
         return self.modality_projection(image_hidden_states)
 
200
 
@@ -217,55 +234,55 @@ class ModernVBertPreTrainedModel(PreTrainedModel):
217
  module.weight.data[module.padding_idx].zero_()
218
 
219
 
 
220
  class ModernVBertModel(ModernVBertPreTrainedModel):
221
  def __init__(self, config: ModernVBertConfig):
222
  super().__init__(config)
 
 
223
  self.vision_model = ModernVBertModel.init_vision_model(config)
224
  self.connector = ModernVBertConnector(config)
225
  self.text_model = ModernVBertModel.init_language_model(config)
226
- self.image_seq_len = int(
227
- ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
228
- )
229
- self.image_token_id = config.image_token_id
230
- self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
231
  # set the correct dtype for vision and text models
232
  self.vision_model.to(self.dtype)
233
  self.text_model.to(self.dtype)
 
 
 
 
 
 
 
234
  self.post_init()
235
 
236
  @staticmethod
237
  def init_vision_model(config: ModernVBertConfig):
238
- vision_model_config = AutoConfig.from_pretrained(
239
  config.vision_config.vision_model_name,
240
  _attn_implementation=config._attn_implementation,
241
  )
242
- vision_model = AutoModel.from_config(
243
- vision_model_config,
244
- trust_remote_code=True,
245
- )
246
- return getattr(vision_model, "vision_model", vision_model)
247
 
248
  @staticmethod
249
  def init_language_model(config: ModernVBertConfig):
250
- text_model_config = AutoConfig.from_pretrained(
251
  config.text_config.text_model_name,
252
  _attn_implementation=config._attn_implementation,
253
- trust_remote_code=True,
254
- )
255
- text_model = AutoModel.from_config(
256
- text_model_config,
257
- trust_remote_code=True
258
  )
 
259
  embed_layer = DecoupledEmbedding(
260
  num_embeddings=text_model_config.vocab_size,
261
  num_additional_embeddings=config.additional_vocab_size,
262
  embedding_dim=config.hidden_size,
263
- partially_freeze=config.freeze_config["freeze_text_layers"],
264
  padding_idx=config.pad_token_id,
265
  )
266
  text_model.set_input_embeddings(embed_layer)
267
  return text_model
268
-
 
269
  def enable_input_require_grads(self):
270
  """
271
  Enables the gradients for the input embeddings.
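
Note: init_vision_model and init_language_model now build SiglipVisionModel and ModernBertModel explicitly instead of going through AutoModel with trust_remote_code, and the freeze_config lookup gains a safe default. The image_seq_len formula is unchanged apart from the pixel_shuffle_factor rename; a worked example with hypothetical config values:

    # Hypothetical values: image_size=512, patch_size=16, pixel_shuffle_factor=2
    image_seq_len = int(((512 // 16) ** 2) / (2**2))
    print(image_seq_len)  # 256 image tokens: a 32x32 patch grid, reduced 4x by the pixel shuffle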
@@ -292,12 +309,65 @@ class ModernVBertModel(ModernVBertPreTrainedModel):
             make_inputs_require_grads
         )
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2Model.disable_input_require_grads
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
     def get_input_embeddings(self):
         return self.text_model.get_input_embeddings()
 
     def set_input_embeddings(self, value):
         self.text_model.set_input_embeddings(value)
 
+    def get_image_features(
+        self, pixel_values: torch.FloatTensor, pixel_attention_mask: Optional[torch.LongTensor] = None
+    ):
+        """
+        Derived from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/smolvlm/modeling_smolvlm.py
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            pixel_attention_mask (`torch.LongTensor`, *optional*):
+                The attention mask indicating padded regions in the image.
+        """
+        batch_size, num_images, num_channels, height, width = pixel_values.shape
+        pixel_values = pixel_values.to(dtype=self.dtype)  # fp16 compatibility
+        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
+
+        # Remove padding images - padding images are full 0.
+        nb_values_per_image = pixel_values.shape[1:].numel()
+        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+
+        if not any(real_images_inds):
+            real_images_inds[0] = True
+
+        pixel_values = pixel_values[real_images_inds].contiguous()
+        # Handle the vision attention mask
+        if pixel_attention_mask is None:
+            pixel_attention_mask = torch.ones(
+                size=[pixel_values.shape[i] for i in (0, 2, 3)],
+                dtype=torch.bool,
+                device=pixel_values.device,
+            )
+        else:
+            # Remove padding images from the mask
+            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
+            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+        patch_size = self.config.vision_config.patch_size
+        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+        # Get sequence from the vision encoder
+        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
+        image_hidden_states = image_hidden_states.last_hidden_state
+
+        return image_hidden_states
+
     def inputs_merger(self, input_ids, inputs_embeds, image_hidden_states):
         """Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/smolvlm/modeling_smolvlm.py
@@ -311,21 +381,47 @@ class ModernVBertModel(ModernVBertPreTrainedModel):
         """
 
         _, patch_size, _ = image_hidden_states.shape
-        image_mask = input_ids == self.image_token_id
+
+        if input_ids is None:
+            image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            image_mask = image_mask[..., 0]  # slice off the hidden dim
+        else:
+            image_mask = input_ids == self.config.image_token_id
+
+        # Assert that the input <image> tokens are valid (i.e. multiple of patch_size)
         num_image_tokens = image_mask.sum(dim=1)
         if not torch.all(num_image_tokens % patch_size == 0):
             raise ValueError("Number of <image> tokens not divisible by patch_size.")
+
         blocks_per_sample = num_image_tokens // patch_size
+
         offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
         block_offset = offsets[:-1]
         row_cum = image_mask.cumsum(dim=-1)
         chunk_idx = (row_cum - 1) // patch_size
         local_idx = (row_cum - 1) % patch_size
         block_idx = block_offset.unsqueeze(1) + chunk_idx
+
         image_embeds = torch.zeros_like(inputs_embeds)
         image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]
+
         return torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
 
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="""
+        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
+        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
+        max_num_images is the maximum number of images among the batch_size samples in the batch.
+        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
+        For efficiency, we only pass through the vision_model's forward the real images by
+        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
+        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
+        """,
+        checkpoint="modernvbert/ModernVBert",
+    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
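
Note: inputs_merger now reads image_token_id from self.config and adds an inputs_embeds-only path that recovers the image-token mask from the embeddings themselves. The block/position index math is unchanged; a worked example with a hypothetical patch_size of 2:

    import torch

    patch_size = 2
    image_mask = torch.tensor([[False, True, True, False, True, True],
                               [True, True, False, False, False, False]])

    num_image_tokens = image_mask.sum(dim=1)            # tensor([4, 2])
    blocks_per_sample = num_image_tokens // patch_size  # tensor([2, 1])
    offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
    block_offset = offsets[:-1]                         # tensor([0, 2])

    row_cum = image_mask.cumsum(dim=-1)
    chunk_idx = (row_cum - 1) // patch_size
    local_idx = (row_cum - 1) % patch_size
    block_idx = block_offset.unsqueeze(1) + chunk_idx

    print(block_idx[image_mask])  # tensor([0, 0, 1, 1, 2, 2]) -> which image block
    print(local_idx[image_mask])  # tensor([0, 1, 0, 1, 0, 1]) -> position inside the block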
@@ -338,28 +434,44 @@ class ModernVBertModel(ModernVBertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         if inputs_embeds is None:
             inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)
+
+        # Images processing
         if pixel_values is not None:
-            batch_size, num_images, _, _, _ = pixel_values.shape
-            pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
-            nb_values_per_image = pixel_values.shape[1:].numel()
-            real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
-            if not any(real_images_inds):
-                real_images_inds[0] = True
-            pixel_values = pixel_values[real_images_inds].contiguous()
-            image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
+            # Vision encoder pass
+            image_hidden_states = self.get_image_features(
+                pixel_values=pixel_values, pixel_attention_mask=pixel_attention_mask
+            )
+            # Modality projection & resampling
             image_hidden_states = self.connector(image_hidden_states)
-        elif image_hidden_states is not None:
-            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
-        if inputs_embeds is not None and image_hidden_states is not None:
-            inputs_embeds = self.inputs_merger(input_ids, inputs_embeds, image_hidden_states)
+
+        # Merge image and text embeddings
+        if image_hidden_states is not None:
+            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=inputs_embeds.device)
+            inputs_embeds = self.inputs_merger(
+                input_ids=input_ids, inputs_embeds=inputs_embeds, image_hidden_states=image_hidden_states
+            )
+
+        # Language model pass
         outputs = self.text_model(
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
@@ -367,9 +479,9 @@ class ModernVBertModel(ModernVBertPreTrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            **kwargs,
         )
-        if not return_dict:
-            return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
+
         return ModernVBertBaseModelOutput(
             last_hidden_state=outputs.last_hidden_state,
             hidden_states=outputs.hidden_states,
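
Note: the manual `if not return_dict: return tuple(...)` branch is gone; @can_return_tuple converts the ModelOutput to a tuple when return_dict=False. A hypothetical smoke test of the new forward path (the repo id, trust_remote_code loading, and dummy inputs are illustrative assumptions, not part of this diff; real inputs should come from the model's processor):

    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained("ModernVBERT/modernvbert", trust_remote_code=True)
    vision_cfg = model.config.vision_config

    input_ids = torch.randint(0, 1000, (1, 8))  # no <image> tokens in this toy input
    pixel_values = torch.zeros(1, 2, 3, vision_cfg.image_size, vision_cfg.image_size)
    pixel_values[:, 0] = torch.randn(3, vision_cfg.image_size, vision_cfg.image_size)  # one real image, one padding

    out = model(input_ids=input_ids, pixel_values=pixel_values)
    print(out.last_hidden_state.shape)    # (1, 8, hidden_size)
    print(out.image_hidden_states.shape)  # (1, image_seq_len, hidden_size)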
@@ -377,11 +489,12 @@ class ModernVBertModel(ModernVBertPreTrainedModel):
             image_hidden_states=image_hidden_states,
         )
 
+
 class ModernVBertLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
-        pretrained_config = AutoConfig.from_pretrained(config.text_config.text_model_name, trust_remote_code=True)
-        pretrained_model = AutoModelForMaskedLM.from_config(pretrained_config, trust_remote_code=True)
+        pretrained_config = ModernBertConfig.from_pretrained(config.text_config.text_model_name)
+        pretrained_model = ModernBertForMaskedLM(pretrained_config)
         self.head = pretrained_model.head
         self.decoder = pretrained_model.decoder
 
@@ -389,10 +502,12 @@ class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
         return self.decoder(self.head(hidden_states))
 
 
+@auto_docstring
 class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
+    _tied_weights_keys = ["lm_head.decoder.weight", "model.text_model.embeddings.word_embeddings.weight"]
+
     def __init__(self, config):
         super().__init__(config)
-        self.image_token_id = config.image_token_id
         self.in_features = config.hidden_size
         self.out_additional_features = config.additional_vocab_size
         self.vocab_size = config.vocab_size
@@ -403,6 +518,24 @@ class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
         self.lm_head.to(self.dtype)
         self.post_init()
 
+    # Copied from transformers.models.idefics2.modeling_idefics2.Idefics2ForConditionalGeneration.disable_input_require_grads
+    def disable_input_require_grads(self):
+        self._text_require_grads_hook.remove()
+        self._vision_require_grads_hook.remove()
+
+    @can_return_tuple
+    @auto_docstring(
+        custom_intro="""
+        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
+        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
+        max_num_images is the maximum number of images among the batch_size samples in the batch.
+        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
+        For efficiency, we only pass through the vision_model's forward the real images by
+        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
+        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
+        """,
+        checkpoint="modernvbert/ModernVBert",
+    )
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -416,7 +549,19 @@ class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         labels: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, ModernVBertMaskedLMOutput]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, ModernVBertMaskedLMOutput]:
+        r"""
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+            The hidden states of the image encoder after modality projection.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -434,23 +579,32 @@ class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            **kwargs,
         )
         hidden_states = outputs[0]
+
         logits = self.lm_head(hidden_states)
+
         if self.out_additional_features > 0:
             proj_states = self.lm_head.head(hidden_states)
             additional_features = self.additional_fc(proj_states)
             logits = torch.cat((logits, additional_features), -1)
+
         loss = None
         if labels is not None:
             loss = CrossEntropyLoss()(logits.view(-1, self.vocab_size + self.out_additional_features), labels.view(-1))
+
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
+
         return ModernVBertMaskedLMOutput(
             loss=loss,
             logits=logits.float(),
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,
-        )
+        )
+
+
+__all__ = ["ModernVBertPreTrainedModel", "ModernVBertModel", "ModernVBertForMaskedLM"]
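
Note: the loss is computed over the widened vocabulary (vocab_size + additional_vocab_size), and CrossEntropyLoss ignores label positions set to -100 by default. A sketch with hypothetical sizes (vocab_size=10 plus 2 additional embeddings gives 12 classes):

    import torch
    from torch.nn import CrossEntropyLoss

    vocab_size, additional, seq = 10, 2, 5
    logits = torch.randn(1, seq, vocab_size + additional)
    labels = torch.tensor([[1, -100, 3, -100, 11]])  # -100 positions are ignored

    loss = CrossEntropyLoss()(logits.view(-1, vocab_size + additional), labels.view(-1))
    print(loss.item())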
 
 
 
 