Adirazgold committed · Commit 087af17 · verified · 1 Parent(s): 0f8595a
README.md CHANGED
@@ -12,7 +12,7 @@ Granite-vision-3.3-2b-embedding is an efficient embedding model based on granite
 By removing the need for OCR-based text extractions, granite-vision-3.3-2b-embedding can help simplify and accelerate RAG pipelines.
 
 **Evaluations:**
-We evaluated granite-vision-3.3-2b-embedding alongside other top colBERT style multi-modal embedding models in the 1B-4B parameter range using two benchmark: Vidore2 and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342) which aim to specifically address complex multimodal document retrieval tasks.
+We evaluated granite-vision-3.3-2b-embedding alongside other top ColBERT-style multi-modal embedding models in the 1B-4B parameter range using two benchmarks: [Vidore2](https://github.com/illuin-tech/vidore-benchmark/) and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342) ([dataset](https://huggingface.co/collections/ibm-research/real-mm-rag-bench-67d2dc0ddf2dfafe66f09d34)), which specifically target complex multimodal document retrieval tasks.
 
 ## **NDCG@5 - ViDoRe V2**
 | Collection \ Model | ColPali-v1.3 | ColQwen2.5-v0.2 | ColNomic-3b | ColSmolvlm-v0.1 | granite-vision-3.3-2b-embedding |
@@ -102,7 +102,7 @@ print(f"📊 Similarity between image and text: {similarity.item():.4f}")
 print("=" * 50)
 ```
 ### Use granite-vision-3.3-2b-embedding for MM RAG
-For an example of MM-RAG using granite-vision-3.3-2b-embedding refer to [this notebook](......).
+For an example of MM-RAG using granite-vision-3.3-2b-embedding, refer to [this notebook](https://github.com/ibm-granite/granite-vision-models/tree/main/cookbooks/GraniteVisionEmbedding_MM-RAG_Notebook).
 
 **Model Architecture:**
 The architecture of granite-vision-3.3-2b-embedding follows the ColPali (https://arxiv.org/abs/2407.01449) approach and consists of the following components:
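For reference, the quick-start touched by the second hunk ends by printing an image-text similarity. A minimal sketch of that flow, assuming the Hugging Face repo id `ibm-granite/granite-vision-3.3-2b-embedding` and ColPali-style processor helpers (`process_images`, `process_queries`, and `score` are assumed names, not confirmed by this diff):

```python
# Sketch only: the repo id and processor helper names below are assumptions.
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model_id = "ibm-granite/granite-vision-3.3-2b-embedding"  # assumed repo id
model = AutoModel.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

image = Image.open("page.png")   # a document-page image
query = "What was Q3 revenue?"   # a retrieval query

with torch.no_grad():
    img_emb = model(**processor.process_images([image]))   # multi-vector page embedding
    txt_emb = model(**processor.process_queries([query]))  # multi-vector query embedding

# Late-interaction similarity between the query and the page.
similarity = processor.score(txt_emb, img_emb)
print(f"📊 Similarity between image and text: {similarity.item():.4f}")
```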
config.json CHANGED
@@ -1,18 +1,18 @@
 {
-  "_name_or_path": "ibm-granite/granite-vision-3.3-2b",
+  "_name_or_path": "ibm_granite/granite-vision-3.3-2b",
   "adapter_path": null,
-  "auto_map": {
-    "AutoModel": "modeling_colgranitevision.ColGraniteVision",
-    "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
-    "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
+  "auto_map": {
+    "AutoModel": "modeling_granite_vision_embedding.GraniteVisionEmb",
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor",
+    "AutoConfig": "granite_vision_embedding_config.GraniteVisionEmbConfig"
   },
   "architectures": [
-    "ColGraniteVision"
+    "GraniteVisionEmb"
   ],
+  "base_image_feature_location": "last",
   "base_model": null,
   "emb_dim_doc": 128,
   "emb_dim_query": 128,
-  "base_image_feature_location": "last",
   "image_grid_pinpoints": [
     [
       384,
@@ -121,7 +121,7 @@
   ],
   "image_seq_length": 576,
   "image_token_index": 49155,
-  "model_type": "colgranitevision",
+  "model_type": "granitevisionemb",
   "multimodal_projector_bias": true,
   "pretrained_language_model": "",
   "pretrained_vision_tower": "",
@@ -149,12 +149,12 @@
     "rms_norm_eps": 1e-05,
     "rope_theta": 300000,
     "tie_word_embeddings": true,
-    "torch_dtype": "float32",
+    "torch_dtype": "bfloat16",
     "vocab_size": 49156
   },
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.50.0.dev0",
+  "transformers_version": "4.49.0",
   "use_image_newline_parameter": true,
   "vision_config": {
     "_attn_implementation_autoset": true,
@@ -167,7 +167,7 @@
     "num_attention_heads": 16,
     "num_hidden_layers": 27,
     "patch_size": 14,
-    "torch_dtype": "float32"
+    "torch_dtype": "bfloat16"
   },
   "vision_feature_layer": [
     -24,
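The renamed `auto_map` block above is what remote-code loading resolves against: each Auto class imports its implementation from the renamed files inside the model repo. A minimal sketch of that resolution, assuming the repo id:

```python
# With trust_remote_code=True, the Auto classes follow config.json's "auto_map":
#   AutoConfig    -> granite_vision_embedding_config.GraniteVisionEmbConfig
#   AutoModel     -> modeling_granite_vision_embedding.GraniteVisionEmb
#   AutoProcessor -> processing_granite_vision_embedding.GraniteVisionEmbProcessor
from transformers import AutoConfig, AutoModel, AutoProcessor

model_id = "ibm-granite/granite-vision-3.3-2b-embedding"  # assumed repo id

config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
assert config.model_type == "granitevisionemb"  # the renamed model_type above

model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```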
colgranitevision_config.py → granite_vision_embedding_config.py RENAMED
@@ -1,8 +1,8 @@
 from transformers import LlavaNextConfig
 
 
-class ColGraniteVisionConfig(LlavaNextConfig):
-    model_type = "colgranitevision"
+class GraniteVisionEmbConfig(LlavaNextConfig):
+    model_type = "granitevisionemb"
 
     def __init__(self, **kwargs):
         self.base_model = kwargs.get("base_model", None)
@@ -11,3 +11,5 @@ class ColGraniteVisionConfig(LlavaNextConfig):
         self.base_image_feature_location = kwargs.get("base_image_feature_location", "last")
         self.adapter_path = kwargs.get("adapter_path", None)
         super().__init__(**kwargs)
+
+
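Since every extra field is read with `kwargs.get(..., default)` before being forwarded to `LlavaNextConfig`, the renamed config can be constructed bare and still expose its defaults. A small sketch, assuming the renamed file is on the import path:

```python
# Hypothetical local usage of the renamed config class.
from granite_vision_embedding_config import GraniteVisionEmbConfig

cfg = GraniteVisionEmbConfig()
assert cfg.model_type == "granitevisionemb"
assert cfg.base_model is None
assert cfg.base_image_feature_location == "last"
assert cfg.adapter_path is None

# Overrides travel through the same kwargs:
cfg2 = GraniteVisionEmbConfig(base_image_feature_location="first")
assert cfg2.base_image_feature_location == "first"
```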
modeling_colgranitevision.py → modeling_granite_vision_embedding.py RENAMED
@@ -7,11 +7,10 @@ from transformers import LlavaNextPreTrainedModel
 from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
 from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
 
-from .colgranitevision_config import ColGraniteVisionConfig
-
+from .granite_vision_embedding_config import GraniteVisionEmbConfig
 
 class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-
+
     def pack_image_features(
         self,
         image_features,
@@ -93,15 +92,15 @@ class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
         return image_features, feature_lens
 
 
-class ColGraniteVision(LlavaNextPreTrainedModel):
+class GraniteVisionEmb(LlavaNextPreTrainedModel):
     """
-    ColGraniteVision model implementation.
+    GraniteVisionEmb model implementation.
     """
 
     main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
-    config_class = ColGraniteVisionConfig
+    config_class = GraniteVisionEmbConfig
 
-    def __init__(self, config: ColGraniteVisionConfig):
+    def __init__(self, config: GraniteVisionEmbConfig):
         super().__init__(config=config)
 
         model = LlavaNextWithCustomPacking(config=config)
@@ -109,8 +108,6 @@ class ColGraniteVision(LlavaNextPreTrainedModel):
         self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
         self.model = model
 
-        # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
-        # We could do it now but it would break all the models trying to load the model from the checkpoint.
         self.dim = 128
         self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
 
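The `__init__` kept by this hunk attaches `custom_text_proj`, a linear head from the language model's hidden size down to `dim = 128`. The forward pass is outside the hunk, but the ColPali-style pattern it implies is to project the backbone's last hidden states to 128 dimensions and L2-normalize each token vector, yielding multi-vector embeddings. A sketch under those assumptions (the hidden size here is illustrative):

```python
# Illustration of the projection head, not the repo's forward() itself.
import torch
import torch.nn as nn

hidden_size, dim = 2048, 128                     # hidden_size is illustrative
custom_text_proj = nn.Linear(hidden_size, dim)

last_hidden = torch.randn(2, 729, hidden_size)   # (batch, seq_len, hidden)
proj = custom_text_proj(last_hidden)             # (batch, seq_len, 128)
emb = proj / proj.norm(dim=-1, keepdim=True)     # unit-norm per token
print(emb.shape)                                 # torch.Size([2, 729, 128])
```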
preprocessor_config.json CHANGED
@@ -127,7 +127,7 @@
     0.5,
     0.5
   ],
-  "processor_class": "ColGraniteVisionProcessor",
+  "processor_class": "GraniteVisionEmbProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
processing_colgranitevision.py → processing_granite_vision_embedding.py RENAMED
@@ -21,7 +21,7 @@ def floor_by_factor(number: float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
-class ColGraniteVisionProcessor(LlavaNextProcessor):
+class GraniteVisionEmbProcessor(LlavaNextProcessor):
     """
     Processor for ColPali.
     """
@@ -140,14 +140,14 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
             max_size=self.max_size,
             fill_color=0
         )
-
+
     def resize_and_pad_centered_to_long_side(
-        self,
-        image: Image.Image,
-        factor: int,
-        min_size: int,
-        max_size: int,
-        fill_color=0
+        self,
+        image: Image.Image,
+        factor: int,
+        min_size: int,
+        max_size: int,
+        fill_color=0
     ) -> Image.Image:
         """
         Resizes and pads an image such that:
@@ -183,10 +183,10 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
 
         # Resize the image
         resized_image = image.resize((target_width, target_height), Image.LANCZOS)
-        final_image =resized_image.convert("RGB")
+        final_image = resized_image.convert("RGB")
 
         return final_image
-
+
     def resize_and_pad_centered(self,
         image: Image.Image,
         factor: int,
@@ -439,4 +439,4 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
 
         scores = scores.to(torch.float32)
-        return scores
+        return scores
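The last hunk shows only the tail of the processor's scoring routine (the cast to `float32` and the return). For orientation, ColPali-style retrieval scores queries against documents with late-interaction MaxSim over the multi-vector embeddings; a self-contained sketch of that computation, not the repo's exact implementation:

```python
# MaxSim late interaction: for each query token, take its best-matching
# document token's similarity, then sum over query tokens.
import torch

def maxsim_scores(qs: list[torch.Tensor], ps: list[torch.Tensor]) -> torch.Tensor:
    """qs: per-query (n_q_tokens, dim) embeddings; ps: per-doc (n_d_tokens, dim)."""
    scores = torch.zeros(len(qs), len(ps))
    for i, q in enumerate(qs):
        for j, p in enumerate(ps):
            sim = q @ p.T                            # (n_q_tokens, n_d_tokens)
            scores[i, j] = sim.max(dim=1).values.sum()
    return scores.to(torch.float32)

qs = [torch.randn(5, 128), torch.randn(7, 128)]
ps = [torch.randn(100, 128)]
scores = maxsim_scores(qs, ps)
assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
```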
processor_config.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "processor_class": "ColGraniteVisionProcessor",
+  "processor_class": "GraniteVisionEmbProcessor",
   "auto_map": {
-    "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor"
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor"
   }
 }