model name refactoring (#9)
- refactor code (ed9e65ece65a6aa036374f03e5a11c6d6d38037d)
- update modeling (f6e6df119cb052a1bcbac7e497e1b4fefdeb7bb5)
- update readme (4f3aaec32267bf7d09fe740d647c74f98261a34e)
- README.md +2 -2
- config.json +11 -11
- colgranitevision_config.py → granite_vision_embedding_config.py +4 -2
- modeling_colgranitevision.py → modeling_granite_vision_embedding.py +6 -9
- preprocessor_config.json +1 -1
- processing_colgranitevision.py → processing_granite_vision_embedding.py +11 -11
- processor_config.json +2 -2
README.md CHANGED
@@ -12,7 +12,7 @@ Granite-vision-3.3-2b-embedding is an efficient embedding model based on granite
 By removing the need for OCR-based text extractions, granite-vision-3.3-2b-embedding can help simplify and accelerate RAG pipelines.
 
 **Evaluations:**
-We evaluated granite-vision-3.3-2b-embedding alongside other top colBERT style multi-modal embedding models in the 1B-4B parameter range using two benchmark: Vidore2 and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342) which aim to specifically address complex multimodal document retrieval tasks.
+We evaluated granite-vision-3.3-2b-embedding alongside other top colBERT style multi-modal embedding models in the 1B-4B parameter range using two benchmark: [Vidore2](https://github.com/illuin-tech/vidore-benchmark/) and [Real-MM-RAG-Bench](https://arxiv.org/abs/2502.12342)([dataset](https://huggingface.co/collections/ibm-research/real-mm-rag-bench-67d2dc0ddf2dfafe66f09d34)) which aim to specifically address complex multimodal document retrieval tasks.
 
 ## **NDCG@5 - ViDoRe V2**
 | Collection \ Model | ColPali-v1.3 | ColQwen2.5-v0.2 | ColNomic-3b | ColSmolvlm-v0.1 | granite-vision-3.3-2b-embedding |
@@ -102,7 +102,7 @@ print(f"📊 Similarity between image and text: {similarity.item():.4f}")
 print("=" * 50)
 ```
 ### Use granite-vision-embedding-3.3-2b for MM RAG
-For an example of MM-RAG using granite-vision-3.3-2b-embedding refer to [this notebook](
+For an example of MM-RAG using granite-vision-3.3-2b-embedding refer to [this notebook](https://github.com/ibm-granite/granite-vision-models/tree/main/cookbooks/GraniteVisionEmbedding_MM-RAG_Notebook).
 
 **Model Architecture:**
 The architecture of granite-vision-3.3-2b-embedding follows ColPali(https://arxiv.org/abs/2407.01449) approach and consists of the following components:
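The README hunk above reports NDCG@5 on ViDoRe V2. For reference, here is a minimal sketch of that metric; this is the standard definition, not code from this repository, and the function name and inputs are illustrative.

```python
import math

def ndcg_at_5(relevances: list[float]) -> float:
    """NDCG@5 for one query: `relevances` holds the graded relevance of the
    retrieved documents in ranked order (higher is more relevant)."""
    # Discounted cumulative gain over the top 5 ranks.
    dcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(relevances[:5]))
    # Ideal DCG: the best achievable ordering of the same relevance judgments.
    ideal = sorted(relevances, reverse=True)[:5]
    idcg = sum(rel / math.log2(rank + 2) for rank, rel in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0
```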
config.json CHANGED
@@ -1,18 +1,18 @@
 {
-  "_name_or_path": "
+  "_name_or_path": "ibm_granite/granite-vision-3.3-2b",
   "adapter_path": null,
-
-  "AutoModel": "
-  "AutoProcessor": "
-  "AutoConfig": "
+  "auto_map": {
+    "AutoModel": "modeling_granite_vision_embedding.GraniteVisionEmb",
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor",
+    "AutoConfig": "granite_vision_embedding_config.GraniteVisionEmbConfig"
   },
   "architectures": [
-    "
+    "GraniteVisionEmb"
   ],
+  "base_image_feature_location": "last",
   "base_model": null,
   "emb_dim_doc": 128,
   "emb_dim_query": 128,
-  "base_image_feature_location": "last",
   "image_grid_pinpoints": [
     [
       384,
@@ -121,7 +121,7 @@
   ],
   "image_seq_length": 576,
   "image_token_index": 49155,
-  "model_type": "
+  "model_type": "granitevisionemb",
   "multimodal_projector_bias": true,
   "pretrained_language_model": "",
   "pretrained_vision_tower": "",
@@ -149,12 +149,12 @@
     "rms_norm_eps": 1e-05,
     "rope_theta": 300000,
     "tie_word_embeddings": true,
-    "torch_dtype": "
+    "torch_dtype": "bfloat16",
     "vocab_size": 49156
   },
   "tie_word_embeddings": true,
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.49.0",
   "use_image_newline_parameter": true,
   "vision_config": {
     "_attn_implementation_autoset": true,
@@ -167,7 +167,7 @@
     "num_attention_heads": 16,
     "num_hidden_layers": 27,
     "patch_size": 14,
-    "torch_dtype": "
+    "torch_dtype": "bfloat16"
   },
   "vision_feature_layer": [
     -24,
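The `auto_map` block above is what lets the `transformers` Auto classes resolve the renamed custom classes from the repository code. A minimal loading sketch, assuming the published repo id (which is not confirmed by this diff) and that remote code is trusted:

```python
from transformers import AutoConfig, AutoModel, AutoProcessor

# Assumed repo id; substitute the actual Hub path if it differs.
repo_id = "ibm-granite/granite-vision-3.3-2b-embedding"

config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type)  # -> "granitevisionemb", per the config.json above

# trust_remote_code=True is required because the classes live in the repo,
# not in the transformers library itself.
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
```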
colgranitevision_config.py → granite_vision_embedding_config.py RENAMED
@@ -1,8 +1,8 @@
 from transformers import LlavaNextConfig
 
 
-class ColGraniteVisionConfig(LlavaNextConfig):
-    model_type = "
+class GraniteVisionEmbConfig(LlavaNextConfig):
+    model_type = "granitevisionemb"
 
     def __init__(self, **kwargs):
         self.base_model = kwargs.get("base_model", None)
@@ -11,3 +11,5 @@ class ColGraniteVisionConfig(LlavaNextConfig):
         self.base_image_feature_location = kwargs.get("base_image_feature_location", "last")
         self.adapter_path = kwargs.get("adapter_path", None)
         super().__init__(**kwargs)
+
+
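For local development against the renamed files, the new `model_type` can also be registered with the Auto classes directly. A small sketch, assuming the renamed modules are importable from the working directory (the registration calls are standard `transformers` API, not code from this commit):

```python
from transformers import AutoConfig, AutoModel

# Local imports of the renamed modules shown in this diff.
from granite_vision_embedding_config import GraniteVisionEmbConfig
from modeling_granite_vision_embedding import GraniteVisionEmb

# Map the new model_type string to the config class, and the config class to the
# model class, so AutoConfig/AutoModel resolve them without relying on auto_map.
AutoConfig.register("granitevisionemb", GraniteVisionEmbConfig)
AutoModel.register(GraniteVisionEmbConfig, GraniteVisionEmb)
```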
modeling_colgranitevision.py → modeling_granite_vision_embedding.py RENAMED
@@ -7,11 +7,10 @@ from transformers import LlavaNextPreTrainedModel
 from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
 from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
 
-from .
-
+from .granite_vision_embedding_config import GraniteVisionEmbConfig
 
 class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-
+
     def pack_image_features(
         self,
         image_features,
@@ -93,15 +92,15 @@ class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
         return image_features, feature_lens
 
 
-class ColGraniteVision(LlavaNextPreTrainedModel):
+class GraniteVisionEmb(LlavaNextPreTrainedModel):
     """
-
+    GraniteVisionEmb model implementation.
     """
 
     main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
-    config_class = ColGraniteVisionConfig
+    config_class = GraniteVisionEmbConfig
 
-    def __init__(self, config: ColGraniteVisionConfig):
+    def __init__(self, config: GraniteVisionEmbConfig):
         super().__init__(config=config)
 
         model = LlavaNextWithCustomPacking(config=config)
@@ -109,8 +108,6 @@ class ColGraniteVision(LlavaNextPreTrainedModel):
         self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
         self.model = model
 
-        # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
-        # We could do it now but it would break all the models trying to load the model from the checkpoint.
         self.dim = 128
         self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
 
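The renamed class keeps the ColPali-style design: a `LlavaNextWithCustomPacking` backbone plus a `custom_text_proj` linear head that maps the last hidden states into a 128-dimensional multi-vector space. A rough sketch of how such a head is typically applied; `embed` and its body are illustrative and are not taken from the repository's forward pass:

```python
import torch
import torch.nn.functional as F

def embed(model, **inputs):
    """Illustrative only: project backbone hidden states to per-token 128-dim vectors."""
    outputs = model.model(**inputs, output_hidden_states=True)  # LlavaNextWithCustomPacking backbone
    hidden = outputs.hidden_states[-1]                          # (batch, seq_len, hidden_size)
    emb = model.custom_text_proj(hidden)                        # (batch, seq_len, 128)
    emb = F.normalize(emb, dim=-1)                              # L2-normalise each token vector
    return emb * inputs["attention_mask"].unsqueeze(-1)         # zero out padding positions
```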
preprocessor_config.json CHANGED
@@ -127,7 +127,7 @@
     0.5,
     0.5
   ],
-  "processor_class": "
+  "processor_class": "GraniteVisionEmbProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
processing_colgranitevision.py → processing_granite_vision_embedding.py RENAMED
@@ -21,7 +21,7 @@ def floor_by_factor(number: float, factor: int) -> int:
     return math.floor(number / factor) * factor
 
 
-class ColGraniteVisionProcessor(LlavaNextProcessor):
+class GraniteVisionEmbProcessor(LlavaNextProcessor):
     """
     Processor for ColPali.
     """
@@ -140,14 +140,14 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
             max_size=self.max_size,
             fill_color=0
         )
-
+
     def resize_and_pad_centered_to_long_side(
-
-
-
-
-
-
+        self,
+        image: Image.Image,
+        factor: int,
+        min_size: int,
+        max_size: int,
+        fill_color=0
     ) -> Image.Image:
         """
         Resizes and pads an image such that:
@@ -183,10 +183,10 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
 
         # Resize the image
         resized_image = image.resize((target_width, target_height), Image.LANCZOS)
-        final_image =resized_image.convert("RGB")
+        final_image = resized_image.convert("RGB")
 
         return final_image
-
+
     def resize_and_pad_centered(self,
                                 image: Image.Image,
                                 factor: int,
@@ -439,4 +439,4 @@ class ColGraniteVisionProcessor(LlavaNextProcessor):
         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
 
         scores = scores.to(torch.float32)
-        return scores
+        return scores
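The scoring path that ends in `return scores` above corresponds to the ColBERT-style late interaction used by ColPali-family models: each query token is matched against its best document token and the maxima are summed. A self-contained sketch of that computation; the function name and tensor shapes are assumptions, not the processor's exact implementation:

```python
import torch

def late_interaction_scores(qs: list[torch.Tensor], ds: list[torch.Tensor]) -> torch.Tensor:
    """qs[i]: (query_len_i, dim) query embeddings; ds[j]: (doc_len_j, dim) document embeddings."""
    scores = torch.zeros(len(qs), len(ds))
    for i, q in enumerate(qs):
        for j, d in enumerate(ds):
            # Token-level similarity matrix, then MaxSim: best document token per query token.
            scores[i, j] = (q @ d.T).max(dim=1).values.sum()
    return scores.to(torch.float32)  # matches the float32 cast shown in the diff above
```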
processor_config.json CHANGED
@@ -1,6 +1,6 @@
 {
-  "processor_class": "
+  "processor_class": "GraniteVisionEmbProcessor",
   "auto_map": {
-    "AutoProcessor": "
+    "AutoProcessor": "processing_granite_vision_embedding.GraniteVisionEmbProcessor"
   }
 }