Adi Raz Goldfarb [email protected] committed
Commit a479d88 · 1 Parent(s): 345c9a7

remove files

.ipynb_checkpoints/config-checkpoint.json DELETED
@@ -1,178 +0,0 @@
- {
-   "_name_or_path": "ibm-granite/granite-vision-3.3-2b",
-   "adapter_path": null,
-   "auto_map": {
-     "AutoModel": "modeling_colgranitevision.ColGraniteVision",
-     "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor",
-     "AutoConfig": "colgranitevision_config.ColGraniteVisionConfig"
-   },
-   "architectures": ["ColGraniteVision"],
-   "base_model": null,
-   "emb_dim_doc": 128,
-   "emb_dim_query": 128,
-   "image_grid_pinpoints": [
-     [384, 768], [384, 1152], [384, 1536], [384, 1920], [384, 2304], [384, 2688],
-     [384, 3072], [384, 3456], [384, 3840], [768, 384], [768, 768], [768, 1152],
-     [768, 1536], [768, 1920], [1152, 384], [1152, 768], [1152, 1152], [1536, 384],
-     [1536, 768], [1920, 384], [1920, 768], [2304, 384], [2688, 384], [3072, 384],
-     [3456, 384], [3840, 384]
-   ],
-   "image_seq_length": 576,
-   "image_token_index": 49155,
-   "model_type": "colgranitevision",
-   "multimodal_projector_bias": true,
-   "pretrained_language_model": "",
-   "pretrained_vision_tower": "",
-   "projector_hidden_act": "gelu",
-   "text_config": {
-     "_attn_implementation_autoset": true,
-     "_name_or_path": "ibm-granite/granite-3.1-2b-instruct",
-     "architectures": ["GraniteForCausalLM"],
-     "attention_dropout": 0.1,
-     "attention_multiplier": 0.015625,
-     "bos_token_id": 0,
-     "embedding_multiplier": 12.0,
-     "eos_token_id": 0,
-     "hidden_size": 2048,
-     "intermediate_size": 8192,
-     "logits_scaling": 8.0,
-     "max_position_embeddings": 131072,
-     "model_type": "granite",
-     "num_hidden_layers": 40,
-     "num_key_value_heads": 8,
-     "pad_token_id": 0,
-     "residual_multiplier": 0.22,
-     "rms_norm_eps": 1e-05,
-     "rope_theta": 300000,
-     "tie_word_embeddings": true,
-     "torch_dtype": "float32",
-     "vocab_size": 49156
-   },
-   "tie_word_embeddings": true,
-   "torch_dtype": "float32",
-   "transformers_version": "4.50.0.dev0",
-   "use_image_newline_parameter": true,
-   "vision_config": {
-     "_attn_implementation_autoset": true,
-     "hidden_act": "gelu_pytorch_tanh",
-     "hidden_size": 1152,
-     "image_size": 384,
-     "intermediate_size": 4304,
-     "layer_norm_eps": 1e-06,
-     "model_type": "siglip_vision_model",
-     "num_attention_heads": 16,
-     "num_hidden_layers": 27,
-     "patch_size": 14,
-     "torch_dtype": "float32"
-   },
-   "vision_feature_layer": [-24, -20, -12, -1],
-   "vision_feature_select_strategy": "full"
- }
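For context, a config.json whose auto_map points at custom classes, as in the file removed above, is normally loaded through transformers' remote-code path. A minimal sketch, assuming the non-checkpoint copies of the custom modules remain at the repository root; the repository id below is an illustrative placeholder, not taken from this commit:

from transformers import AutoConfig, AutoModel, AutoProcessor

repo_id = "org/colgranitevision-repo"  # illustrative placeholder, not from this commit

# trust_remote_code=True lets transformers import the classes named in auto_map
# (ColGraniteVisionConfig, ColGraniteVision, ColGraniteVisionProcessor) from the repo.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

print(config.model_type)   # "colgranitevision"
print(config.emb_dim_doc)  # 128: per-token embedding dimension for documents
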
.ipynb_checkpoints/modeling_colgranitevision-checkpoint.py DELETED
@@ -1,192 +0,0 @@
- from typing import ClassVar, Optional
-
- import numpy as np
- import torch
- from torch import nn
- from transformers import LlavaNextPreTrainedModel
- from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
- from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape
-
- from .colgranitevision_config import ColGraniteVisionConfig
-
-
- class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-     def pack_image_features(
-         self,
-         image_features,
-         image_sizes,
-         vision_feature_select_strategy,
-         image_newline=None,
-         base_image_feature_location="last",
-     ):
-         """
-         Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
-
-         Args:
-             image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
-                 List of image feature tensor, each contains all the visual feature of all patches.
-             image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
-                 Actual image size of each images (H, W).
-             vision_feature_select_strategy (`str`)
-                 The feature selection strategy used to select the vision feature from the vision backbone.
-             image_newline (`torch.Tensor` of shape `(embed_dim)`)
-                 New line embedding vector.
-         Returns:
-             image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
-             feature_lens (`List[int]`)
-                 token length of each image in image_features
-         """
-
-         new_image_features = []
-         feature_lens = []
-         for image_idx, image_feature in enumerate(image_features):
-             if image_feature.shape[0] > 1:
-                 base_image_feature = image_feature[0]
-                 image_feature = image_feature[1:]
-                 height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-
-                 num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-                     image_sizes[image_idx],
-                     self.config.image_grid_pinpoints,
-                     self.config.vision_config.image_size,
-                 )
-
-                 if (
-                     np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
-                     and vision_feature_select_strategy == "default"
-                 ):
-                     print(
-                         "Image feature shape does not line up with the provided patch size. "
-                         "You may be using the `default` vision_feature_select_strategy with a"
-                         " visual encoder that does not have CLS."
-                     )
-
-                 image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                 image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                 image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                 image_feature = unpad_image(image_feature, image_sizes[image_idx])
-                 if image_newline is not None:
-                     image_feature = torch.cat(
-                         (
-                             image_feature,
-                             image_newline[:, None, None]
-                             .expand(*image_feature.shape[:-1], 1)
-                             .to(image_feature.device, image_feature.dtype),
-                         ),
-                         dim=-1,
-                     )
-                 image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                 if base_image_feature_location == "last":
-                     image_feature = torch.cat((image_feature, base_image_feature), dim=0)
-                 else:
-                     image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-
-             else:
-                 image_feature = image_feature[0]
-                 if image_newline is not None:
-                     image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
-             new_image_features.append(image_feature)
-             feature_lens.append(image_feature.size(0))
-         image_features = torch.cat(new_image_features, dim=0)
-         feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
-         return image_features, feature_lens
-
-
- class ColGraniteVision(LlavaNextPreTrainedModel):
-     """
-     ColGraniteVision model implementation.
-     """
-
-     main_input_name: ClassVar[str] = "doc_input_ids"  # transformers-related
-     config_class = ColGraniteVisionConfig
-
-     def __init__(self, config: ColGraniteVisionConfig):
-         super().__init__(config=config)
-
-         model = LlavaNextWithCustomPacking(config=config)
-         if model.language_model._tied_weights_keys is not None:
-             self._tied_weights_keys = [f"model.language_model.{k}" for k in model.language_model._tied_weights_keys]
-         self.model = model
-
-         # TODO: Wait for ColPali2 to create a ColPaliConfig to allow specifying the embedding dimension.
-         # We could do it now but it would break all the models trying to load the model from the checkpoint.
-         self.dim = 128
-         self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
-
-         self.post_init()
-
-     def forward(self, *args, **kwargs) -> torch.Tensor:
-         # Delete output_hidden_states from kwargs
-         kwargs.pop("output_hidden_states", None)
-         if "pixel_values" in kwargs:
-             kwargs["pixel_values"] = kwargs["pixel_values"].to(dtype=self.dtype)
-
-         outputs = self.model(*args, output_hidden_states=True, **kwargs)  # (batch_size, sequence_length, hidden_size)
-         last_hidden_states = outputs.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-
-         attention_mask = kwargs["attention_mask"]
-         if "pixel_values" in kwargs:
-             input_ids = kwargs['input_ids']
-             image_mask = (input_ids == self.config.image_token_index)
-             # inputs_embeds = last_hidden_states.masked_scatter(image_mask)
-             N, M = image_mask.shape
-             # Create an index matrix: each row is 0, 1, ..., M-1
-             idx = torch.arange(M, device=image_mask.device).expand(N, M)
-             # Replace False positions with -1 so they are ignored by topk (since all valid indices are >=0)
-             masked_idx = torch.where(image_mask, idx, torch.tensor(-1, device=image_mask.device))
-             topk_values, _ = torch.topk(masked_idx, k=729, dim=1)
-             last_k_indices, _ = torch.sort(topk_values, dim=1)
-             last_k_indices_exp = last_k_indices.unsqueeze(-1).expand(-1, -1, last_hidden_states.size(-1))
-             last_hidden_states = torch.gather(last_hidden_states, 1, last_k_indices_exp)
-             attention_mask = torch.gather(attention_mask, 1, last_k_indices)
-
-         attention_mask = attention_mask.unsqueeze(-1)
-
-         proj = self.custom_text_proj(last_hidden_states)  # (batch_size, sequence_length, dim)
-
-         # L2 normalization
-         proj = proj / (proj.norm(dim=-1, keepdim=True) + 1e-8)
-
-         # proj = proj * kwargs["attention_mask"].unsqueeze(-1)  # (batch_size, sequence_length, dim)
-         proj = proj * attention_mask  # (batch_size, sequence_length, dim)
-
-         return proj
-
-     def get_input_embeddings(self):
-         return self.model.language_model.get_input_embeddings()
-
-     def set_input_embeddings(self, value):
-         self.model.language_model.set_input_embeddings(value)
-
-     def get_output_embeddings(self):
-         return self.model.language_model.get_output_embeddings()
-
-     def set_output_embeddings(self, new_embeddings):
-         self.model.language_model.set_output_embeddings(new_embeddings)
-
-     def set_decoder(self, decoder):
-         self.model.language_model.set_decoder(decoder)
-
-     def get_decoder(self):
-         return self.model.language_model.get_decoder()
-
-     def tie_weights(self):
-         return self.model.language_model.tie_weights()
-
-     def resize_token_embeddings(
-         self,
-         new_num_tokens: Optional[int] = None,
-         pad_to_multiple_of=None,
-     ) -> nn.Embedding:
-         model_embeds = self.model.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
-
-         # Update vocab size
-         self.config.text_config.vocab_size = model_embeds.num_embeddings
-         self.config.vocab_size = model_embeds.num_embeddings
-         self.model.vocab_size = model_embeds.num_embeddings
-
-         return model_embeds
-
-     @property
-     def patch_size(self) -> int:
-         return self.model.vision_tower.config.patch_size
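
The removed forward() above returns one 128-dimensional, L2-normalized vector per kept token; for image inputs it gathers the last 729 image-token positions, which presumably corresponds to the 27 x 27 patch grid of the 384-pixel SigLIP tower. ColPali-style multi-vector outputs like this are typically scored with late interaction (MaxSim). A minimal scoring sketch; the helper name and shapes are illustrative and not part of the removed file:

import torch

def maxsim_scores(query_embs: torch.Tensor, doc_embs: torch.Tensor) -> torch.Tensor:
    """query_embs: (Q, Lq, D); doc_embs: (N, Ld, D); returns (Q, N) scores."""
    # The model output is already L2-normalized and zeroed at padded positions,
    # so a plain dot product is a cosine similarity and padding adds nothing.
    sim = torch.einsum("qld,nmd->qnlm", query_embs, doc_embs)  # (Q, N, Lq, Ld)
    # For each query token, keep its best-matching document token, then sum over query tokens.
    return sim.amax(dim=-1).sum(dim=-1)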