Adirazgold committed (verified) · Commit b2b95f4 · 1 Parent(s): dc33fbc
README.md DELETED
@@ -1,3 +0,0 @@
- ---
- license: apache-2.0
- ---
 
adapter_config.json DELETED
@@ -1,27 +0,0 @@
- {
- "alpha_pattern": {},
- "base_model_name_or_path": "ibm-granite/granite-vision-3.1-2b-preview",
- "bias": "none",
- "fan_in_fan_out": false,
- "inference_mode": true,
- "init_lora_weights": "gaussian",
- "layer_replication": null,
- "layers_pattern": null,
- "layers_to_transform": null,
- "loftq_config": {},
- "lora_alpha": 32,
- "lora_dropout": 0.1,
- "megatron_config": null,
- "megatron_core": "megatron.core",
- "modules_to_save": [
- "custom_text_proj"
- ],
- "peft_type": "LORA",
- "r": 32,
- "rank_pattern": {},
- "revision": null,
- "target_modules": "(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)",
- "task_type": "FEATURE_EXTRACTION",
- "use_dora": false,
- "use_rslora": false
- }
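For context (not part of this commit), the sketch below shows how a LoRA adapter with the configuration above would typically be attached to the base checkpoint via PEFT; the adapter path is a placeholder, not a file in this repository.

# Minimal sketch, assuming the adapter weights that accompanied this config are
# available locally; "path/to/adapter" is a placeholder.
import torch
from peft import PeftModel
from transformers import LlavaNextForConditionalGeneration

base = LlavaNextForConditionalGeneration.from_pretrained(
    "ibm-granite/granite-vision-3.1-2b-preview",
    torch_dtype=torch.bfloat16,
)
# Applies the LoRA weights (r=32, lora_alpha=32, dropout 0.1) to the language-model
# projection layers matched by `target_modules`, and loads the saved
# "custom_text_proj" module listed in `modules_to_save`.
model = PeftModel.from_pretrained(base, "path/to/adapter").eval()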
 
added_tokens.json DELETED
@@ -1,6 +0,0 @@
- {
- "<image>": 49155,
- "<|end_of_role|>": 49153,
- "<|start_of_role|>": 49152,
- "<|tool_call|>": 49154
- }
 
chat_template.json DELETED
@@ -1,3 +0,0 @@
- {
- "chat_template": "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages if message['role'] == 'system'%}{% else %}<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n{% endfor %}{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|system|>\n' + message['content'][0]['text'] + '\n' }}\n {%- elif message['role'] == 'user' %}<|user|>\n {# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + '\n' }}{% endfor %}\n{%- elif message['role'] == 'assistant' %}\n {{- '<|assistant|>\n' + message['content'][0]['text'] + '<|end_of_text|>' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'][0]['text'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'][0]['text'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|assistant|>\n' }}\n {%- endif %}\n{%- endfor %}"
- }
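As a usage note (not part of this commit), the sketch below shows the message structure this chat template consumes; it assumes a transformers version whose processors expose apply_chat_template, and that the repository is loaded from the current directory as in example_simple.py.

# Sketch only: image entries in a user turn are rendered as "<image>\n" before the text.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(".", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe the image."},
        ],
    }
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Roughly: the default system preamble, then "<|user|>\n<image>\nDescribe the image.\n",
# then "<|assistant|>\n" because add_generation_prompt=True.
print(prompt)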
 
custom_llava_next.py DELETED
@@ -1,122 +0,0 @@
- from transformers.models.llava_next.modeling_llava_next import LlavaNextForConditionalGeneration
- import torch
- from transformers.models.llava_next.modeling_llava_next import unpad_image, get_anyres_image_grid_shape, logger
- import numpy as np
-
- class LlavaNextWithCustomPacking(LlavaNextForConditionalGeneration):
-     def pack_image_features(
-         self,
-         image_features,
-         image_sizes,
-         vision_feature_select_strategy,
-         image_newline=None,
-         base_image_feature_location="last",
-     ):
-         """
-         Reshape, unpad and then pack each image_feature into a single image_features tensor containing all visual vectors.
-
-         Args:
-             image_features (`List[torch.Tensor]` of length num_images, each of shape `(num_patches, image_length, embed_dim)`)
-                 List of image feature tensors, each containing all the visual features of all patches.
-             image_sizes (`torch.Tensor` of shape `(num_images, 2)`)
-                 Actual image size of each image (H, W).
-             vision_feature_select_strategy (`str`)
-                 The feature selection strategy used to select the vision feature from the vision backbone.
-             image_newline (`torch.Tensor` of shape `(embed_dim)`)
-                 New line embedding vector.
-         Returns:
-             image_features (`torch.Tensor` of shape `(all_feat_len, embed_dim)`)
-             feature_lens (`List[int]`)
-                 token length of each image in image_features
-         """
-
-
-         new_image_features = []
-         feature_lens = []
-         for image_idx, image_feature in enumerate(image_features):
-             if image_feature.shape[0] > 1:
-                 base_image_feature = image_feature[0]
-                 image_feature = image_feature[1:]
-                 height = width = self.config.vision_config.image_size // self.config.vision_config.patch_size
-
-                 num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-                     image_sizes[image_idx],
-                     self.config.image_grid_pinpoints,
-                     self.config.vision_config.image_size,
-                 )
-
-                 if (
-                     np.prod(image_feature.shape) % (num_patch_height * num_patch_width * height * width) != 0
-                     and vision_feature_select_strategy == "default"
-                 ):
-                     logger.warning_once(
-                         "Image feature shape does not line up with the provided patch size. "
-                         "You may be using the `default` vision_feature_select_strategy with a"
-                         " visual encoder that does not have CLS."
-                     )
-
-                 image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
-                 image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-                 image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-                 image_feature = unpad_image(image_feature, image_sizes[image_idx])
-                 if image_newline is not None:
-                     image_feature = torch.cat(
-                         (
-                             image_feature,
-                             image_newline[:, None, None]
-                             .expand(*image_feature.shape[:-1], 1)
-                             .to(image_feature.device, image_feature.dtype),
-                         ),
-                         dim=-1,
-                     )
-                 image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                 if base_image_feature_location == "last":
-                     image_feature = torch.cat((image_feature, base_image_feature), dim=0)
-                 else:
-                     image_feature = torch.cat((base_image_feature, image_feature), dim=0)
-
-             else:
-                 image_feature = image_feature[0]
-                 if image_newline is not None:
-                     image_feature = torch.cat((image_feature, image_newline[None].to(image_feature)), dim=0)
-             new_image_features.append(image_feature)
-             feature_lens.append(image_feature.size(0))
-         image_features = torch.cat(new_image_features, dim=0)
-         feature_lens = torch.tensor(feature_lens, dtype=torch.long, device=image_features.device)
-         return image_features, feature_lens
-
-
-
- def main():
-     import torch
-     from transformers import AutoConfig
-
-     # Load config and model
-     model_id = "ibm-granite/granite-vision-3.1-2b-preview"
-     config = AutoConfig.from_pretrained(model_id)
-     model = LlavaNextWithCustomPacking.from_pretrained(model_id, config=config)
-
-     # Dummy image features for 2 images (1 base view + 2 patch views each)
-     B = 2  # batch size
-     num_views = 3
-     num_patches = 729
-     embed_dim = model.config.text_config.hidden_size
-     image_features = [
-         torch.randn(num_views, num_patches, embed_dim) for _ in range(B)
-     ]
-     image_sizes = torch.tensor([[384, 384], [384, 384]])  # H, W for each image
-
-     # Call overridden pack_image_features
-     packed_feats, lengths = model.pack_image_features(
-         image_features=image_features,
-         image_sizes=image_sizes,
-         vision_feature_select_strategy="default",
-         image_newline=model.image_newline,
-         base_image_feature_location="last",
-     )
-
-     print("Packed features shape:", packed_feats.shape)
-     print("Feature lengths:", lengths)
-
- if __name__ == "__main__":
-     main()
 
example_simple.py DELETED
@@ -1,72 +0,0 @@
- from PIL import Image
- import requests
- from io import BytesIO
- import torch
- from transformers import AutoModel, AutoProcessor, AutoConfig, AutoModelForVision2Seq
-
- # from granite_cola import ColGraniteVisionConfig, ColGraniteVision, ColGraniteVisionProcessor
-
- # --- 1) Register your custom classes so AutoModel/AutoProcessor work out-of-the-box
- # AutoConfig.register("colgranitevision", ColGraniteVisionConfig)
- # AutoModel.register(ColGraniteVisionConfig, ColGraniteVision)
- # AutoProcessor.register(ColGraniteVisionConfig, ColGraniteVisionProcessor)
-
- # ─────────────────────────────────────────────
- # 2) Load model & processor
- # ─────────────────────────────────────────────
- model_dir = "."
-
- model = AutoModelForVision2Seq.from_pretrained(
-     model_dir,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
- )
-
- # self.model = PeftModel.from_pretrained(self.model, peft_path).eval()
-
- processor = AutoProcessor.from_pretrained(
-     model_dir,
-     trust_remote_code=True,
-     use_fast=True
- )
-
- # Set patch_size explicitly if needed
- if hasattr(processor, 'patch_size') and processor.patch_size is None:
-     processor.patch_size = 14  # Default patch size for vision transformers
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = model.to(device).eval()
-
- # ─────────────────────────────────────────────
- # 3) Download sample image + build a prompt containing <image>
- # ─────────────────────────────────────────────
- image_url = "https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg"
- resp = requests.get(image_url)
- image = Image.open(BytesIO(resp.content)).convert("RGB")
-
- # ─────────────────────────────────────────────
- # 4) Process image and text
- # ─────────────────────────────────────────────
- # Process image
- image_inputs = processor.process_images([image])
- image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
-
- # Process text
- text = "A photo of a tiger"
- text_inputs = processor.process_queries([text])
- text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-
- # ─────────────────────────────────────────────
- # 5) Get embeddings and score
- # ─────────────────────────────────────────────
- with torch.no_grad():
-     # Get image embedding
-     image_embedding = model(**image_inputs)
-
-     # Get text embedding
-     text_embedding = model(**text_inputs)
-
- # Calculate similarity score
- score = torch.matmul(text_embedding, image_embedding.T).item()
-
- print(f"Similarity score between text and image: {score:.4f}")
 
merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json DELETED
@@ -1,136 +0,0 @@
- {
- "crop_size": {
- "height": 384,
- "width": 384
- },
- "do_center_crop": true,
- "do_convert_rgb": null,
- "do_normalize": true,
- "do_pad": true,
- "do_rescale": true,
- "do_resize": true,
- "image_grid_pinpoints": [
- [
- 384,
- 768
- ],
- [
- 384,
- 1152
- ],
- [
- 384,
- 1536
- ],
- [
- 384,
- 1920
- ],
- [
- 384,
- 2304
- ],
- [
- 384,
- 2688
- ],
- [
- 384,
- 3072
- ],
- [
- 384,
- 3456
- ],
- [
- 384,
- 3840
- ],
- [
- 768,
- 384
- ],
- [
- 768,
- 768
- ],
- [
- 768,
- 1152
- ],
- [
- 768,
- 1536
- ],
- [
- 768,
- 1920
- ],
- [
- 1152,
- 384
- ],
- [
- 1152,
- 768
- ],
- [
- 1152,
- 1152
- ],
- [
- 1536,
- 384
- ],
- [
- 1536,
- 768
- ],
- [
- 1920,
- 384
- ],
- [
- 1920,
- 768
- ],
- [
- 2304,
- 384
- ],
- [
- 2688,
- 384
- ],
- [
- 3072,
- 384
- ],
- [
- 3456,
- 384
- ],
- [
- 3840,
- 384
- ]
- ],
- "image_mean": [
- 0.5,
- 0.5,
- 0.5
- ],
- "image_processor_type": "LlavaNextImageProcessor",
- "image_std": [
- 0.5,
- 0.5,
- 0.5
- ],
- "processor_class": "ColGraniteVisionProcessor",
- "resample": 3,
- "rescale_factor": 0.00392156862745098,
- "size": {
- "height": 384,
- "width": 384
- }
- }
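For orientation (not part of this commit), the sketch below re-implements, in simplified form, the LLaVA-NeXT-style "anyres" resolution selection that image_grid_pinpoints drives; it illustrates the idea only and is not the exact routine shipped in transformers.

# Simplified illustration of how a best-fit canvas is chosen from
# image_grid_pinpoints: maximize preserved ("effective") resolution, then
# minimize wasted canvas area. Each 384x384 tile of the chosen canvas then
# becomes one crop fed to the vision tower.
from typing import List, Tuple


def select_best_resolution(original_hw: Tuple[int, int], pinpoints: List[Tuple[int, int]]) -> Tuple[int, int]:
    orig_h, orig_w = original_hw
    best_fit, best_effective, least_waste = None, -1, float("inf")
    for h, w in pinpoints:
        scale = min(w / orig_w, h / orig_h)  # fit the image inside the (h, w) canvas
        down_w, down_h = int(orig_w * scale), int(orig_h * scale)
        effective = min(down_w * down_h, orig_w * orig_h)
        waste = w * h - effective
        if effective > best_effective or (effective == best_effective and waste < least_waste):
            best_fit, best_effective, least_waste = (h, w), effective, waste
    return best_fit


# Example with a tall page image and a few of the pinpoints listed above.
print(select_best_resolution((1100, 800), [(384, 768), (768, 384), (768, 768), (1152, 768)]))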
 
processing_colgranitevision.py DELETED
@@ -1,395 +0,0 @@
- import math
- from typing import ClassVar, List, Optional, Tuple, Union
-
- import torch
- from PIL import Image, ImageOps
- from transformers import BatchFeature, LlavaNextProcessor
-
-
- def round_by_factor(number: float, factor: int) -> int:
-     """Returns the closest integer to 'number' that is divisible by 'factor'."""
-     return round(number / factor) * factor
-
-
- def ceil_by_factor(number: float, factor: int) -> int:
-     """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
-     return math.ceil(number / factor) * factor
-
-
- def floor_by_factor(number: float, factor: int) -> int:
-     """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
-     return math.floor(number / factor) * factor
-
-
- class ColGraniteVisionProcessor(LlavaNextProcessor):
-     """
-     Processor for ColGraniteVision.
-     """
-
-     visual_prompt_prefix: ClassVar[str] = "<|user|>\n<image>\nDescribe the image.\n"
-     system_message: ClassVar[
-         str] = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
-     query_prefix: ClassVar[str] = "Query: "
-     query_start: ClassVar[str] = "<|user|>\n"
-
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
-         self.factor = 14
-         self.min_size = 384
-         self.max_size = 384 * 2
-         self.suffix_len = 10
-
-     @property
-     def query_augmentation_token(self) -> str:
-         """
-         Return the query augmentation token.
-         Query augmentation buffers are used as reasoning buffers during inference.
-         """
-         return self.tokenizer.pad_token
-
-     @staticmethod
-     def smart_resize_helper(
-         width: int,
-         height: int,
-         factor: int,
-         min_size: int,
-         max_size: int
-     ) -> Tuple[int, int]:
-         """
-         Returns the resized image dimensions such that:
-         1. The smaller dimension is set to 'min_size'.
-         2. The larger dimension is scaled proportionally to maintain aspect ratio.
-         3. If the larger dimension exceeds 'max_size', it is clipped to 'max_size',
-            and the smaller dimension is adjusted accordingly to maintain aspect ratio.
-         4. Both dimensions are divisible by 'factor'.
-         """
-
-         # Determine scale factor based on min_size
-         if height < width:
-             scale_factor = min_size / height
-         else:
-             scale_factor = min_size / width
-
-         new_width = round(width * scale_factor)
-         new_height = round(height * scale_factor)
-
-         # If the longer dimension exceeds max_size, adjust accordingly
-         if max(new_width, new_height) > max_size:
-             clip_factor = max_size / max(new_width, new_height)
-             new_width = round(new_width * clip_factor)
-             new_height = round(new_height * clip_factor)
-
-         # Ensure dimensions are divisible by factor
-         # new_width = round_by_factor(new_width, factor)
-         # new_height = round_by_factor(new_height, factor)
-
-         return new_width, new_height
-
-     @staticmethod
-     def pad_image_center(image: Image.Image,
-                          target_width: int,
-                          target_height: int,
-                          fill_color=(0, 0, 0)) -> Image.Image:
-         """
-         Pads the given image to be centered within the target dimensions.
-
-         :param image: PIL Image to be padded.
-         :param target_width: The desired width after padding.
-         :param target_height: The desired height after padding.
-         :param fill_color: Background color (default is black).
-         :return: Padded image with centered content.
-         """
-
-         # Get original image size
-         img_width, img_height = image.size
-
-         # Compute padding values
-         pad_left = (target_width - img_width) // 2
-         pad_top = (target_height - img_height) // 2
-         pad_right = target_width - img_width - pad_left
-         pad_bottom = target_height - img_height - pad_top
-
-         # Apply padding
-         padded_image = ImageOps.expand(image, (pad_left, pad_top, pad_right, pad_bottom), fill_color).convert("RGB")
-
-         return padded_image
-
-     def smart_resize(self, image: Image.Image) -> Image.Image:
-         """
-         Resize and convert the image to the required format.
-         """
-         image_size = image.size
-         resized_width, resized_height = self.smart_resize_helper(
-             width=image_size[0],
-             height=image_size[1],
-             factor=self.factor,
-             min_size=self.min_size,
-             max_size=self.max_size
-         )
-         return image.convert("RGB").resize((resized_width, resized_height))
-
-     def smart_resize_and_pad(self, image: Image.Image) -> Image.Image:
-         """
-         Resize and pad the image to the required format.
-         """
-         return self.resize_and_pad_centered(
-             image=image,
-             factor=self.factor,
-             min_size=self.min_size,
-             max_size=self.max_size,
-             fill_color=0
-         )
-
-     def resize_and_pad_centered(self,
-                                 image: Image.Image,
-                                 factor: int,
-                                 min_size: int,
-                                 max_size: int,
-                                 fill_color=0
-                                 ) -> Image.Image:
-         """
-         Resizes and pads an image such that:
-         - The short side is set to `min_size`.
-         - The long side is scaled proportionally but clipped to `max_size`.
-         - The image is centered within the final padded area.
-
-         :param image: PIL Image
-         :param factor: Factor to make dimensions divisible by
-         :param min_size: Minimum size for the short side
-         :param max_size: Maximum allowed size for the long side
-         :param fill_color: Background padding color (default black)
-         :return: Resized and padded image
-         """
-
-         # Get original size
-         width, height = image.size
-
-         if min_size == -1 or max_size == -1:
-             return image.convert("RGB")
-
-         # Determine scale factor based on the short side (min_size)
-         if width < height:
-             scale_factor = min_size / width
-             target_width = min_size
-             max_scale_factor = min(max_size / height, scale_factor)
-             target_height = round(height * max_scale_factor)
-         else:
-             scale_factor = min_size / height
-             target_height = min_size
-             max_scale_factor = min(max_size / width, scale_factor)
-             target_width = round(width * max_scale_factor)
-
-         # Ensure the longer side does not exceed max_size
-         # if max(target_width, target_height) > max_size:
-         #     clip_factor = max_size / max(target_width, target_height)
-         #     target_width = round(target_width * clip_factor)
-         #     target_height = round(target_height * clip_factor)
-
-         # Ensure dimensions are divisible by factor
-         # target_width = round_by_factor(target_width, factor)
-         # target_height = round_by_factor(target_height, factor)
-
-         # Resize the image
-         resized_image = image.resize((target_width, target_height), Image.LANCZOS)
-
-         # Determine final padded dimensions (aligned to short side)
-         if width < height:
-             final_width, final_height = min_size, max_size
-         else:
-             final_width, final_height = max_size, min_size
-
-         # Compute padding to center the image
-         pad_left = (final_width - target_width) // 2
-         pad_top = (final_height - target_height) // 2
-         pad_right = final_width - target_width - pad_left
-         pad_bottom = final_height - target_height - pad_top
-
-         # Apply centered padding
-         # final_image = ImageOps.expand(resized_image, (pad_left, pad_top, pad_right, pad_bottom), fill_color).convert("RGB")
-         final_image = resized_image.convert("RGB")
-
-         return final_image
-
-     def format_data(self, question, image):
-         return [
-             {
-                 "role": "system",
-                 "content": [{"type": "text", "text": self.system_message}],
-             },
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "image",
-                         "image": image,
-                     },
-                     {
-                         "type": "text",
-                         "text": question,
-                     },
-                 ],
-             }
-         ]
-
-     def format_data_wo_role(self, question, image=None):
-         return [
-             {
-                 "role": "user",
-                 "content": [
-                     {
-                         "type": "image",
-                         "image": image,
-                     },
-                     {
-                         "type": "text",
-                         "text": question,
-                     },
-                 ],
-             }
-         ]
-
-     def process_images(
-         self,
-         images: List[Image.Image],
-     ) -> BatchFeature:
-         """
-         Process images for ColGraniteVision.
-         """
-         # texts_doc = [self.apply_chat_template(self.format_data_wo_role(self.visual_prompt_prefix, img),tokenize=False ) for img in images]
-         texts_doc = [self.visual_prompt_prefix for _ in images]
-         images = [self.smart_resize_and_pad(image) for image in images]
-
-         batch_doc = self(
-             text=texts_doc,
-             images=images,
-             return_tensors="pt",
-             padding="longest",
-         )
-         return batch_doc
-
-     def process_queries(self, queries, max_length=2048, suffix=None):
-         if suffix is None:
-             suffix = self.query_augmentation_token * self.suffix_len
-
-         processed = []
-         for q in queries:
-             q = self.query_start + self.query_prefix + q
-             # truncate before it eats actual query content
-             if len(q) + len(suffix) > max_length:
-                 q = q[: max_length - len(suffix) - 1]
-             q += suffix + "\n"
-             processed.append(q)
-
-         return self(
-             text=processed,
-             images=None,
-             return_tensors="pt",
-             padding="longest",
-             truncation=True,
-             max_length=max_length,
-         )
-
-     def score(
-         self,
-         qs: List[torch.Tensor],
-         ps: List[torch.Tensor],
-         device: Optional[Union[str, torch.device]] = None,
-         **kwargs,
-     ) -> torch.Tensor:
-         """
-         Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.
-         """
-         return self.score_multi_vector(qs, ps, device=device, **kwargs)
-
-     def get_n_patches(
-         self,
-         image_size: Tuple[int, int],
-         patch_size: int,
-     ) -> Tuple[int, int]:
-         n_patches_x = self.image_processor.size["width"] // patch_size
-         n_patches_y = self.image_processor.size["height"] // patch_size
-
-         return n_patches_x, n_patches_y
-
-     def get_image_mask(self, batch_images: BatchFeature) -> torch.Tensor:
-         return batch_images.input_ids == self.image_token_id
-
-     @staticmethod
-     def score_single_vector(
-         qs: List[torch.Tensor],
-         ps: List[torch.Tensor],
-         device: Optional[Union[str, torch.device]] = None,
-     ) -> torch.Tensor:
-         """
-         Compute the dot product score for the given single-vector query and passage embeddings.
-         """
-
-         if len(qs) == 0:
-             raise ValueError("No queries provided")
-         if len(ps) == 0:
-             raise ValueError("No passages provided")
-
-         qs_stacked = torch.stack(qs).to(device)
-         ps_stacked = torch.stack(ps).to(device)
-
-         scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
-         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
-
-         scores = scores.to(torch.float32)
-         return scores
-
-     @staticmethod
-     def score_multi_vector(
-         qs: Union[torch.Tensor, List[torch.Tensor]],
-         ps: Union[torch.Tensor, List[torch.Tensor]],
-         batch_size: int = 128,
-         device: Optional[Union[str, torch.device]] = None,
-     ) -> torch.Tensor:
-         """
-         Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
-         query embeddings (`qs`) and passage embeddings (`ps`). As in ColPali, a passage is the
-         image of a document page.
-
-         Because the embedding tensors are multi-vector and can thus have different shapes, they
-         should be fed as:
-         (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
-         (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
-             obtained by padding the list of tensors.
-
-         Args:
-             qs (`Union[torch.Tensor, List[torch.Tensor]]`): Query embeddings.
-             ps (`Union[torch.Tensor, List[torch.Tensor]]`): Passage embeddings.
-             batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
-             device (`Union[str, torch.device]`, *optional*): Device to use for computation. If not
-                 provided, uses `get_torch_device("auto")`.
-
-         Returns:
-             `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
-             tensor is saved on the "cpu" device.
-         """
-
-         if len(qs) == 0:
-             raise ValueError("No queries provided")
-         if len(ps) == 0:
-             raise ValueError("No passages provided")
-
-         scores_list: List[torch.Tensor] = []
-
-         for i in range(0, len(qs), batch_size):
-             scores_batch = []
-             qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i: i + batch_size], batch_first=True, padding_value=0).to(
-                 device
-             )
-             for j in range(0, len(ps), batch_size):
-                 ps_batch = torch.nn.utils.rnn.pad_sequence(
-                     ps[j: j + batch_size], batch_first=True, padding_value=0
-                 ).to(device)
-                 scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
-             scores_batch = torch.cat(scores_batch, dim=1).cpu()
-             scores_list.append(scores_batch)
-
-         scores = torch.cat(scores_list, dim=0)
-         assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
-
-         scores = scores.to(torch.float32)
-         return scores
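As a usage note (not part of this commit), the sketch below wires the processor above into a late-interaction retrieval loop; it assumes the model is loaded as in example_simple.py and returns one embedding per token, and the page image paths are placeholders.

# Minimal sketch, assuming `model(**batch)` returns per-token embeddings of shape
# (batch, seq_len, dim) as example_simple.py does; file paths are placeholders.
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

model = AutoModelForVision2Seq.from_pretrained(".", trust_remote_code=True).eval()
processor = AutoProcessor.from_pretrained(".", trust_remote_code=True)

pages = [Image.open("page1.png"), Image.open("page2.png")]  # placeholder document pages
queries = ["What was the quarterly revenue?"]

with torch.no_grad():
    doc_embs = list(model(**processor.process_images(pages)))      # one (seq_len, dim) tensor per page
    query_embs = list(model(**processor.process_queries(queries)))

# MaxSim / late-interaction scores, shape (n_queries, n_pages)
scores = processor.score_multi_vector(query_embs, doc_embs)
print(scores)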
 
processor_config.json DELETED
@@ -1,12 +0,0 @@
- {
-
- "processor_class": "ColGraniteVisionProcessor",
- "auto_map": {
- "AutoProcessor": "processing_colgranitevision.ColGraniteVisionProcessor"
- },
- "model_type": "llava_next",
- "base_model": "ibm-granite/granite-vision-3.1-2b-preview"
-
-
-
- }
 
save_as_pretrained.py DELETED
@@ -1,70 +0,0 @@
- # import os
- # import json
- # from transformers import AutoProcessor, PreTrainedModel
- # from colgranitevision_config import ColGraniteVisionConfig
- # from modeling_colgranitevision import ColGraniteVision
- from processing_colgranitevision import ColGraniteVisionProcessor
- # # ─────────────────────────────────────────────
- # # 1) Paths & basic config
- # # ─────────────────────────────────────────────
- BASE_MODEL = "ibm-granite/granite-vision-3.1-2b-preview"
- # ADAPTER_PATH = r"C:\Users\Y9F7WJ756\Documents\workspace\colgranite\granite_cola\adapters"
- OUTPUT_DIR = r"/colgranitevision_new"
- #
- # os.makedirs(OUTPUT_DIR, exist_ok=True)
- #
- # # ─────────────────────────────────────────────
- # # 2) Build custom config
- # # ─────────────────────────────────────────────
- # config = ColGraniteVisionConfig(
- #     base_model=BASE_MODEL,
- #     adapter_path=ADAPTER_PATH,
- #     emb_dim_query=128,
- #     emb_dim_doc=128,
- #     torch_dtype="float16",
- # )
- #
- # # Override inherited wrong config
- # config.architectures = ["ColGraniteVision"]
- # config.model_type = "colgranitevision"
- #
- # # Save corrected config
- # with open(os.path.join(OUTPUT_DIR, "../colgranitevision/config.json"), "w") as f:
- #     json.dump(config.to_dict(), f, indent=2)
- #
- # # ─────────────────────────────────────────────
- # # 3) Instantiate model & processor
- # # ─────────────────────────────────────────────
- # model = ColGraniteVision(config)
- processor = ColGraniteVisionProcessor.from_pretrained(BASE_MODEL)
- #
- # # ─────────────────────────────────────────────
- # # 4) Save model, config, and processor
- # # ─────────────────────────────────────────────
- # model.save_pretrained(
- #     OUTPUT_DIR,
- #     safe_serialization=True,
- #     max_shard_size="2GB",
- # )
- processor.save_pretrained(OUTPUT_DIR)
- #
- # print("✅ PEFT-adapted model and processor saved to:", OUTPUT_DIR)
- #
- # # ─────────────────────────────────────────────
- # # 5) Reload test
- # # ─────────────────────────────────────────────
- # try:
- #     from transformers import AutoConfig, AutoModel, AutoProcessor
- #     from granite_cola import granitevision
- #
- #     AutoConfig.register("colgranitevision", ColGraniteVisionConfig)
- #     AutoModel.register(ColGraniteVisionConfig, ColGraniteVision)
- #     AutoProcessor.register(ColGraniteVisionConfig, ColGraniteVisionProcessor)
- #
- #     config = AutoConfig.from_pretrained(OUTPUT_DIR)
- #     model = AutoModel.from_pretrained(OUTPUT_DIR, config=config)
- #     processor = AutoProcessor.from_pretrained(OUTPUT_DIR)
- #
- #     print("🚀 Reload successful:", type(model), type(processor))
- # except Exception as e:
- #     print("❌ Reload test failed:", e)
 
special_tokens_map.json DELETED
@@ -1,35 +0,0 @@
- {
- "additional_special_tokens": [
- "<|start_of_role|>",
- "<|end_of_role|>",
- "<|tool_call|>"
- ],
- "bos_token": {
- "content": "<|end_of_text|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|end_of_text|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|end_of_text|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "unk_token": {
- "content": "<|end_of_text|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- }
 
tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
- {
- "add_bos_token": false,
- "add_prefix_space": false,
- "added_tokens_decoder": {
- "0": {
- "content": "<|end_of_text|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "1": {
- "content": "<fim_prefix>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "2": {
- "content": "<fim_middle>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "3": {
- "content": "<fim_suffix>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "4": {
- "content": "<fim_pad>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "5": {
- "content": "<filename>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "6": {
- "content": "<gh_stars>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "7": {
- "content": "<issue_start>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "8": {
- "content": "<issue_comment>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "9": {
- "content": "<issue_closed>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "10": {
- "content": "<jupyter_start>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "11": {
- "content": "<jupyter_text>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "12": {
- "content": "<jupyter_code>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "13": {
- "content": "<jupyter_output>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "14": {
- "content": "<empty_output>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "15": {
- "content": "<commit_before>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "16": {
- "content": "<commit_msg>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "17": {
- "content": "<commit_after>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "18": {
- "content": "<reponame>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "49152": {
- "content": "<|start_of_role|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "49153": {
- "content": "<|end_of_role|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "49154": {
- "content": "<|tool_call|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "49155": {
- "content": "<image>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- }
- },
- "additional_special_tokens": [
- "<|start_of_role|>",
- "<|end_of_role|>",
- "<|tool_call|>"
- ],
- "bos_token": "<|end_of_text|>",
- "chat_template": "{%- if tools %}\n {{- '<|start_of_role|>available_tools<|end_of_role|>\n' }}\n {%- for tool in tools %}\n {{- tool | tojson(indent=4) }}\n {%- if not loop.last %}\n {{- '\n\n' }}\n {%- endif %}\n {%- endfor %}\n {{- '<|end_of_text|>\n' }}\n{%- endif %}\n{%- for message in messages if message['role'] == 'system'%}{% else %}<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n{% endfor %}{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n {{- '<|system|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'user' %}\n {{- '<|user|>\n' + message['content'] + '\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|assistant|>\n' + message['content'] + '<|end_of_text|>' }}\n {%- elif message['role'] == 'assistant_tool_call' %}\n {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- elif message['role'] == 'tool_response' %}\n {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|assistant|>\n' }}\n {%- endif %}\n{%- endfor %}",
- "clean_up_tokenization_spaces": true,
- "eos_token": "<|end_of_text|>",
- "errors": "replace",
- "extra_special_tokens": {},
- "model_max_length": 16384,
- "pad_token": "<|end_of_text|>",
- "padding_side": "right",
- "processor_class": "ColGraniteVisionProcessor",
- "tokenizer_class": "GPT2Tokenizer",
- "unk_token": "<|end_of_text|>",
- "vocab_size": 49152
- }
 
vocab.json DELETED
The diff for this file is too large to render. See raw diff