YannQi committed on
Commit 1eb6491 · verified · 1 Parent(s): d68afe0

Upload folder using huggingface_hub

Files changed (1)
  1. processing_r.py +244 -0
processing_r.py ADDED
@@ -0,0 +1,244 @@
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import math
from collections.abc import Iterable
from typing import Union

import numpy as np

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_processing_utils import select_best_resolution
from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import logging


logger = logging.get_logger(__name__)


class RProcessorKwargs(ProcessingKwargs, total=False):
    # see processing_utils.ProcessingKwargs documentation for usage.
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "image_kwargs": {},
        "videos_kwargs": {},
    }


class RProcessor(ProcessorMixin):
    attributes = ["image_processor", "tokenizer", "video_processor"]
    valid_kwargs = [
        "chat_template",
        "num_image_tokens",
        "image_processor_type",
        "vision_feature_select_strategy",
        "image_token",
        "video_token",
        "vision_aspect_ratio",
    ]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"
    video_processor_class = "AutoVideoProcessor"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        video_processor=None,
        num_image_tokens=None,
        vision_feature_select_strategy=None,
        chat_template=None,
        image_token="<image>",
        video_token="<video>",
        vision_aspect_ratio="anyres",
        **kwargs,
    ):
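        # `num_image_tokens` is the number of patch tokens the vision backbone produces for one
        # base-resolution tile (for example, 729 for a 27x27 patch grid); the exact value is
        # checkpoint-specific and must match the paired vision config.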
        self.num_image_tokens = num_image_tokens
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )
        self.vision_aspect_ratio = vision_aspect_ratio
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[RProcessorKwargs],
    ) -> BatchFeature:
        output_kwargs = self._merge_kwargs(
            RProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        image_inputs = video_inputs = {}

        if images is not None:
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])

            batch_num_images = iter(image_inputs["batch_num_images"])
            image_sizes = iter(image_inputs["image_sizes"])
            height, width = get_image_size(
                to_numpy_array(image_inputs["pixel_values"][0][0]),
                channel_dim=output_kwargs["images_kwargs"].get("data_format"),
            )
            text, num_image_tokens = self._expand_image_tokens(
                text, image_sizes, height, width, self.image_token, batch_num_images
            )

        if videos is not None:
            video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])

            one_video = video_inputs.get("pixel_values_videos")[0]
            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
                one_video = np.array(one_video)
            else:
                one_video = to_numpy_array(one_video)
            height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
            num_frames = one_video.shape[0]  # frame dim is always after batch dim
            patches_height_width = int(math.sqrt(self.num_image_tokens))
            pooled_height_width = math.ceil(patches_height_width / 2)
            num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
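            # Illustrative arithmetic (assumes num_image_tokens=729, i.e. a 27x27 patch grid):
            # patches_height_width = 27, pooled_height_width = ceil(27 / 2) = 14, so each frame
            # contributes 14 * 14 = 196 tokens after pooling; an 8-frame clip therefore expands
            # <video> into 8 * 196 + 1 = 1569 placeholder tokens.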
            text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])

        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)

    def _expand_image_tokens(
        self,
        text: list[TextInput],
        image_sizes: Iterable[Union[list[int], int]],
        height: int,
        width: int,
        special_token: str,
        batch_num_images: Iterable[int],
    ):
        prompt_strings = []
        max_num_vision_tokens = 0
        for sample in text:
            if special_token in sample:
                is_multi_image = next(batch_num_images) != 1
            else:
                is_multi_image = False
            while special_token in sample:
                if is_multi_image:
                    num_image_tokens = self.num_image_tokens + 1  # one for image_newline
                else:
                    original_size = next(image_sizes)
                    if not isinstance(original_size, (list, tuple)):
                        # cast to list to avoid numerical precision errors when calculating unpadding
                        original_size = original_size.tolist()
                    orig_height, orig_width = original_size
                    num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
                if self.vision_feature_select_strategy == "default":
                    num_image_tokens -= 1
                sample = sample.replace(special_token, "<placeholder>" * num_image_tokens, 1)
            prompt_strings.append(sample)
        text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
        return text, max_num_vision_tokens

    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
        image_grid_pinpoints = self.image_processor.image_grid_pinpoints

        height_best_resolution, width_best_resolution = select_best_resolution(
            [orig_height, orig_width], image_grid_pinpoints
        )
        scale_height, scale_width = height_best_resolution // height, width_best_resolution // width

        patches_height = patches_width = int(math.sqrt(self.num_image_tokens))
        unpadded_features, newline_features = self._get_unpadded_features(
            orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
        )

        # The base patch covers the entire image (no CLS for SigLIP)
        base_features = self.num_image_tokens
        num_image_tokens = unpadded_features + newline_features + base_features
        return num_image_tokens

    # Adapted from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_unpadded_features
    def _get_unpadded_features(self, height, width, patches_height, patches_width, scale_height, scale_width):
        current_height = patches_height * scale_height
        current_width = patches_width * scale_width

        original_aspect_ratio = width / height
        current_aspect_ratio = current_width / current_height
        if original_aspect_ratio > current_aspect_ratio:
            new_height = int(round(height * (current_width / width), 7))
            padding = (current_height - new_height) // 2
            current_height -= padding * 2
        else:
            new_width = int(round(width * (current_height / height), 7))
            padding = (current_width - new_width) // 2
            current_width -= padding * 2

        unpadded_features = current_height * current_width
        newline_features = current_height

        return (unpadded_features, newline_features)
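
    # Worked example of the anyres token count (illustrative; assumes num_image_tokens=729,
    # a 384x384 base tile, and that select_best_resolution picks a 768x1152 grid for a
    # 600x900 image): scale_height=2 and scale_width=3, so the unpadded grid is 54x81 = 4374
    # features plus 54 newline features; adding the 729 base-tile features gives 5157 tokens.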

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


__all__ = ["RProcessor"]
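
For reference, a minimal usage sketch (not part of this commit; the checkpoint path, image file, and prompt are placeholders, and the exact output keys depend on the paired image processor):

from PIL import Image
from transformers import AutoProcessor

# trust_remote_code lets AutoProcessor resolve this custom RProcessor class.
processor = AutoProcessor.from_pretrained("path/to/checkpoint", trust_remote_code=True)

image = Image.open("example.jpg")
prompt = "<image>\nDescribe this image."

# <image> is expanded to the per-image token count computed by _expand_image_tokens;
# the result bundles input_ids/attention_mask with pixel_values, image_sizes, etc.
inputs = processor(images=image, text=prompt, return_tensors="pt")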