YannQi committed
Commit 753c5a3 (verified) · Parent: 61e7537

Upload processing_r.py with huggingface_hub

Files changed (1):
  1. processing_r.py +44 -2
processing_r.py CHANGED
@@ -20,7 +20,7 @@ import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_processing_utils import select_best_resolution
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
-from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, MultiModalData
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils import logging
 
@@ -190,6 +190,48 @@ class RProcessor(ProcessorMixin):
         return (unpadded_features, newline_features)
 
 
+    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (list[list[int]], *optional*):
+                The input sizes formatted as (height, width) per each image.
+            video_sizes (list[list[int]], *optional*):
+                The input sizes formatted as (num_frames, height, width) per each video.
+            audio_lengths (list[int], *optional*):
+                The input length formatted per each audio.
+        Returns:
+            dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
+            to a list containing the number of placeholder tokens required. If the model doesn't accept
+            a certain modality or no input sizes are provided, the dict value is set to an empty list.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = RProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+
+            size = images_kwargs.get("size", None) or self.image_processor.size
+            size = (
+                (size["shortest_edge"], size["shortest_edge"])
+                if "shortest_edge" in size
+                else (min(size["height"], size["width"]), min(size["height"], size["width"]))
+            )
+            processed_height, processed_width = size
+
+            batch_num_image_tokens = []
+            num_image_patches = [1] * len(image_sizes)  # llava-ov doesn't batch pixels as Idefics, thus `1` patch
+            for image_size in image_sizes:
+                orig_height, orig_width = image_size
+                num_image_tokens = self._get_number_of_features(
+                    orig_height, orig_width, processed_height, processed_width
+                )
+                if self.vision_feature_select_strategy == "default":
+                    num_image_tokens -= 1
+                batch_num_image_tokens.append(num_image_tokens)
+            vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
     def batch_decode(self, *args, **kwargs):
         """
@@ -214,4 +256,4 @@ class RProcessor(ProcessorMixin):
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
 
 
-__all__ = ["RProcessor"]
+__all__ = ["RProcessor"]