Upload processing_r.py with huggingface_hub
processing_r.py CHANGED (+44 -2)
@@ -20,7 +20,7 @@ import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_processing_utils import select_best_resolution
 from transformers.image_utils import ImageInput, get_image_size, to_numpy_array
-from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, MultiModalData
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils import logging
 
@@ -190,6 +190,48 @@ class RProcessor(ProcessorMixin):
         return (unpadded_features, newline_features)
 
 
+    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (list[list[int]], *optional*):
+                The input sizes formatted as (height, width) per each image.
+            video_sizes (list[list[int]], *optional*):
+                The input sizes formatted as (num_frames, height, width) per each video.
+            audio_lengths (list[int], *optional*):
+                The input lengths, one per audio.
+        Returns:
+            MultiModalData: An object mapping each modality ("image", "video", "audio")
+            to a list containing the number of placeholder tokens required. If the model doesn't accept
+            a certain modality or no input sizes are provided, the value is set to an empty list.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = RProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+
+            size = images_kwargs.get("size", None) or self.image_processor.size
+            size = (
+                (size["shortest_edge"], size["shortest_edge"])
+                if "shortest_edge" in size
+                else (min(size["height"], size["width"]), min(size["height"], size["width"]))
+            )
+            processed_height, processed_width = size
+
+            batch_num_image_tokens = []
+            num_image_patches = [1] * len(image_sizes)  # llava-ov doesn't batch pixels like Idefics, thus `1` patch
+            for image_size in image_sizes:
+                orig_height, orig_width = image_size
+                num_image_tokens = self._get_number_of_features(
+                    orig_height, orig_width, processed_height, processed_width
+                )
+                if self.vision_feature_select_strategy == "default":
+                    num_image_tokens -= 1
+                batch_num_image_tokens.append(num_image_tokens)
+            vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
     # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
     def batch_decode(self, *args, **kwargs):
         """
@@ -214,4 +256,4 @@ class RProcessor(ProcessorMixin):
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
 
 
-__all__ = ["RProcessor"]
+__all__ = ["RProcessor"]
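For context, here is a minimal usage sketch of the new hook. The checkpoint id and image sizes are hypothetical placeholders, and it assumes `MultiModalData` exposes its keyword arguments as attributes, as the `transformers.processing_utils` container does:

```python
# Hypothetical usage of the new _get_num_multimodal_tokens hook.
# "org/r-model" is a placeholder checkpoint id, not part of this commit.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("org/r-model")

# (height, width) per image, as described in the docstring.
mm_data = processor._get_num_multimodal_tokens(image_sizes=[[336, 672], [1024, 768]])

print(mm_data.num_image_tokens)   # placeholder token count per image
print(mm_data.num_image_patches)  # [1, 1]: one patch per image, no Idefics-style pixel batching
```

Since the counts are derived from `image_sizes` and the processor configuration alone, callers can budget placeholder tokens ahead of time without preprocessing any pixels.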