Commit 0820da1
Parent(s): f9712eb
refactor: processor, config, model
Signed-off-by: jupyterjazz <[email protected]>
config.json CHANGED
@@ -1,11 +1,11 @@
 {
   "_name_or_path": "jinaai/jina-embeddings-v4",
   "architectures": [
-    "
+    "JinaEmbeddingsV4Model"
   ],
   "auto_map": {
-    "AutoConfig": "
-    "AutoModel": "
+    "AutoConfig": "configuration_jina_embeddings_v4.JinaEmbeddingsV4Config",
+    "AutoModel": "modeling_jina_embeddings_v4.JinaEmbeddingsV4Model"
   },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
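With the updated `auto_map`, the Transformers auto classes resolve to the renamed custom classes whenever remote code is trusted. A minimal loading sketch (assuming the checkpoint id `jinaai/jina-embeddings-v4` taken from `_name_or_path`; loading kwargs beyond `trust_remote_code` are not part of this commit):

    from transformers import AutoConfig, AutoModel

    # trust_remote_code=True lets transformers import the classes listed in auto_map
    # (JinaEmbeddingsV4Config and JinaEmbeddingsV4Model) from the repository's own files.
    config = AutoConfig.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)
    model = AutoModel.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)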
configuration_colqwen_duo.py → configuration_jina_embeddings_v4.py RENAMED
@@ -2,9 +2,9 @@ from transformers.models.qwen2_5_vl import Qwen2_5_VLConfig
 
 from typing import Optional
 
-class 
+class JinaEmbeddingsV4Config(Qwen2_5_VLConfig):
     """
-    Configuration for the 
+    Configuration for the JinaEmbeddingsV4 model.
     """
 
     def __init__(
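The `__init__` body is truncated in this view. As a hedged sketch, a config subclass like this would typically declare the projector dimensions that the modeling code reads back as `config.single_vector_projector_dim` and `config.multi_vector_projector_dim`; the default values below are placeholders, not taken from the commit:

    from transformers.models.qwen2_5_vl import Qwen2_5_VLConfig

    class JinaEmbeddingsV4Config(Qwen2_5_VLConfig):
        """Configuration for the JinaEmbeddingsV4 model."""

        def __init__(
            self,
            single_vector_projector_dim: int = 128,  # placeholder default, not from the diff
            multi_vector_projector_dim: int = 128,   # placeholder default, not from the diff
            **kwargs,
        ):
            super().__init__(**kwargs)
            self.single_vector_projector_dim = single_vector_projector_dim
            self.multi_vector_projector_dim = multi_vector_projector_dim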
modeling_colqwen_duo.py → modeling_jina_embeddings_v4.py RENAMED
--- modeling_colqwen_duo.py (before)
@@ -2,11 +2,9 @@ import os
 import math
 import numpy as np
 
-from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Union, cast
-from 
-from peft import LoraConfig, PeftModel
 import torch
 from torch import nn
 from torch.utils.data import DataLoader
@@ -17,170 +15,24 @@ from tqdm import tqdm
 from enum import Enum
 from peft.utils.hotswap import hotswap_adapter
 
-from transformers import 
-
-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
 
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor, Qwen2_5_VLForConditionalGeneration
 
-from transformers.processing_utils import (
-    AllKwargsForChatTemplate,
-    ImageInput,
-    PreTokenizedInput,
-    TextInput,
-    VideoInput,
-)
-
 from huggingface_hub import snapshot_download
 
-from .
-
-
-def get_torch_device() -> str:
-    """
-    Returns the device (string) to be used by PyTorch.
-
-    `device` arg defaults to "auto" which will use:
-    - "cuda:0" if available
-    - else "mps" if available
-    - else "cpu".
-    """
-
-    if torch.cuda.is_available():
-        device = "cuda:0"
-    elif torch.backends.mps.is_available():  # for Apple Silicon
-        device = "mps"
-    else:
-        device = "cpu"
-
-    return device
 
 
 class PromptType(str, Enum):
     query = "query"
     passage = "passage"
 
-
-
-
-
-
-
-    @abstractmethod
-    def process_images(
-        self,
-        images: List[Image.Image],
-    ) -> Union[BatchFeature, BatchEncoding]:
-        pass
-
-    @abstractmethod
-    def process_texts(
-        self,
-        texts: List[str],
-        max_length: int = 50,
-        suffix: Optional[str] = None,
-        prefix: Optional[str] = None,
-    ) -> Union[BatchFeature, BatchEncoding]:
-        pass
-
-    @abstractmethod
-    def score(
-        self,
-        qs: List[torch.Tensor],
-        ps: List[torch.Tensor],
-        device: Optional[Union[str, torch.device]] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        pass
-
-    @staticmethod
-    def score_single_vector(
-        qs: List[torch.Tensor],
-        ps: List[torch.Tensor],
-        device: Optional[Union[str, torch.device]] = None,
-    ) -> torch.Tensor:
-        """
-        Compute the dot product score for the given single-vector query and passage embeddings.
-        """
-        device = device or get_torch_device()
-
-        if len(qs) == 0:
-            raise ValueError("No queries provided")
-        if len(ps) == 0:
-            raise ValueError("No passages provided")
-
-        qs_stacked = torch.stack(qs).to(device)
-        ps_stacked = torch.stack(ps).to(device)
-
-        scores = torch.einsum("bd,cd->bc", qs_stacked, ps_stacked)
-        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
-
-        scores = scores.to(torch.float32)
-        return scores
-
-    @staticmethod
-    def score_multi_vector(
-        qs: List[torch.Tensor],
-        ps: List[torch.Tensor],
-        batch_size: int = 128,
-        device: Optional[Union[str, torch.device]] = None,
-    ) -> torch.Tensor:
-        """
-        Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.
-        """
-        device = device or get_torch_device()
-
-        if len(qs) == 0:
-            raise ValueError("No queries provided")
-        if len(ps) == 0:
-            raise ValueError("No passages provided")
-
-        scores_list: List[torch.Tensor] = []
-
-        for i in range(0, len(qs), batch_size):
-            scores_batch = []
-            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
-                device
-            )
-            for j in range(0, len(ps), batch_size):
-                ps_batch = torch.nn.utils.rnn.pad_sequence(
-                    ps[j : j + batch_size], batch_first=True, padding_value=0
-                ).to(device)
-                scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
-            scores_batch = torch.cat(scores_batch, dim=1).cpu()
-            scores_list.append(scores_batch)
-
-        scores = torch.cat(scores_list, dim=0)
-        assert scores.shape[0] == len(qs), f"Expected {len(qs)} scores, got {scores.shape[0]}"
-
-        scores = scores.to(torch.float32)
-        return scores
-
-
-class QwenVLProcessor(ABC):
-
-    def __call__(
-        self,
-        images: Optional[ImageInput] = None,
-        text: Optional[Union[TextInput, PreTokenizedInput, List[PreTokenizedInput]]] = None,
-        videos: Optional[VideoInput] = None,
-        **kwargs,
-    ) -> BatchFeature:
-        return super().__call__(images=images, text=text, videos=videos, **kwargs)  # type: ignore
-
-    def apply_chat_template(
-        self,
-        conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
-        chat_template: Optional[str] = None,
-        **kwargs: Unpack[AllKwargsForChatTemplate],
-    ) -> str:
-        return super().apply_chat_template(conversation=conversation, chat_template=chat_template, **kwargs)  # type: ignore
-
-
-class QwenVLEmbeddingProcessorBase(BaseVisualRetrieverProcessor, QwenVLProcessor):
-
-    assistant_prefix_len: int = 58  # length of prefix created by
-    # super().apply_chat_template(conversation=conversation, chat_template=chat_template, **kwargs)
 
     @staticmethod
     def round_by_factor(number: float, factor: int) -> int:
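The removed score_multi_vector helper above is the ColBERT-style MaxSim scorer: every query token is compared with every passage token, the maximum similarity per query token is taken, and the maxima are summed. A self-contained sketch of that einsum step, with purely illustrative shapes:

    import torch

    qs = torch.randn(3, 7, 128)   # 3 queries, 7 tokens each, dim 128
    ps = torch.randn(5, 11, 128)  # 5 passages, 11 tokens each, dim 128

    # token-to-token similarities -> max over passage tokens -> sum over query tokens
    scores = torch.einsum("bnd,csd->bcns", qs, ps).max(dim=3)[0].sum(dim=2)
    assert scores.shape == (3, 5)  # one MaxSim score per (query, passage) pair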
@@ -236,12 +88,12 @@ class QwenVLEmbeddingProcessorBase(BaseVisualRetrieverProcessor, QwenVLProcessor
     def process_texts(
         self,
         texts: List[str],
-        max_length: int = 
-        suffix: Optional[str] = None,
         prefix: Optional[str] = None,
         padding: Optional[str] = None,
     ) -> BatchFeature:
 
         padded_texts: List[str] = []
 
         for text in texts:
@@ -260,42 +112,8 @@ class QwenVLEmbeddingProcessorBase(BaseVisualRetrieverProcessor, QwenVLProcessor
         return text_batch
 
 
-class ColQwenDuoProcessorBase(QwenVLEmbeddingProcessorBase):
-    """
-    Processor for ColQwenDuo. Mirrors the `ColQwen2Processor` class.
-    """
-
-    def score(
-        self,
-        qs: List[torch.Tensor],
-        ps: List[torch.Tensor],
-        vector_type: str,
-        device: Optional[Union[str, torch.device]] = None,
-        truncate: Optional[int] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Compute the MaxSim score (ColBERT-like) for the given multi-vector query and passage embeddings.
-        """
-        if truncate:
-            qs = [q[..., :truncate] for q in qs]
-            ps = [p[..., :truncate] for p in ps]
-
-        if vector_type == "single_vector":
-            return self.score_single_vector(qs, ps, device=device)
-        elif vector_type == "multi_vector":
-            return self.score_multi_vector(qs, ps, device=device, **kwargs)
-        else:
-            raise ValueError('vector_type must be one of the following: [`single_vector`, `multi_vector`]')
-
-
-class ColQwen25DuoProcessor(ColQwenDuoProcessorBase, Qwen2_5_VLProcessor):
-    def __init__(self, *args, **kwargs) -> None:
-        Qwen2_5_VLProcessor.__init__(self, *args, **kwargs)
-
-
 @dataclass
-class 
     """
     Base class for the Hybrid Model outputs.
     Args:
@@ -308,149 +126,20 @@ class HybridModelOutput:
     single_vec_emb: Optional[torch.Tensor] = None
     multi_vec_emb: Optional[torch.Tensor] = None
 
-class EncodeMixin:
-    """
-    Interface to encode data for MTEB and ViDoRe evaluations.
-    """
-
-    def _process_batches(
-        self,
-        data: List[Union[str, Image.Image]],
-        processor_fn: Callable,
-        desc: str,
-        vector_type: Optional[str] = None,
-        return_numpy: bool = False,
-        **kwargs,
-    ) -> Union[np.ndarray, List[torch.Tensor]]:
-        dataloader = DataLoader(
-            dataset=data,
-            batch_size=kwargs.get("batch_size", 32),
-            shuffle=False,
-            collate_fn=processor_fn,
-        )
-        results = []
-        self.eval()
-        for batch in tqdm(dataloader, desc=desc):
-            with torch.no_grad():
-                batch = {k: v.to(self.device) for k, v in batch.items()}
-                with torch.autocast(device_type=torch.device(self.device).type):
-                    embeddings = self(**batch)
-                    if isinstance(embeddings, HybridModelOutput) and (vector_type == "single_vector"):
-                        embeddings = embeddings.single_vec_emb
-                    elif isinstance(embeddings, HybridModelOutput) and (vector_type == "multi_vector"):
-                        embeddings = embeddings.multi_vec_emb
-                    elif not vector_type and isinstance(embeddings, HybridModelOutput):
-                        embeddings = embeddings.single_vec_emb  # get single-vectors for text2text tasks by default
-                    results.append(embeddings.cpu() if return_numpy else list(torch.unbind(embeddings)))
-        if return_numpy:
-            return np.concatenate([result.numpy() for result in results], axis=0)
-        return [item for sublist in results for item in sublist]
-
-    def encode(
-        self,
-        sentences: List[str],
-        max_length: int = 8192,
-        batch_size: int = 8,
-        prefixes: Optional[List[str]] = None,
-        desc: Optional[str] = None,
-        vector_type: Optional[str] = None,
-        padding: Optional[str] = None,
-        prompt_type: Optional[PromptType] = None,
-        **kwargs,
-    ) -> np.ndarray:
-        prefix = None
-        if isinstance(prefixes, list) and len(prefixes) > 0:
-            if prompt_type:
-                desc = f"MTEB: Encode {prompt_type.value}..."
-                prefix = prefixes[0] if prompt_type.value == "query" else prefixes[1]
-            else:
-                prefix = prefixes[0]
-        processor_fn = partial(self.processor.process_texts, max_length=max_length, prefix=prefix, padding=padding)
-        desc = desc or "MTEB: Encode texts..."
-        return self._process_batches(
-            data=sentences,
-            processor_fn=processor_fn,
-            desc=desc,
-            vector_type=vector_type,
-            batch_size=batch_size,
-            **kwargs,
-        )
-
-    def encode_texts(
-        self,
-        queries: List[str],
-        max_length: int = 8192,
-        batch_size: int = 8,
-        vector_type: Optional[str] = None,
-        desc: Optional[str] = None,
-        **kwargs,
-    ) -> List[torch.Tensor]:
-        processor_fn = partial(self.processor.process_texts, max_length=max_length, prefix="Query")
-        return self._process_batches(
-            data=queries,
-            processor_fn=processor_fn,
-            desc=desc or "Encode queries...",
-            vector_type=vector_type,
-            batch_size=batch_size,
-            **kwargs,
-        )
-
-    def encode_images(
-        self,
-        documents: List[Image.Image],
-        batch_size: int = 8,
-        vector_type: Optional[str] = None,
-        desc: Optional[str] = None,
-        **kwargs,
-    ) -> List[torch.Tensor]:
-        return self._process_batches(
-            data=documents,
-            processor_fn=self.processor.process_images,
-            desc=desc or "Encode documents...",
-            vector_type=vector_type,
-            batch_size=batch_size,
-            **kwargs,
-        )
-
-class QwenVLModel(ABC):
 
-
-
-        input_ids: torch.LongTensor,
-        image_grid_thw: Union[torch.LongTensor, None],
-        attention_mask: torch.Tensor,
-    ) -> tuple[torch.LongTensor, torch.Tensor]:
-        return super().get_rope_index(  # type: ignore
-            input_ids=input_ids,
-            image_grid_thw=image_grid_thw,
-            attention_mask=attention_mask,
-        )
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: torch.Tensor,
-        position_ids: torch.LongTensor,
-        rope_deltas: torch.Tensor,
-        output_hidden_states: bool,
-        use_cache: bool,
-        **kwargs,
-    ) -> Qwen2VLCausalLMOutputWithPast:
-        return super().forward(  # type: ignore
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            rope_deltas=rope_deltas,
-            output_hidden_states=output_hidden_states,
-            use_cache=use_cache,
-            **kwargs,
-        )
-
-
-class QwenVLEmbeddingBase(EncodeMixin, QwenVLModel):
     main_input_name: ClassVar[str] = "doc_input_ids"
 
-    def 
         self,
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor,
@@ -460,19 +149,20 @@ class QwenVLEmbeddingBase(EncodeMixin, QwenVLModel):
         offsets = kwargs["image_grid_thw"][:, 1] * kwargs["image_grid_thw"][:, 2]
         kwargs["pixel_values"] = torch.cat([pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0)
 
-        position_ids, rope_deltas = 
             input_ids=input_ids,
             image_grid_thw=kwargs.get("image_grid_thw", None),
             attention_mask=attention_mask,
         )
 
         outputs = super().forward(
             input_ids,
             attention_mask,
             **kwargs,
             position_ids=position_ids,
             rope_deltas=rope_deltas,
-            output_hidden_states=True,
             use_cache=False,
         )
 
@@ -482,35 +172,6 @@ class QwenVLEmbeddingBase(EncodeMixin, QwenVLModel):
 
         return hidden_states[-1]
 
-
-class AbstractHybridModel(ABC):
-    """
-    Abstract class for a hybrid model (single-vector and multi-vector embeddings).
-    """
-
-    @property
-    def single_vector_projector_dim(self) -> int:
-        return self.config.single_vector_projector_dim
-
-    @property
-    def multi_vector_projector_dim(self) -> int:
-        return self.config.multi_vector_projector_dim
-
-    @abstractmethod
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        attention_mask: torch.Tensor,
-        output_vlm_last_hidden_states: bool = False,
-        *args,
-        **kwargs,
-    ) -> HybridModelOutput:
-        """
-        Forward pass through the model. Returns both single-vector and multi-vector embeddings.
-        Must be implemented by subclasses.
-        """
-        pass
-
     def _init_projection_layers(self, config) -> None:
         """
         Initializes projection layers.
@@ -528,14 +189,6 @@ class AbstractHybridModel(ABC):
             out_features=self.config.multi_vector_projector_dim,
         )
 
-    @staticmethod
-    def _delete_redundant_forward_kwargs(kwargs: Dict[str, Any]) -> None:
-        """
-        Delete redundant kwargs before passing them to the forward method. In-place operation.
-        """
-        for key in ["input_ids", "attention_mask", "output_hidden_states"]:
-            kwargs.pop(key, None)
-
     def project_to_single_vector_embeddings(
         self,
         hidden_states: torch.Tensor,
@@ -545,48 +198,15 @@ class AbstractHybridModel(ABC):
         """
         Project the hidden states to single-vector embeddings.
         """
 
-
-
-        if pooling_method == "mean" and input_ids is None:
-            print("Warning: `input_ids` is None. Using `legacy-mean` pooling strategy instead.")
-            pooling_method = "legacy-mean"
-
-        if pooling_method == "last-token":
-            pooled_output = hidden_states[:, -1, :]
-        elif pooling_method == "mean":
-            if self._input_has_image(input_ids[0]):  # got document image(s)
-                # getting start and end positions of image tokens; torch.where returns
-                # (1) a list of indices of input sequences
-                # (shape corresponds to the total number of images in the batch)
-                # (2) a list of positions of image tokens in the input sequence
-                # (shape corresponds to the total number of images in the batch)
-                input_seq_idx, img_start_pos = torch.where(
-                    input_ids == self.config.vision_start_token_id
-                )  # (total number of images), (total number of images)
-                _, img_end_pos = torch.where(
-                    input_ids == self.config.vision_end_token_id
-                )  # (total number of images), (total number of images)
-                means = []
-                for i in range(input_seq_idx.shape[0]):
-                    vector_pos = input_seq_idx[i]
-                    start = img_start_pos[i]
-                    end = img_end_pos[i]
-                    mean_value = hidden_states[vector_pos][start : end + 1].mean(dim=0)
-                    means.append(mean_value)
-                pooled_output = torch.stack(means)
-
-            else:  # got query text
-                pooled_output = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1) / torch.sum(
-                    attention_mask, dim=1, keepdim=True
-                )
-
-        elif pooling_method == "legacy-mean":
             pooled_output = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1) / torch.sum(
                 attention_mask, dim=1, keepdim=True
             )
-        else:
-            raise ValueError(f"Invalid pooling strategy: {pooling_method}")
         single_vec_emb = self.single_vector_projector(pooled_output)
         return torch.nn.functional.normalize(single_vec_emb, dim=-1)
 
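The masked-mean pooling that survives this hunk sums hidden states over non-padding tokens and divides by their count. A tiny numeric illustration of that formula:

    import torch

    hidden_states = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [9.0, 9.0]]])  # (batch=1, seq=3, dim=2)
    attention_mask = torch.tensor([[1, 1, 0]])                            # third token is padding

    pooled = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1) / torch.sum(
        attention_mask, dim=1, keepdim=True
    )
    # pooled == tensor([[2., 2.]]): the padded token is excluded from the mean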
@@ -605,30 +225,25 @@ class AbstractHybridModel(ABC):
     def _input_has_image(self, input_ids):
         return self.config.vision_start_token_id in input_ids
 
-class ColQwenDuoBase(AbstractHybridModel, QwenVLEmbeddingBase):
-
     def forward(
         self,
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor,
         output_vlm_last_hidden_states: bool = False,
         **kwargs,
-    ) -> 
         """
-        Forward pass through 
         Args:
             input_ids (torch.LongTensor): The input tokens tensor.
             attention_mask (torch.LongTensor): The attention mask tensor.
         Returns:
-
             single_vector (torch.Tensor): Single-vector embeddings of shape (batch_size, dim).
             multi_vector (torch.Tensor): Multi-vector embeddings of shape (batch_size, num_tokens, dim).
         """
-        # Delete redundant kwargs
-        self._delete_redundant_forward_kwargs(kwargs)
-
         # Forward pass through the VLM
-        hidden_states = self.
             input_ids=input_ids, attention_mask=attention_mask, **kwargs
         )  # (batch_size, seq_length, hidden_size)
 
@@ -636,16 +251,85 @@ class ColQwenDuoBase(AbstractHybridModel, QwenVLEmbeddingBase):
         single_vec_emb = self.project_to_single_vector_embeddings(hidden_states, attention_mask, input_ids=input_ids)
         multi_vec_emb = self.project_to_multi_vector_embeddings(hidden_states, attention_mask)
 
-        return 
             vlm_last_hidden_states=hidden_states if output_vlm_last_hidden_states else None,
             single_vec_emb=single_vec_emb,
             multi_vec_emb=multi_vec_emb,
         )
 
 
 class JinaEmbeddingsV4Model:
     """
-    Wrapper class for 
     """
 
     def __init__(self, model, adapter_dir):
@@ -664,7 +348,7 @@ class JinaEmbeddingsV4Model:
 
         task = kwargs.pop('task', 'retrieval')
 
-        model = 
 
         if os.path.isdir(model.name_or_path):
             adapter_dir = os.path.join(model.name_or_path, 'adapters')
@@ -705,13 +389,4 @@ class JinaEmbeddingsV4Model:
         Forward the call to the underlying model's forward method.
         """
         return self.model(*args, **kwargs)
-
-
-class ColQwen25Duo(ColQwenDuoBase, Qwen2_5_VLForConditionalGeneration):
-    config_class = ColQwen25DuoConfig
-    def __init__(self, config: ColQwen25DuoConfig):
-        Qwen2_5_VLForConditionalGeneration.__init__(self, config)
-        self._init_projection_layers(config)
-        self.post_init()
-        self.processor = ColQwen25DuoProcessor.from_pretrained(self.name_or_path, trust_remote_code=True)
+++ modeling_jina_embeddings_v4.py (after)
@@ -2,11 +2,9 @@
 import math
 import numpy as np
 
 from dataclasses import dataclass
 from typing import Any, Callable, ClassVar, Dict, List, Optional, Union, cast
+from peft import PeftModel
 import torch
 from torch import nn
 from torch.utils.data import DataLoader
@@ -17,170 +15,24 @@
 from enum import Enum
 from peft.utils.hotswap import hotswap_adapter
 
+from transformers import BatchFeature
 
 from transformers.models.qwen2_5_vl import Qwen2_5_VLProcessor, Qwen2_5_VLForConditionalGeneration
 
 from huggingface_hub import snapshot_download
 
+from .configuration_jina_embeddings_v4 import JinaEmbeddingsV4Config
 
 
 class PromptType(str, Enum):
     query = "query"
     passage = "passage"
 
+class JinaEmbeddingsV4Processor(Qwen2_5_VLProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        Qwen2_5_VLProcessor.__init__(self, *args, **kwargs)
+        self.assistant_prefix_len = 58
+        self.text_max_length = 8192
 
     @staticmethod
     def round_by_factor(number: float, factor: int) -> int:
@@ -236,12 +88,12 @@
     def process_texts(
         self,
         texts: List[str],
+        max_length: Optional[int] = None,
         prefix: Optional[str] = None,
         padding: Optional[str] = None,
     ) -> BatchFeature:
 
+        max_length = self.text_max_length if max_length is None else min(max_length, self.text_max_length)
         padded_texts: List[str] = []
 
         for text in texts:
@@ -260,42 +112,8 @@
         return text_batch
 
 
 @dataclass
+class JinaEmbeddingsV4ModelOutput:
     """
     Base class for the Hybrid Model outputs.
     Args:
@@ -308,149 +126,20 @@
     single_vec_emb: Optional[torch.Tensor] = None
     multi_vec_emb: Optional[torch.Tensor] = None
 
 
+class QwenVL25Embeddings(Qwen2_5_VLForConditionalGeneration):
+    config_class = JinaEmbeddingsV4Config
     main_input_name: ClassVar[str] = "doc_input_ids"
 
+    def __init__(self, config: JinaEmbeddingsV4Config):
+        Qwen2_5_VLForConditionalGeneration.__init__(self, config)
+        self._init_projection_layers(config)
+        self.post_init()
+        self.processor = JinaEmbeddingsV4Processor.from_pretrained(self.name_or_path, trust_remote_code=True)
+        self.single_vector_projector_dim = config.single_vector_projector_dim
+        self.multi_vector_projector_dim = config.multi_vector_projector_dim
+
+    def get_last_hidden_states(
         self,
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor,
@@ -460,19 +149,20 @@
         offsets = kwargs["image_grid_thw"][:, 1] * kwargs["image_grid_thw"][:, 2]
         kwargs["pixel_values"] = torch.cat([pv[:o] for pv, o in zip(kwargs["pixel_values"], offsets)], dim=0)
 
+        position_ids, rope_deltas = super().get_rope_index(  # type: ignore
             input_ids=input_ids,
             image_grid_thw=kwargs.get("image_grid_thw", None),
             attention_mask=attention_mask,
         )
 
+        kwargs['output_hidden_states'] = True
+
         outputs = super().forward(
             input_ids,
             attention_mask,
             **kwargs,
             position_ids=position_ids,
             rope_deltas=rope_deltas,
             use_cache=False,
         )
@@ -482,35 +172,6 @@
 
         return hidden_states[-1]
 
     def _init_projection_layers(self, config) -> None:
         """
         Initializes projection layers.
@@ -528,14 +189,6 @@
             out_features=self.config.multi_vector_projector_dim,
         )
 
     def project_to_single_vector_embeddings(
         self,
         hidden_states: torch.Tensor,
@@ -545,48 +198,15 @@
         """
         Project the hidden states to single-vector embeddings.
         """
+        if self._input_has_image(input_ids[0]):  # got document image
+            img_start_pos = torch.where(input_ids[0] == self.config.vision_start_token_id)[0][0]
+            img_end_pos = torch.where(input_ids[0] == self.config.vision_end_token_id)[0][0]
+            pooled_output = hidden_states[0][img_start_pos : img_end_pos + 1].mean(dim=0).unsqueeze(0)
 
+        else:  # got query text
             pooled_output = torch.sum(hidden_states * attention_mask.unsqueeze(-1), dim=1) / torch.sum(
                 attention_mask, dim=1, keepdim=True
             )
         single_vec_emb = self.single_vector_projector(pooled_output)
         return torch.nn.functional.normalize(single_vec_emb, dim=-1)
 
@@ -605,30 +225,25 @@
     def _input_has_image(self, input_ids):
         return self.config.vision_start_token_id in input_ids
 
     def forward(
         self,
         input_ids: torch.LongTensor,
         attention_mask: torch.Tensor,
         output_vlm_last_hidden_states: bool = False,
         **kwargs,
+    ) -> JinaEmbeddingsV4ModelOutput:
         """
+        Forward pass through QwenVL25Embeddings. Returns both single-vector and multi-vector embeddings.
         Args:
             input_ids (torch.LongTensor): The input tokens tensor.
             attention_mask (torch.LongTensor): The attention mask tensor.
         Returns:
+            JinaEmbeddingsV4ModelOutput:
             single_vector (torch.Tensor): Single-vector embeddings of shape (batch_size, dim).
             multi_vector (torch.Tensor): Multi-vector embeddings of shape (batch_size, num_tokens, dim).
         """
         # Forward pass through the VLM
+        hidden_states = self.get_last_hidden_states(
             input_ids=input_ids, attention_mask=attention_mask, **kwargs
         )  # (batch_size, seq_length, hidden_size)
 
@@ -636,16 +251,85 @@
         single_vec_emb = self.project_to_single_vector_embeddings(hidden_states, attention_mask, input_ids=input_ids)
         multi_vec_emb = self.project_to_multi_vector_embeddings(hidden_states, attention_mask)
 
+        return JinaEmbeddingsV4ModelOutput(
             vlm_last_hidden_states=hidden_states if output_vlm_last_hidden_states else None,
             single_vec_emb=single_vec_emb,
             multi_vec_emb=multi_vec_emb,
         )
+
+    def _process_batches(
+        self,
+        data: List[Union[str, Image.Image]],
+        processor_fn: Callable,
+        desc: str,
+        vector_type: Optional[str] = None,
+        return_numpy: bool = False,
+        **kwargs,
+    ) -> Union[np.ndarray, List[torch.Tensor]]:
+        dataloader = DataLoader(
+            dataset=data,
+            batch_size=kwargs.get("batch_size", 32),
+            shuffle=False,
+            collate_fn=processor_fn,
+        )
+        vector_type = vector_type or "single_vector"
+        results = []
+        self.eval()
+        for batch in tqdm(dataloader, desc=desc):
+            with torch.no_grad():
+                batch = {k: v.to(self.device) for k, v in batch.items()}
+                with torch.autocast(device_type=torch.device(self.device).type):
+                    embeddings = self(**batch)
+                    if vector_type == "single_vector":
+                        embeddings = embeddings.single_vec_emb
+                    else:
+                        embeddings = embeddings.multi_vec_emb
+                    results.append(embeddings.cpu() if return_numpy else list(torch.unbind(embeddings)))
+        if return_numpy:
+            return np.concatenate([result.numpy() for result in results], axis=0)
+        return [item for sublist in results for item in sublist]
+
+    def encode_texts(
+        self,
+        queries: List[str],
+        max_length: int = 8192,
+        batch_size: int = 8,
+        vector_type: Optional[str] = None,
+        desc: Optional[str] = None,
+        **kwargs,
+    ) -> List[torch.Tensor]:
+        processor_fn = partial(self.processor.process_texts, max_length=max_length, prefix="Query")
+        return self._process_batches(
+            data=queries,
+            processor_fn=processor_fn,
+            desc=desc or "Encode queries...",
+            vector_type=vector_type,
+            batch_size=batch_size,
+            **kwargs,
+        )
+
+    def encode_images(
+        self,
+        documents: List[Image.Image],
+        batch_size: int = 8,
+        vector_type: Optional[str] = None,
+        desc: Optional[str] = None,
+        **kwargs,
+    ) -> List[torch.Tensor]:
+        return self._process_batches(
+            data=documents,
+            processor_fn=self.processor.process_images,
+            desc=desc or "Encode documents...",
+            vector_type=vector_type,
+            batch_size=batch_size,
+            **kwargs,
+        )
+
 
 
 class JinaEmbeddingsV4Model:
     """
+    Wrapper class for QwenVL25Embeddings that handles the loading of models and adapters.
     """
 
     def __init__(self, model, adapter_dir):
@@ -664,7 +348,7 @@
 
         task = kwargs.pop('task', 'retrieval')
 
+        model = QwenVL25Embeddings.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         if os.path.isdir(model.name_or_path):
             adapter_dir = os.path.join(model.name_or_path, 'adapters')
@@ -705,13 +389,4 @@
         Forward the call to the underlying model's forward method.
         """
         return self.model(*args, **kwargs)
 
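Put together, the refactored file keeps `QwenVL25Embeddings` (the VLM with projection heads and the encoding helpers) behind the `JinaEmbeddingsV4Model` wrapper, which loads the base weights and the task adapters. A hedged usage sketch, assuming the wrapper classmethod whose body the `@@ -664` hunk shows and going through the inner model explicitly, since the diff does not show whether the wrapper proxies attribute access:

    from PIL import Image
    from modeling_jina_embeddings_v4 import JinaEmbeddingsV4Model

    wrapper = JinaEmbeddingsV4Model.from_pretrained("jinaai/jina-embeddings-v4", task="retrieval")

    # encode_texts / encode_images are defined on QwenVL25Embeddings, stored as wrapper.model
    query_vecs = wrapper.model.encode_texts(["climate change impact on agriculture"], vector_type="single_vector")
    doc_vecs = wrapper.model.encode_images([Image.open("report_page.png")], vector_type="single_vector")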
preprocessor_config.json CHANGED
@@ -18,7 +18,7 @@
   "merge_size": 2,
   "min_pixels": 3136,
   "patch_size": 14,
-  "processor_class": "
+  "processor_class": "JinaEmbeddingsV4Processor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {
@@ -27,6 +27,6 @@
   },
   "temporal_patch_size": 2,
   "auto_map": {
-    "AutoProcessor": "
+    "AutoProcessor": "modeling_jina_embeddings_v4.JinaEmbeddingsV4Processor"
   }
 }
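With `auto_map.AutoProcessor` pointing at the new processor class, the processor can be fetched the same way as the model. A minimal sketch (again assuming the published repo id; `process_texts` and its `prefix`/`max_length` arguments are the ones shown in the modeling diff above):

    from transformers import AutoProcessor

    # resolves to modeling_jina_embeddings_v4.JinaEmbeddingsV4Processor via auto_map
    processor = AutoProcessor.from_pretrained("jinaai/jina-embeddings-v4", trust_remote_code=True)
    batch = processor.process_texts(["what is late interaction retrieval?"], prefix="Query")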