Morgan Funtowicz committed

Commit: 5e1abf0
Parent(s): 6ce5654

feat(embeddings): do not tokenize twice

Changed files: handler.py (+61 -20)
handler.py CHANGED

@@ -1,13 +1,17 @@
 import platform
-from
+from functools import reduce
+from operator import itemgetter
+from typing import Generator, Tuple

 import torch
 from hfendpoints.openai import Context, run
 from hfendpoints.openai.embeddings import Embedding, EmbeddingEndpoint, EmbeddingRequest, EmbeddingResponse, Usage
-from
+from intel_extension_for_pytorch.cpu.runtime import pin
 from loguru import logger
-from
+from hfendpoints import EndpointConfig, Handler, __version__
 from sentence_transformers import SentenceTransformer
+from torch.nn import Module
+from torch.backends.mkldnn import VERBOSE_ON_CREATION, VERBOSE_OFF

 # Not used for now
 SUPPORTED_AMP_DTYPES = {torch.float32, torch.bfloat16}
@@ -27,17 +31,47 @@ def has_bf16_support() -> bool:
     return torch.cpu._is_avx512_bf16_supported() or torch.cpu._is_avx512_supported()


-def
+def get_cores_pinning_strategy() -> "CPUPool":
+    import intel_extension_for_pytorch as ipex
+
+    # Retrieve the number of nodes
+    num_nodes = ipex.cpu.runtime.runtime_utils.get_num_nodes()
+    cpu_cores_id = [ipex.cpu.runtime.runtime_utils.get_core_list_of_node_id(node_id) for node_id in range(num_nodes)]
+
+    if num_nodes == 1:
+        pinned_cpu_cores_id = cpu_cores_id[0]
+    else:
+        pinned_cpu_cores_id = [core_id for node in cpu_cores_id for core_id in node]
+
+    logger.info(f"Pinning CPU cores to {pinned_cpu_cores_id}")
+    return ipex.cpu.runtime.CPUPool(pinned_cpu_cores_id)
+    # return ipex.cpu.runtime.CPUPool(node_id=0)
+
+
+def get_usage(mask: torch.IntTensor) -> Usage:
     """
     Compute the number of processed tokens and return as Usage object matching OpenAI
-    :param
+    :param mask: Attention mask tensor, as returned by the model
     :return: Usage object matching OpenAI specifications
     """
-    num_tokens =
+    num_tokens = sum(m.sum().item() for m in mask)
     return Usage(prompt_tokens=num_tokens, total_tokens=num_tokens)

+
+class SentenceTransformerWithUsage(Module):
+    __slots__ = ("_model", )
+
+    def __init__(self, model: SentenceTransformer):
+        super().__init__()
+        self._model = model
+
+    def forward(self, sentences: list[str]) -> Tuple[Generator[torch.Tensor], Generator[torch.Tensor]]:
+        vectors = self._model.encode(sentences, output_value=None)
+        return map(itemgetter('attention_mask'), vectors), map(itemgetter('sentence_embedding'), vectors)
+
+
 class SentenceTransformerHandler(Handler):
-    __slots__ = ("_config", "_dtype", "_model", "_model_name", "_use_amp")
+    __slots__ = ("_config", "_dtype", "_model", "_model_name", "_pinned_cores", "_use_amp")

     def __init__(self, config: EndpointConfig):
         self._config = config
@@ -47,44 +81,51 @@ class SentenceTransformerHandler(Handler):
         self._allocate_model()

     def _allocate_model(self):
-        # Denormal number is used to store
+        # Denormal number is used to store tiny numbers that are close to 0.
         # Computations with denormal numbers are remarkably slower than normalized number.
         torch.set_flush_denormal(True)

         dtype = torch.bfloat16 if has_bf16_support() else torch.float32
         model = SentenceTransformer(self._config.model_id, device="cpu", model_kwargs={"torch_dtype": dtype})

+
         if platform.machine() == "x86_64":
             import intel_extension_for_pytorch as ipex
             logger.info(f"x64 platform detected: {platform.processor()}")

+            # Retrieve all the physical cores ID for all the CPU nodes
+            self._pinned_cores = get_cores_pinning_strategy()
+
+            # Optimize the model for inference
             with torch.inference_mode():
                 model = model.eval()
                 model = model.to(memory_format=torch.channels_last)
-
+
+                # Apply IPEx optimizations
+                model = ipex.optimize(model, dtype=dtype, weights_prepack=True, graph_mode=True, concat_linear=True)
                 model = torch.compile(model, dynamic=True, backend="ipex")
+
+            # model = ipex.cpu.runtime.MultiStreamModule(SentenceTransformerWithUsage(model), num_streams=1)
+
         else:
             model = torch.compile(model)

-        self._model = model
         self._dtype = dtype
         self._use_amp = dtype in SUPPORTED_AMP_DTYPES
+        self._model = SentenceTransformerWithUsage(model)

     async def __call__(self, request: EmbeddingRequest, ctx: Context) -> EmbeddingResponse:
         with torch.backends.mkldnn.verbose(VERBOSE_ON_CREATION if self._config.is_debug else VERBOSE_OFF):
             with torch.inference_mode(), torch.amp.autocast("cpu", dtype=self._dtype, enabled=self._use_amp):
-
-
+                with pin(self._pinned_cores):
+                    mask, vectors = self._model(request.input if request.is_batched else [request.input])

                 embeddings = [None] * len(request)
-
-
-
-
-
-                    embeddings[index] = embedding
-
-                usage = get_usage(tokens)
+                for (index, embedding) in enumerate(vectors):
+                    embedding = Embedding(index=index, embedding=embedding.tolist())
+                    embeddings[index] = embedding
+
+                usage = get_usage(mask)
                 return EmbeddingResponse(model=self._model_name, embeddings=embeddings, usage=usage)


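The heart of this commit is that the handler now encodes once with output_value=None and reuses the attention masks that call already returns to compute usage, instead of tokenizing the input a second time. Below is a minimal, standalone sketch of that idea (not part of the commit); the model id is purely illustrative, and the key names follow the same encode(output_value=None) convention the new SentenceTransformerWithUsage wrapper relies on.

# Sketch (illustrative, not from the commit): encode once, reuse the attention mask for usage.
# Assumption: "sentence-transformers/all-MiniLM-L6-v2" is just an example model id.
from operator import itemgetter

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cpu")
sentences = ["hello world", "do not tokenize twice"]

# With output_value=None, encode() returns one dict per sentence that already contains
# both the pooled 'sentence_embedding' and the 'attention_mask' produced during tokenization.
outputs = model.encode(sentences, output_value=None)

masks = list(map(itemgetter("attention_mask"), outputs))
vectors = list(map(itemgetter("sentence_embedding"), outputs))

# Usage is derived from the masks of the same forward pass - no second tokenization.
num_prompt_tokens = sum(int(m.sum().item()) for m in masks)
print(num_prompt_tokens, [tuple(v.shape) for v in vectors])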
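For the pinning path, the new helper builds a CPUPool from the core ids of every node and __call__ wraps the forward pass in pin(...). A short usage sketch of that pattern follows (again not from the commit); it assumes Intel Extension for PyTorch exposes the same runtime helpers the diff calls, and wrapped_model is a hypothetical stand-in for a SentenceTransformerWithUsage instance.

# Sketch (illustrative): run inference on a pinned set of CPU cores, as the handler does.
# Assumptions: intel_extension_for_pytorch provides CPUPool/pin as used in the diff above;
# wrapped_model is a placeholder for SentenceTransformerWithUsage(model).
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.cpu.runtime import pin

# Pool over the cores of node 0; the handler instead flattens the core lists of every node.
pool = ipex.cpu.runtime.CPUPool(node_id=0)

def embed(wrapped_model, sentences):
    # Work issued inside the context is scheduled on the pinned cores only.
    with pin(pool):
        mask, vectors = wrapped_model(sentences)
    return list(mask), list(vectors)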