Alex Sadleir committed
Commit d5fb6c3 · 1 Parent(s): 0fdf9c8

Add ONNX and PyTorch pipelines for Gemma3 embedding model


- Implemented `gemma3_mean_pooling_basic.py` for basic mean pooling using the Gemma3 model.
- Created `onnx_gemma3_pipeline.py` to define an ONNX pipeline for the Gemma3 embedding model, including tokenization, encoding, and cosine similarity calculations.
- Developed `pytorch_gemma3_pipeline.py` to provide a PyTorch-based implementation of the Gemma3 embedding model, featuring batch processing and mean pooling.
- Added a cosine similarity demonstration to both the ONNX and PyTorch pipelines for comparing embeddings of sample words; a minimal usage sketch follows below.
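
A minimal usage sketch of the comparison flow (an illustration, not one of the committed files): it assumes the scripts in this commit are importable from the working directory and that the local `embeddinggemma-300m` export with `model.onnx` already exists, so that `onnx_st` (the module-level pipeline object defined in `onnx_gemma3_pipeline.py`) can be built. The committed compare script prints within-backend similarities; this sketch additionally checks the same word across the two backends, which should come out close to 1.0 if the pipelines agree.

import numpy as np
from sentence_transformers import SentenceTransformer
from onnx_gemma3_pipeline import onnx_st  # module-level ONNX pipeline from this commit

words = ["apple", "banana", "car"]

# Reference embeddings from the original PyTorch/safetensors checkpoint
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
pt_embeddings = st_model.encode(words, convert_to_numpy=True)

# Embeddings from the ONNX pipeline
onnx_embeddings = onnx_st.encode(words)

def cosine_similarity(a, b):
    a, b = a.flatten(), b.flatten()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Cross-backend check: same word, PyTorch vs ONNX embedding
for word, pt_vec, onnx_vec in zip(words, pt_embeddings, onnx_embeddings):
    print(f"{word}: PyTorch vs ONNX cosine similarity = {cosine_similarity(pt_vec, onnx_vec):.4f}")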

README.md CHANGED
@@ -58,4 +58,4 @@ The comparison script prints cosine similarities between sample word embeddings

## References
- [Optimum-ONNX Gemma3 PR](https://github.com/huggingface/optimum-onnx/pull/50)
- [Gemma3 Model](https://huggingface.co/google/embeddinggemma-300m-qat-q4_0-unquantized)

compare_gemma3_onnx_vs_pytorch.py CHANGED
@@ -1,66 +1,66 @@
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Words to compare
words = ["apple", "banana", "car"]

# Load original SentenceTransformer (PyTorch, CUDA)
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
st_model = st_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Get PyTorch embeddings
with torch.no_grad():
    pt_embeddings = st_model.encode(words, convert_to_numpy=True, device="cuda" if torch.cuda.is_available() else "cpu")

from onnx_gemma3_pipeline import onnx_st
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

# Basic mean pooling ONNX implementation
def basic_mean_pooling(words):
    tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
    model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")
    embeddings = []
    for word in words:
        inputs = tokenizer(word, return_tensors="pt")
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = np.arange(sequence_length)[None, :]
        position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
        inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        from sentence_transformers import models
        pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
        features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
        pooled = pooling(features)['sentence_embedding']
        embeddings.append(pooled[0].detach().cpu().numpy())
    return np.stack(embeddings)
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
onnx_embeddings = onnx_st.encode(words)

# Cosine similarity function
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("Safetensor Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(pt_embeddings[0], pt_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(pt_embeddings[0], pt_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(pt_embeddings[1], pt_embeddings[2]):.4f}")

print("\nONNX Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(onnx_embeddings[1], onnx_embeddings[2]):.4f}")

# Basic mean pooling ONNX pipeline
basic_embeddings = basic_mean_pooling(words)
print("\nBasic ONNX (mean pooling only) Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(basic_embeddings[0], basic_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(basic_embeddings[0], basic_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(basic_embeddings[1], basic_embeddings[2]):.4f}")

download_missing_hf_files.py CHANGED
@@ -1,69 +1,69 @@
from huggingface_hub import snapshot_download
import os
import shutil
from sentence_transformers import SentenceTransformer
import torch

# Model repo and local directory
repo_id = "google/embeddinggemma-300m-qat-q4_0-unquantized"
local_dir = "embeddinggemma-300m"

# Download all files except model.safetensors and those already present
existing_files = set(os.listdir(local_dir))

# Download snapshot to a temp dir
temp_dir = "_hf_temp_download"
os.makedirs(temp_dir, exist_ok=True)
snapshot_download(
    repo_id,
    local_dir=temp_dir,
    ignore_patterns=["model.safetensors"],
    resume_download=True,
    allow_patterns=None
)

# Copy missing files
for fname in os.listdir(temp_dir):
    if fname not in existing_files:
        shutil.move(os.path.join(temp_dir, fname), os.path.join(local_dir, fname))
        print(f"Downloaded: {fname}")
    else:
        print(f"Already exists: {fname}")

# Clean up temp dir
shutil.rmtree(temp_dir)
print("Done.")

# Export Dense layers from SentenceTransformer to ONNX
st_model = SentenceTransformer(repo_id)
dense1 = st_model[2].linear
dense2 = st_model[3].linear

onnx_dir = os.path.join(local_dir, "onnx")
os.makedirs(onnx_dir, exist_ok=True)

# Export Dense1
dummy_input1 = torch.randn(1, dense1.in_features)
dense1 = dense1.to(dummy_input1.device)
torch.onnx.export(
    dense1,
    dummy_input1,
    os.path.join(onnx_dir, "dense1.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense1.onnx")

# Export Dense2
dummy_input2 = torch.randn(1, dense2.in_features)
dense2 = dense2.to(dummy_input2.device)
torch.onnx.export(
    dense2,
    dummy_input2,
    os.path.join(onnx_dir, "dense2.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense2.onnx")

embeddinggemma-300m/config.json CHANGED
@@ -1,61 +1,61 @@
{
  "_sliding_window_pattern": 6,
  "architectures": [
    "Gemma3TextModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "dtype": "float32",
  "eos_token_id": 1,
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 2048,
  "model_type": "gemma3_text",
  "num_attention_heads": 3,
  "num_hidden_layers": 24,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "use_bidirectional_attention": true,
  "use_cache": true,
  "vocab_size": 262144
}

embeddinggemma-300m/modules.json CHANGED
@@ -1,32 +1,32 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 3,
    "name": "3",
    "path": "3_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 4,
    "name": "4",
    "path": "4_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]

embeddinggemma-300m/special_tokens_map.json CHANGED
@@ -1,33 +1,33 @@
{
  "boi_token": "<start_of_image>",
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eoi_token": "<end_of_image>",
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "image_token": "<image_soft_token>",
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}

embeddinggemma-300m/tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff
 
gemma3_mean_pooling_basic.py CHANGED
@@ -1,24 +1,24 @@
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from sentence_transformers import models
import numpy as np
import torch

tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")

inputs = tokenizer("apple", return_tensors="pt")
print(inputs)
input_ids = inputs['input_ids']
sequence_length = input_ids.shape[1]
position_ids = np.arange(sequence_length)[None, :]
position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
outputs = model(**inputs)
last_hidden = outputs.last_hidden_state
attention_mask = inputs['attention_mask']
# Use SentenceTransformer's Pooling module for mean pooling
pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
pooled = pooling(features)['sentence_embedding']
print("Mean pooled:", pooled[0][:5].detach().cpu().numpy())

onnx_gemma3_pipeline.py CHANGED
@@ -1,91 +1,91 @@
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np
import os
import onnxruntime

# ONNX pipeline for Gemma3 embedding model
model_dir = "embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(
    model_dir,
    file_name="model.onnx"
).to(device)

class ONNXTransformer:
    def __init__(self, onnx_model, tokenizer, max_seq_length=2048):
        self.onnx_model = onnx_model
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
    def encode(self, sentences):
        inputs = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=self.max_seq_length)
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
        inputs['position_ids'] = position_ids.to(input_ids.device)
        with torch.no_grad():
            outputs = self.onnx_model(**inputs)
        return outputs.last_hidden_state

modules = []
onnx_transformer = ONNXTransformer(onnx_model, tokenizer, max_seq_length=2048)
modules.append(onnx_transformer)
for idx, name in [(1, "Pooling"), (2, "Dense"), (3, "Dense"), (4, "Normalize")]:
    module_path = os.path.join(model_dir, f"{idx}_{name}")
    if name == "Pooling":
        modules.append(models.Pooling(module_path))
    elif name == "Dense":
        # Use ONNXRuntime for Dense layers
        dense_onnx_path = os.path.join(model_dir, "onnx", f"dense{idx-1}.onnx")
        modules.append(onnxruntime.InferenceSession(dense_onnx_path, providers=["CPUExecutionProvider"]))
    elif name == "Normalize":
        modules.append(models.Normalize())

class ONNXSentenceTransformer:
    def __init__(self, modules):
        self.modules = modules
    def encode(self, sentences):
        features = self.modules[0].encode(sentences)
        for module in self.modules[1:]:
            if isinstance(module, models.Pooling):
                features = module({'token_embeddings': features, 'attention_mask': torch.ones(features.shape[:2], device=features.device)})['sentence_embedding']
            elif isinstance(module, onnxruntime.InferenceSession):
                # ONNX Dense layer expects shape [1, in_features], so process each embedding separately
                if isinstance(features, torch.Tensor):
                    features = features.cpu().detach().numpy()
                outputs = []
                for vec in features:
                    ort_inputs = {module.get_inputs()[0].name: vec.reshape(1, -1)}
                    out = module.run(None, ort_inputs)[0]
                    outputs.append(out.squeeze(0))
                features = np.stack(outputs, axis=0)
            elif isinstance(module, models.Normalize):
                # Normalize still uses PyTorch
                if not isinstance(features, torch.Tensor):
                    features = torch.from_numpy(features)
                features = module({'sentence_embedding': features})['sentence_embedding']
        if isinstance(features, torch.Tensor):
            return features.cpu().detach().numpy()
        return features

onnx_st = ONNXSentenceTransformer(modules)

def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

if __name__ == "__main__":
    words = ["apple", "banana", "car"]
    embeddings = onnx_st.encode(words)
    print(embeddings)
    for idx, embedding in enumerate(embeddings):
        print(f"Embedding {idx+1}: {embedding.shape}")

    print("\nCosine similarities:")
    print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
    print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
    print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")

pytorch_gemma3_pipeline.py CHANGED
@@ -1,58 +1,58 @@
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np
# Load tokenizer and ONNX model
model_path = "./embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(model_path).to(device)

class ONNXSentenceTransformer:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.word_embedding_dimension = 768
        self.pooling = models.Pooling(word_embedding_dimension=self.word_embedding_dimension, pooling_mode_mean_tokens=True)

    def encode(self, sentences, batch_size=32):
        if isinstance(sentences, str):
            sentences = [sentences]
        embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            input_ids = inputs['input_ids']
            sequence_length = input_ids.shape[1]
            position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
            inputs['position_ids'] = position_ids
            with torch.no_grad():
                outputs = self.model(**inputs)
            last_hidden = outputs.last_hidden_state
            attention_mask = inputs['attention_mask'].to(last_hidden.device)
            features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
            pooled = self.pooling(features)['sentence_embedding']
            embeddings.append(pooled)
        return torch.cat(embeddings, dim=0).cpu().detach().numpy()


# Usage example
onnx_st = ONNXSentenceTransformer(onnx_model, tokenizer)

words = ["apple", "banana", "car"]
embeddings = onnx_st.encode(words)
print(embeddings)
for idx, embedding in enumerate(embeddings):
    print(f"Embedding {idx+1}: {embedding.shape}")

# Cosine similarity demonstration
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("\nCosine similarities:")
print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")