Alex Sadleir committed
Commit d5fb6c3 · 1 Parent(s): 0fdf9c8
Add ONNX and PyTorch pipelines for Gemma3 embedding model
- Implemented `gemma3_mean_pooling_basic.py` for basic mean pooling using the Gemma3 model.
- Created `onnx_gemma3_pipeline.py` to define an ONNX pipeline for the Gemma3 embedding model, including tokenization, encoding, and cosine similarity calculations.
- Developed `pytorch_gemma3_pipeline.py` to provide a PyTorch-based implementation of the Gemma3 embedding model, featuring batch processing and mean pooling.
- Added cosine similarity demonstrations to both the ONNX and PyTorch pipelines for comparing embeddings of sample words (a usage sketch follows below).
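A minimal sketch of how the added scripts fit together (an illustration, not a file in this commit; it assumes the scripts are run from the repository root and that the `embeddinggemma-300m/` directory already exists; the two pipeline files can also be run standalone):

# Hypothetical driver illustrating the intended workflow of this commit.
import subprocess

for script in (
    "download_missing_hf_files.py",       # fill in missing repo files and export dense1/dense2 to ONNX
    "compare_gemma3_onnx_vs_pytorch.py",  # print PyTorch vs ONNX cosine similarities for sample words
):
    subprocess.run(["python", script], check=True)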
- README.md +1 -1
- compare_gemma3_onnx_vs_pytorch.py +66 -66
- download_missing_hf_files.py +69 -69
- embeddinggemma-300m/config.json +61 -61
- embeddinggemma-300m/modules.json +31 -31
- embeddinggemma-300m/special_tokens_map.json +33 -33
- embeddinggemma-300m/tokenizer_config.json +0 -0
- gemma3_mean_pooling_basic.py +23 -23
- onnx_gemma3_pipeline.py +91 -91
- pytorch_gemma3_pipeline.py +58 -58
README.md
CHANGED
@@ -58,4 +58,4 @@ The comparison script prints cosine similarities between sample word embeddings

## References
- [Optimum-ONNX Gemma3 PR](https://github.com/huggingface/optimum-onnx/pull/50)
- [Gemma3 Model](https://huggingface.co/google/embeddinggemma-300m-qat-q4_0-unquantized)
compare_gemma3_onnx_vs_pytorch.py
CHANGED
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Words to compare
words = ["apple", "banana", "car"]

# Load original SentenceTransformer (PyTorch, CUDA)
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
st_model = st_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Get PyTorch embeddings
with torch.no_grad():
    pt_embeddings = st_model.encode(words, convert_to_numpy=True, device="cuda" if torch.cuda.is_available() else "cpu")

from onnx_gemma3_pipeline import onnx_st
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

# Basic mean pooling ONNX implementation
def basic_mean_pooling(words):
    tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
    model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")
    embeddings = []
    for word in words:
        inputs = tokenizer(word, return_tensors="pt")
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = np.arange(sequence_length)[None, :]
        position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
        inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        from sentence_transformers import models
        pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
        features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
        pooled = pooling(features)['sentence_embedding']
        embeddings.append(pooled[0].detach().cpu().numpy())
    return np.stack(embeddings)

onnx_embeddings = onnx_st.encode(words)

# Cosine similarity function
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("Safetensor Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(pt_embeddings[0], pt_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(pt_embeddings[0], pt_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(pt_embeddings[1], pt_embeddings[2]):.4f}")

print("\nONNX Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(onnx_embeddings[1], onnx_embeddings[2]):.4f}")

# Basic mean pooling ONNX pipeline
basic_embeddings = basic_mean_pooling(words)
print("\nBasic ONNX (mean pooling only) Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(basic_embeddings[0], basic_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(basic_embeddings[0], basic_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(basic_embeddings[1], basic_embeddings[2]):.4f}")
download_missing_hf_files.py
CHANGED
from huggingface_hub import snapshot_download
import os
import shutil
from sentence_transformers import SentenceTransformer
import torch

# Model repo and local directory
repo_id = "google/embeddinggemma-300m-qat-q4_0-unquantized"
local_dir = "embeddinggemma-300m"

# Download all files except model.safetensors and those already present
existing_files = set(os.listdir(local_dir))

# Download snapshot to a temp dir
temp_dir = "_hf_temp_download"
os.makedirs(temp_dir, exist_ok=True)
snapshot_download(
    repo_id,
    local_dir=temp_dir,
    ignore_patterns=["model.safetensors"],
    resume_download=True,
    allow_patterns=None
)

# Copy missing files
for fname in os.listdir(temp_dir):
    if fname not in existing_files:
        shutil.move(os.path.join(temp_dir, fname), os.path.join(local_dir, fname))
        print(f"Downloaded: {fname}")
    else:
        print(f"Already exists: {fname}")

# Clean up temp dir
shutil.rmtree(temp_dir)
print("Done.")

# Export Dense layers from SentenceTransformer to ONNX
st_model = SentenceTransformer(repo_id)
dense1 = st_model[2].linear
dense2 = st_model[3].linear

onnx_dir = os.path.join(local_dir, "onnx")
os.makedirs(onnx_dir, exist_ok=True)

# Export Dense1
dummy_input1 = torch.randn(1, dense1.in_features)
dense1 = dense1.to(dummy_input1.device)
torch.onnx.export(
    dense1,
    dummy_input1,
    os.path.join(onnx_dir, "dense1.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense1.onnx")

# Export Dense2
dummy_input2 = torch.randn(1, dense2.in_features)
dense2 = dense2.to(dummy_input2.device)
torch.onnx.export(
    dense2,
    dummy_input2,
    os.path.join(onnx_dir, "dense2.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense2.onnx")
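Not part of the commit, but a quick sanity check for the exported Dense layers is to run one of them through onnxruntime and compare against the original PyTorch module (a sketch; the path follows the export above, and the random input and printed difference are only illustrative):

import numpy as np
import onnxruntime
import torch
from sentence_transformers import SentenceTransformer

# Reload the Dense layer that was exported above and compare outputs.
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
dense1 = st_model[2].linear.eval()

sess = onnxruntime.InferenceSession(
    "embeddinggemma-300m/onnx/dense1.onnx", providers=["CPUExecutionProvider"]
)
x = torch.randn(1, dense1.in_features)
with torch.no_grad():
    ref = dense1(x).numpy()
onnx_out = sess.run(None, {sess.get_inputs()[0].name: x.numpy()})[0]
print("max abs diff:", np.abs(onnx_out - ref).max())  # should be tiny for a float32 export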
embeddinggemma-300m/config.json
CHANGED
{
  "_sliding_window_pattern": 6,
  "architectures": [
    "Gemma3TextModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "dtype": "float32",
  "eos_token_id": 1,
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 2048,
  "model_type": "gemma3_text",
  "num_attention_heads": 3,
  "num_hidden_layers": 24,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "use_bidirectional_attention": true,
  "use_cache": true,
  "vocab_size": 262144
}
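For reference, the fields most relevant to the pipelines below can be read back with transformers (a small sketch assuming the local directory used by the scripts and a transformers version that knows the gemma3_text model type):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./embeddinggemma-300m")
print(config.model_type)         # "gemma3_text"
print(config.hidden_size)        # 768, the word_embedding_dimension used for pooling
print(config.num_hidden_layers)  # 24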
embeddinggemma-300m/modules.json
CHANGED
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 3,
    "name": "3",
    "path": "3_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 4,
    "name": "4",
    "path": "4_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]
embeddinggemma-300m/special_tokens_map.json
CHANGED
{
  "boi_token": "<start_of_image>",
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eoi_token": "<end_of_image>",
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "image_token": "<image_soft_token>",
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
embeddinggemma-300m/tokenizer_config.json
CHANGED
The diff for this file is too large to render.
See raw diff
gemma3_mean_pooling_basic.py
CHANGED
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from sentence_transformers import models
import numpy as np
import torch

tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")

inputs = tokenizer("apple", return_tensors="pt")
print(inputs)
input_ids = inputs['input_ids']
sequence_length = input_ids.shape[1]
position_ids = np.arange(sequence_length)[None, :]
position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
outputs = model(**inputs)
last_hidden = outputs.last_hidden_state
attention_mask = inputs['attention_mask']
# Use SentenceTransformer's Pooling module for mean pooling
pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
pooled = pooling(features)['sentence_embedding']
print("Mean pooled:", pooled[0][:5].detach().cpu().numpy())
onnx_gemma3_pipeline.py
CHANGED
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np
import os
import onnxruntime

# ONNX pipeline for Gemma3 embedding model
model_dir = "embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(
    model_dir,
    file_name="model.onnx"
).to(device)

class ONNXTransformer:
    def __init__(self, onnx_model, tokenizer, max_seq_length=2048):
        self.onnx_model = onnx_model
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def encode(self, sentences):
        inputs = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=self.max_seq_length)
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
        inputs['position_ids'] = position_ids.to(input_ids.device)
        with torch.no_grad():
            outputs = self.onnx_model(**inputs)
        return outputs.last_hidden_state

modules = []
onnx_transformer = ONNXTransformer(onnx_model, tokenizer, max_seq_length=2048)
modules.append(onnx_transformer)
for idx, name in [(1, "Pooling"), (2, "Dense"), (3, "Dense"), (4, "Normalize")]:
    module_path = os.path.join(model_dir, f"{idx}_{name}")
    if name == "Pooling":
        modules.append(models.Pooling(module_path))
    elif name == "Dense":
        # Use ONNXRuntime for Dense layers
        dense_onnx_path = os.path.join(model_dir, "onnx", f"dense{idx-1}.onnx")
        modules.append(onnxruntime.InferenceSession(dense_onnx_path, providers=["CPUExecutionProvider"]))
    elif name == "Normalize":
        modules.append(models.Normalize())

class ONNXSentenceTransformer:
    def __init__(self, modules):
        self.modules = modules

    def encode(self, sentences):
        features = self.modules[0].encode(sentences)
        for module in self.modules[1:]:
            if isinstance(module, models.Pooling):
                features = module({'token_embeddings': features, 'attention_mask': torch.ones(features.shape[:2], device=features.device)})['sentence_embedding']
            elif isinstance(module, onnxruntime.InferenceSession):
                # ONNX Dense layer expects shape [1, in_features], so process each embedding separately
                if isinstance(features, torch.Tensor):
                    features = features.cpu().detach().numpy()
                outputs = []
                for vec in features:
                    ort_inputs = {module.get_inputs()[0].name: vec.reshape(1, -1)}
                    out = module.run(None, ort_inputs)[0]
                    outputs.append(out.squeeze(0))
                features = np.stack(outputs, axis=0)
            elif isinstance(module, models.Normalize):
                # Normalize still uses PyTorch
                if not isinstance(features, torch.Tensor):
                    features = torch.from_numpy(features)
                features = module({'sentence_embedding': features})['sentence_embedding']
        if isinstance(features, torch.Tensor):
            return features.cpu().detach().numpy()
        return features

onnx_st = ONNXSentenceTransformer(modules)

def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

if __name__ == "__main__":
    words = ["apple", "banana", "car"]
    embeddings = onnx_st.encode(words)
    print(embeddings)
    for idx, embedding in enumerate(embeddings):
        print(f"Embedding {idx+1}: {embedding.shape}")

    print("\nCosine similarities:")
    print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
    print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
    print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")
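The per-vector loop in ONNXSentenceTransformer.encode exists because dense1.onnx and dense2.onnx were exported with a fixed [1, in_features] input. One possible alternative, sketched below and not part of this commit, is to re-export with a dynamic batch axis so the Dense layers can run on a whole batch at once:

import os
import torch
from sentence_transformers import SentenceTransformer

# Hypothetical re-export of the first Dense layer with a dynamic batch dimension.
dense1 = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")[2].linear
dummy = torch.randn(1, dense1.in_features)
torch.onnx.export(
    dense1,
    dummy,
    os.path.join("embeddinggemma-300m", "onnx", "dense1.onnx"),
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},
    opset_version=14,
)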
pytorch_gemma3_pipeline.py
CHANGED
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np

# Load tokenizer and ONNX model
model_path = "./embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(model_path).to(device)

class ONNXSentenceTransformer:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.word_embedding_dimension = 768
        self.pooling = models.Pooling(word_embedding_dimension=self.word_embedding_dimension, pooling_mode_mean_tokens=True)

    def encode(self, sentences, batch_size=32):
        if isinstance(sentences, str):
            sentences = [sentences]
        embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            input_ids = inputs['input_ids']
            sequence_length = input_ids.shape[1]
            position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
            inputs['position_ids'] = position_ids
            with torch.no_grad():
                outputs = self.model(**inputs)
            last_hidden = outputs.last_hidden_state
            attention_mask = inputs['attention_mask'].to(last_hidden.device)
            features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
            pooled = self.pooling(features)['sentence_embedding']
            embeddings.append(pooled)
        return torch.cat(embeddings, dim=0).cpu().detach().numpy()


# Usage example
onnx_st = ONNXSentenceTransformer(onnx_model, tokenizer)

words = ["apple", "banana", "car"]
embeddings = onnx_st.encode(words)
print(embeddings)
for idx, embedding in enumerate(embeddings):
    print(f"Embedding {idx+1}: {embedding.shape}")

# Cosine similarity demonstration
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("\nCosine similarities:")
print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")
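This pipeline loads the feature-extraction backbone via ORTModelForFeatureExtraction and applies only mean pooling in PyTorch, without the Dense and Normalize stages. For comparison, a full SentenceTransformer reference embedding can be obtained with the public API (a sketch, not part of this file):

from sentence_transformers import SentenceTransformer

st = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
ref_embeddings = st.encode(["apple", "banana", "car"], convert_to_numpy=True)
print(ref_embeddings.shape)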