Alex Sadleir committed
Commit d5fb6c3 · 1 Parent(s): 0fdf9c8

Add ONNX and PyTorch pipelines for Gemma3 embedding model


- Implemented `gemma3_mean_pooling_basic.py` for basic mean pooling using the Gemma3 model.
- Created `onnx_gemma3_pipeline.py` to define an ONNX pipeline for the Gemma3 embedding model, including tokenization, encoding, and cosine similarity calculations.
- Developed `pytorch_gemma3_pipeline.py` to provide a PyTorch-based implementation of the Gemma3 embedding model, featuring batch processing and mean pooling.
- Added a cosine similarity demonstration to both the ONNX and PyTorch pipelines for comparing embeddings of sample words; a minimal usage sketch follows below.
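
A minimal usage sketch of the comparison flow (an illustration, not one of the committed files): it assumes the scripts in this commit are importable from the working directory and that the local `embeddinggemma-300m` export with `model.onnx` already exists, so that `onnx_st` (the module-level pipeline object defined in `onnx_gemma3_pipeline.py`) can be built. The committed compare script prints within-backend similarities; this sketch additionally checks the same word across the two backends, which should come out close to 1.0 if the pipelines agree.

import numpy as np
from sentence_transformers import SentenceTransformer
from onnx_gemma3_pipeline import onnx_st  # module-level ONNX pipeline from this commit

words = ["apple", "banana", "car"]

# Reference embeddings from the original PyTorch/safetensors checkpoint
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
pt_embeddings = st_model.encode(words, convert_to_numpy=True)

# Embeddings from the ONNX pipeline
onnx_embeddings = onnx_st.encode(words)

def cosine_similarity(a, b):
    a, b = a.flatten(), b.flatten()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# Cross-backend check: same word, PyTorch vs ONNX embedding
for word, pt_vec, onnx_vec in zip(words, pt_embeddings, onnx_embeddings):
    print(f"{word}: PyTorch vs ONNX cosine similarity = {cosine_similarity(pt_vec, onnx_vec):.4f}")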

README.md CHANGED
@@ -58,4 +58,4 @@ The comparison script prints cosine similarities between sample word embeddings

## References
- [Optimum-ONNX Gemma3 PR](https://github.com/huggingface/optimum-onnx/pull/50)
- [Gemma3 Model](https://huggingface.co/google/embeddinggemma-300m-qat-q4_0-unquantized)

compare_gemma3_onnx_vs_pytorch.py CHANGED
@@ -1,66 +1,66 @@
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

# Words to compare
words = ["apple", "banana", "car"]

# Load original SentenceTransformer (PyTorch, CUDA)
st_model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
st_model = st_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Get PyTorch embeddings
with torch.no_grad():
    pt_embeddings = st_model.encode(words, convert_to_numpy=True, device="cuda" if torch.cuda.is_available() else "cpu")

from onnx_gemma3_pipeline import onnx_st
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

# Basic mean pooling ONNX implementation
def basic_mean_pooling(words):
    tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
    model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")
    embeddings = []
    for word in words:
        inputs = tokenizer(word, return_tensors="pt")
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = np.arange(sequence_length)[None, :]
        position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
        inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
        outputs = model(**inputs)
        last_hidden = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        from sentence_transformers import models
        pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
        features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
        pooled = pooling(features)['sentence_embedding']
        embeddings.append(pooled[0].detach().cpu().numpy())
    return np.stack(embeddings)
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
onnx_embeddings = onnx_st.encode(words)

# Cosine similarity function
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("Safetensor Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(pt_embeddings[0], pt_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(pt_embeddings[0], pt_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(pt_embeddings[1], pt_embeddings[2]):.4f}")

print("\nONNX Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(onnx_embeddings[0], onnx_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(onnx_embeddings[1], onnx_embeddings[2]):.4f}")

# Basic mean pooling ONNX pipeline
basic_embeddings = basic_mean_pooling(words)
print("\nBasic ONNX (mean pooling only) Cosine similarities:")
print(f"apple vs banana: {cosine_similarity(basic_embeddings[0], basic_embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(basic_embeddings[0], basic_embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(basic_embeddings[1], basic_embeddings[2]):.4f}")

download_missing_hf_files.py CHANGED
@@ -1,69 +1,69 @@
from huggingface_hub import snapshot_download
import os
import shutil
from sentence_transformers import SentenceTransformer
import torch

# Model repo and local directory
repo_id = "google/embeddinggemma-300m-qat-q4_0-unquantized"
local_dir = "embeddinggemma-300m"

# Download all files except model.safetensors and those already present
existing_files = set(os.listdir(local_dir))

# Download snapshot to a temp dir
temp_dir = "_hf_temp_download"
os.makedirs(temp_dir, exist_ok=True)
snapshot_download(
    repo_id,
    local_dir=temp_dir,
    ignore_patterns=["model.safetensors"],
    resume_download=True,
    allow_patterns=None
)

# Copy missing files
for fname in os.listdir(temp_dir):
    if fname not in existing_files:
        shutil.move(os.path.join(temp_dir, fname), os.path.join(local_dir, fname))
        print(f"Downloaded: {fname}")
    else:
        print(f"Already exists: {fname}")

# Clean up temp dir
shutil.rmtree(temp_dir)
print("Done.")

# Export Dense layers from SentenceTransformer to ONNX
st_model = SentenceTransformer(repo_id)
dense1 = st_model[2].linear
dense2 = st_model[3].linear

onnx_dir = os.path.join(local_dir, "onnx")
os.makedirs(onnx_dir, exist_ok=True)

# Export Dense1
dummy_input1 = torch.randn(1, dense1.in_features)
dense1 = dense1.to(dummy_input1.device)
torch.onnx.export(
    dense1,
    dummy_input1,
    os.path.join(onnx_dir, "dense1.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense1.onnx")

# Export Dense2
dummy_input2 = torch.randn(1, dense2.in_features)
dense2 = dense2.to(dummy_input2.device)
torch.onnx.export(
    dense2,
    dummy_input2,
    os.path.join(onnx_dir, "dense2.onnx"),
    input_names=["input"],
    output_names=["output"],
    opset_version=14
)
print("Exported dense2.onnx")

embeddinggemma-300m/config.json CHANGED
@@ -1,61 +1,61 @@
{
  "_sliding_window_pattern": 6,
  "architectures": [
    "Gemma3TextModel"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": null,
  "bos_token_id": 2,
  "dtype": "float32",
  "eos_token_id": 1,
  "final_logit_softcapping": null,
  "head_dim": 256,
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_types": [
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 2048,
  "model_type": "gemma3_text",
  "num_attention_heads": 3,
  "num_hidden_layers": 24,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_local_base_freq": 10000.0,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.53.3",
  "use_bidirectional_attention": true,
  "use_cache": true,
  "vocab_size": 262144
}

embeddinggemma-300m/modules.json CHANGED
@@ -1,32 +1,32 @@
[
  {
    "idx": 0,
    "name": "0",
    "path": "",
    "type": "sentence_transformers.models.Transformer"
  },
  {
    "idx": 1,
    "name": "1",
    "path": "1_Pooling",
    "type": "sentence_transformers.models.Pooling"
  },
  {
    "idx": 2,
    "name": "2",
    "path": "2_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 3,
    "name": "3",
    "path": "3_Dense",
    "type": "sentence_transformers.models.Dense"
  },
  {
    "idx": 4,
    "name": "4",
    "path": "4_Normalize",
    "type": "sentence_transformers.models.Normalize"
  }
]

embeddinggemma-300m/special_tokens_map.json CHANGED
@@ -1,33 +1,33 @@
{
  "boi_token": "<start_of_image>",
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eoi_token": "<end_of_image>",
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "image_token": "<image_soft_token>",
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}

embeddinggemma-300m/tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff
 
gemma3_mean_pooling_basic.py CHANGED
@@ -1,24 +1,24 @@
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
from sentence_transformers import models
import numpy as np
import torch

tokenizer = AutoTokenizer.from_pretrained("./embeddinggemma-300m")
model = ORTModelForFeatureExtraction.from_pretrained("./embeddinggemma-300m")

inputs = tokenizer("apple", return_tensors="pt")
print(inputs)
input_ids = inputs['input_ids']
sequence_length = input_ids.shape[1]
position_ids = np.arange(sequence_length)[None, :]
position_ids = np.tile(position_ids, (input_ids.shape[0], 1))
inputs['position_ids'] = torch.tensor(position_ids, dtype=torch.long)
outputs = model(**inputs)
last_hidden = outputs.last_hidden_state
attention_mask = inputs['attention_mask']
# Use SentenceTransformer's Pooling module for mean pooling
pooling = models.Pooling(word_embedding_dimension=last_hidden.shape[-1], pooling_mode_mean_tokens=True)
features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
pooled = pooling(features)['sentence_embedding']
print("Mean pooled:", pooled[0][:5].detach().cpu().numpy())

onnx_gemma3_pipeline.py CHANGED
@@ -1,91 +1,91 @@
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np
import os
import onnxruntime

# ONNX pipeline for Gemma3 embedding model
model_dir = "embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(
    model_dir,
    file_name="model.onnx"
).to(device)

class ONNXTransformer:
    def __init__(self, onnx_model, tokenizer, max_seq_length=2048):
        self.onnx_model = onnx_model
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
    def encode(self, sentences):
        inputs = self.tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=self.max_seq_length)
        input_ids = inputs['input_ids']
        sequence_length = input_ids.shape[1]
        position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
        inputs['position_ids'] = position_ids.to(input_ids.device)
        with torch.no_grad():
            outputs = self.onnx_model(**inputs)
        return outputs.last_hidden_state

modules = []
onnx_transformer = ONNXTransformer(onnx_model, tokenizer, max_seq_length=2048)
modules.append(onnx_transformer)
for idx, name in [(1, "Pooling"), (2, "Dense"), (3, "Dense"), (4, "Normalize")]:
    module_path = os.path.join(model_dir, f"{idx}_{name}")
    if name == "Pooling":
        modules.append(models.Pooling(module_path))
    elif name == "Dense":
        # Use ONNXRuntime for Dense layers
        dense_onnx_path = os.path.join(model_dir, "onnx", f"dense{idx-1}.onnx")
        modules.append(onnxruntime.InferenceSession(dense_onnx_path, providers=["CPUExecutionProvider"]))
    elif name == "Normalize":
        modules.append(models.Normalize())

class ONNXSentenceTransformer:
    def __init__(self, modules):
        self.modules = modules
    def encode(self, sentences):
        features = self.modules[0].encode(sentences)
        for module in self.modules[1:]:
            if isinstance(module, models.Pooling):
                features = module({'token_embeddings': features, 'attention_mask': torch.ones(features.shape[:2], device=features.device)})['sentence_embedding']
            elif isinstance(module, onnxruntime.InferenceSession):
                # ONNX Dense layer expects shape [1, in_features], so process each embedding separately
                if isinstance(features, torch.Tensor):
                    features = features.cpu().detach().numpy()
                outputs = []
                for vec in features:
                    ort_inputs = {module.get_inputs()[0].name: vec.reshape(1, -1)}
                    out = module.run(None, ort_inputs)[0]
                    outputs.append(out.squeeze(0))
                features = np.stack(outputs, axis=0)
            elif isinstance(module, models.Normalize):
                # Normalize still uses PyTorch
                if not isinstance(features, torch.Tensor):
                    features = torch.from_numpy(features)
                features = module({'sentence_embedding': features})['sentence_embedding']
        if isinstance(features, torch.Tensor):
            return features.cpu().detach().numpy()
        return features

onnx_st = ONNXSentenceTransformer(modules)

def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

if __name__ == "__main__":
    words = ["apple", "banana", "car"]
    embeddings = onnx_st.encode(words)
    print(embeddings)
    for idx, embedding in enumerate(embeddings):
        print(f"Embedding {idx+1}: {embedding.shape}")

    print("\nCosine similarities:")
    print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
    print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
    print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")

pytorch_gemma3_pipeline.py CHANGED
@@ -1,58 +1,58 @@
from sentence_transformers import models
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction
import numpy as np
# Load tokenizer and ONNX model
model_path = "./embeddinggemma-300m"
tokenizer = AutoTokenizer.from_pretrained("google/embeddinggemma-300m-qat-q4_0-unquantized")
device = "cuda" if torch.cuda.is_available() else "cpu"
onnx_model = ORTModelForFeatureExtraction.from_pretrained(model_path).to(device)

class ONNXSentenceTransformer:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.word_embedding_dimension = 768
        self.pooling = models.Pooling(word_embedding_dimension=self.word_embedding_dimension, pooling_mode_mean_tokens=True)

    def encode(self, sentences, batch_size=32):
        if isinstance(sentences, str):
            sentences = [sentences]
        embeddings = []
        for i in range(0, len(sentences), batch_size):
            batch = sentences[i:i+batch_size]
            inputs = self.tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            input_ids = inputs['input_ids']
            sequence_length = input_ids.shape[1]
            position_ids = torch.arange(sequence_length)[None, :].expand(input_ids.shape[0], sequence_length)
            inputs['position_ids'] = position_ids
            with torch.no_grad():
                outputs = self.model(**inputs)
            last_hidden = outputs.last_hidden_state
            attention_mask = inputs['attention_mask'].to(last_hidden.device)
            features = {'token_embeddings': last_hidden, 'attention_mask': attention_mask}
            pooled = self.pooling(features)['sentence_embedding']
            embeddings.append(pooled)
        return torch.cat(embeddings, dim=0).cpu().detach().numpy()


# Usage example
onnx_st = ONNXSentenceTransformer(onnx_model, tokenizer)

words = ["apple", "banana", "car"]
embeddings = onnx_st.encode(words)
print(embeddings)
for idx, embedding in enumerate(embeddings):
    print(f"Embedding {idx+1}: {embedding.shape}")

# Cosine similarity demonstration
def cosine_similarity(a, b):
    a = a.flatten()
    b = b.flatten()
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print("\nCosine similarities:")
print(f"apple vs banana: {cosine_similarity(embeddings[0], embeddings[1]):.4f}")
print(f"apple vs car: {cosine_similarity(embeddings[0], embeddings[2]):.4f}")
print(f"banana vs car: {cosine_similarity(embeddings[1], embeddings[2]):.4f}")