Commit 0bdcaf2 · verified · alexmarques committed · 1 parent: 02e3d99

Upload folder using huggingface_hub

consolidated.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5694e04fc45f53436051c68f77f08b7d5379b72788f83bcd5883e1868d3dfca3
+ size 6141906040
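
Note: consolidated.safetensors is stored via Git LFS, so the committed file is just this three-line pointer: a spec version, the SHA-256 of the actual blob, and its size (about 6.1 GB). A minimal sketch of checking a downloaded blob against the pointer (the local path is hypothetical):

import hashlib
import os

def lfs_sha256(path, chunk_size=1 << 20):
    # Stream the file so the ~6 GB blob never has to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

path = "consolidated.safetensors"  # hypothetical local download
assert os.path.getsize(path) == 6141906040
assert lfs_sha256(path) == "5694e04fc45f53436051c68f77f08b7d5379b72788f83bcd5883e1868d3dfca3"
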
convert_voxtral_hf_to_mistral.py ADDED
@@ -0,0 +1,215 @@
+ # coding=utf-8
+ # Copyright 2025 HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import argparse
+ import gc
+ import json
+ import os
+ import re
+
+ from huggingface_hub import snapshot_download
+ from safetensors.torch import safe_open, save_file
+ from transformers import VoxtralConfig
+
+ # fmt: off
+ STATE_DICT_MAPPING = {
+     # Language model (decoder)
+     r"^language_model\.lm_head": r"output",
+     r"^language_model\.model\.norm": r"norm",
+     r"^language_model\.model\.embed_tokens": r"tok_embeddings",
+     r"^language_model\.model\.layers\.(\d+)\.input_layernorm": r"layers.\1.attention_norm",
+     r"^language_model\.model\.layers\.(\d+)\.post_attention_layernorm": r"layers.\1.ffn_norm",
+     r"^language_model\.model\.layers\.(\d+)\.self_attn\.(q|k|v|o)_proj": r"layers.\1.attention.w\2",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.gate_proj": r"layers.\1.feed_forward.w1",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.down_proj": r"layers.\1.feed_forward.w2",
+     r"^language_model\.model\.layers\.(\d+)\.mlp\.up_proj": r"layers.\1.feed_forward.w3",
+     # Whisper audio encoder and audio->language projector
+     r"^audio_tower\.conv1": r"mm_whisper_embeddings.whisper_encoder.conv_layers.0",
+     r"^audio_tower\.conv2": r"mm_whisper_embeddings.whisper_encoder.conv_layers.1",
+     r"^audio_tower\.layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.norm",
+     r"^audio_tower\.layers\.(\d+)\.self_attn\.(q|k|v)_proj": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention.w\2",
+     r"^audio_tower\.layers\.(\d+)\.self_attn\.out_proj": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention.wo",
+     r"^audio_tower\.layers\.(\d+)\.self_attn_layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.attention_norm",
+     r"^audio_tower\.layers\.(\d+)\.fc(\d+)": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.feed_forward.w\2",
+     r"^audio_tower\.layers\.(\d+)\.final_layer_norm": r"mm_whisper_embeddings.whisper_encoder.transformer.layers.\1.ffn_norm",
+     r"^multi_modal_projector\.linear_1": r"mm_whisper_embeddings.audio_language_projection.0",
+     r"^multi_modal_projector\.linear_2": r"mm_whisper_embeddings.audio_language_projection.2",
+ }
+ # fmt: on
+
+ SKIP_KEYS = ["audio_tower.embed_positions.weight"]
+
+
+ def add_quantization_config(config, hf_config: VoxtralConfig):
+     """Copy the HF quantization config into the Mistral config, remapping the ignored keys."""
+     quantization_config = hf_config.quantization_config
+     mistral_ignore = []  # keys to leave unquantized, in Mistral naming
+     for hf_key in quantization_config["ignore"]:
+         mistral_key = map_hf_key_to_mistral(hf_key)
+         mistral_ignore.append(mistral_key)
+     quantization_config["ignore"] = mistral_ignore
+     config["quantization"] = quantization_config
+
+     return config
+
+
+ def map_hf_key_to_mistral(hf_key):
+     """Map a key from HF format to Mistral format."""
+     for pattern, replacement in STATE_DICT_MAPPING.items():
+         new_key, n_replace = re.subn(pattern, replacement, hf_key)
+         if n_replace > 0:
+             return new_key.replace("weight_scale", "qscale_weight")
+
+     # If no mapping is found, return the original key
+     return hf_key.replace("weight_scale", "qscale_weight")
+
+
+ def permute_for_mistral_rope(tensor, n_heads, dim1, dim2):
+     """Reverse the HF RoPE permutation to get back to Mistral format."""
+     tensor = tensor.view(n_heads, 2, dim1 // n_heads // 2, dim2)
+     tensor = tensor.transpose(1, 2)
+     tensor = tensor.reshape(dim1, dim2)
+     return tensor
+
+
+ def convert_state_dict(hf_state_dict, config):
+     """Convert an HF Voxtral state dict to Mistral format."""
+     mistral_dict = {}
+
+     num_attention_heads = config["n_heads"]
+     hidden_size = config["dim"]
+     head_dim = config["head_dim"]
+     num_key_value_heads = config["n_kv_heads"]
+     key_value_dim = head_dim * num_key_value_heads
+     query_dim = head_dim * num_attention_heads
+
+     for hf_key, tensor in hf_state_dict.items():
+         if hf_key in SKIP_KEYS:
+             continue
+
+         mistral_key = map_hf_key_to_mistral(hf_key)
+
+         if "language_model" in hf_key:
+             # The q/k projections (and their per-channel quantization scales)
+             # carry the HF RoPE permutation, which has to be undone.
+             if hf_key.endswith("q_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, num_attention_heads, query_dim, hidden_size)
+             elif hf_key.endswith("q_proj.weight_scale") and tensor.size(0) == query_dim:
+                 tensor = permute_for_mistral_rope(tensor, num_attention_heads, query_dim, 1)
+             elif hf_key.endswith("k_proj.weight"):
+                 tensor = permute_for_mistral_rope(tensor, num_key_value_heads, key_value_dim, hidden_size)
+             elif hf_key.endswith("k_proj.weight_scale") and tensor.size(0) == key_value_dim:
+                 tensor = permute_for_mistral_rope(tensor, num_key_value_heads, key_value_dim, 1)
+
+         mistral_dict[mistral_key] = tensor
+
+     return mistral_dict
+
+
+ def write_model(
+     input_path_or_repo,
+     output_dir,
+     unquantized_model_path=None,
+ ):
+     print("Converting HF Voxtral model to Mistral format.")
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Load the HF Voxtral config
+     print(f"Loading HF Voxtral model from {input_path_or_repo}...")
+     hf_config = VoxtralConfig.from_pretrained(input_path_or_repo)
+
+     local_path = snapshot_download(input_path_or_repo)
+
+     # Convert the config (check existence before opening)
+     config_path = os.path.join(local_path, "params.json")
+     if not os.path.exists(config_path):
+         raise ValueError(f"params.json not found in {local_path}")
+
+     with open(config_path, "r") as f:
+         config = json.load(f)
+
+     if unquantized_model_path is not None:
+         config = add_quantization_config(config, hf_config)
+
+     with open(os.path.join(output_dir, "params.json"), "w") as f:
+         json.dump(config, f, indent=2)
+
+     # Convert the state dict
+     print("Converting state dict...")
+     tensor_files = sorted([f for f in os.listdir(local_path) if f.endswith(".safetensors")])
+
+     hf_state_dict = {}
+     for file in tensor_files:
+         file_path = os.path.join(local_path, file)
+         with safe_open(file_path, framework="pt", device="cuda") as f:
+             for key in f.keys():
+                 hf_state_dict[key] = f.get_tensor(key)
+
+     mistral_state_dict = convert_state_dict(hf_state_dict, config)
+
+     # Save the state dict
+     save_file(mistral_state_dict, os.path.join(output_dir, "consolidated.safetensors"))
+
+     del hf_state_dict, mistral_state_dict
+     gc.collect()
+     print("Model converted successfully.")
+
+
+ def write_tokenizer(input_path_or_repo: str, output_dir: str):
+     """Extract and save the tokenizer from the Voxtral model."""
+     from transformers import MistralCommonTokenizer
+
+     print("Extracting tokenizer...")
+     tokenizer = MistralCommonTokenizer.from_pretrained(input_path_or_repo)
+     tokenizer.save_pretrained(output_dir)
+     print("Tokenizer saved successfully.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Convert HF Voxtral weights to Mistral format")
+     parser.add_argument(
+         "--input_path_or_repo",
+         type=str,
+         default="RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic",
+         help="Path or repo containing the HF Voxtral model",
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="Voxtral-Mini-3B-2507-FP8-dynamic-converted",
+         help="Location to write the Mistral model and tokenizer",
+     )
+     parser.add_argument(
+         "--skip_tokenizer",
+         action="store_true",
+         help="Skip tokenizer conversion",
+     )
+     parser.add_argument(
+         "--unquantized_model_path",
+         type=str,
+         default="mistralai/Voxtral-Mini-3B-2507",
+         help="Path to the unquantized model",
+     )
+     args = parser.parse_args()
+
+     write_model(
+         args.input_path_or_repo,
+         args.output_dir,
+         unquantized_model_path=args.unquantized_model_path,
+     )
+
+     if not args.skip_tokenizer:
+         write_tokenizer(
+             args.input_path_or_repo,
+             args.output_dir,
+         )
+
+
+ if __name__ == "__main__":
+     main()
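
Note: the conversion is mostly mechanical key renaming, plus undoing the RoPE permutation that HF checkpoints apply to the q/k projections. A minimal self-check sketch (assumptions: torch is installed, the script above is importable, and the forward permutation and dimensions shown here are illustrative, modeled on the usual HF Llama-style conversion):

import torch

from convert_voxtral_hf_to_mistral import permute_for_mistral_rope

def permute_for_hf_rope(tensor, n_heads, dim1, dim2):
    # Assumed forward permutation (HF Llama-style), applied when converting *to* HF.
    return tensor.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)

n_heads, head_dim, hidden = 32, 128, 3072  # illustrative dimensions
w = torch.randn(n_heads * head_dim, hidden)  # a hypothetical q_proj weight
hf = permute_for_hf_rope(w, n_heads, n_heads * head_dim, hidden)
back = permute_for_mistral_rope(hf, n_heads, n_heads * head_dim, hidden)
assert torch.equal(back, w)  # the two permutations are exact inverses

With the argparse defaults, running python convert_voxtral_hf_to_mistral.py downloads RedHatAI/Voxtral-Mini-3B-2507-FP8-dynamic and writes params.json, consolidated.safetensors, and the tokenizer files into Voxtral-Mini-3B-2507-FP8-dynamic-converted.
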
params.json CHANGED
@@ -30,5 +30,241 @@
          "downsample_factor": 4
        }
      }
+   },
+   "quantization": {
+     "config_groups": {
+       "group_0": {
+         "input_activations": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": true,
+           "group_size": null,
+           "num_bits": 8,
+           "observer": null,
+           "observer_kwargs": {},
+           "strategy": "token",
+           "symmetric": true,
+           "type": "float"
+         },
+         "output_activations": null,
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": false,
+           "group_size": null,
+           "num_bits": 8,
+           "observer": "minmax",
+           "observer_kwargs": {},
+           "strategy": "channel",
+           "symmetric": true,
+           "type": "float"
+         }
+       }
+     },
+     "format": "float-quantized",
+     "global_compression_ratio": null,
+     "ignore": [
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.0.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.1.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.2.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.3.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.4.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.5.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.6.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.7.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.8.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.9.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.10.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.11.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.12.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.13.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.14.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.15.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.16.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.17.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.18.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.19.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.20.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.21.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.22.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.23.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.24.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.25.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.26.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.27.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.28.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.29.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.30.feed_forward.w2",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wk",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wv",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wq",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.attention.wo",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.feed_forward.w1",
+       "mm_whisper_embeddings.whisper_encoder.transformer.layers.31.feed_forward.w2",
+       "output",
+       "mm_whisper_embeddings.audio_language_projection.0",
+       "mm_whisper_embeddings.audio_language_projection.2"
+     ],
+     "kv_cache_scheme": null,
+     "quant_method": "compressed-tensors",
+     "quantization_status": "compressed"
    }
  }
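
Note: the added quantization block declares FP8 quantization (compressed-tensors) for every Linear layer, with static per-channel weight scales and dynamic per-token activation scales. The long ignore list keeps the 32-layer Whisper encoder, the LM head (output), and the audio-to-language projection in the original precision. It is regular enough to regenerate programmatically; a sketch that reproduces it:

ignore = [
    f"mm_whisper_embeddings.whisper_encoder.transformer.layers.{i}.{name}"
    for i in range(32)  # Whisper encoder layers 0-31
    for name in (
        "attention.wk", "attention.wv", "attention.wq", "attention.wo",
        "feed_forward.w1", "feed_forward.w2",
    )
] + [
    "output",
    "mm_whisper_embeddings.audio_language_projection.0",
    "mm_whisper_embeddings.audio_language_projection.2",
]
assert len(ignore) == 32 * 6 + 3  # 195 entries, matching the list above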