bowenbaoamd committed on
Commit
6a03bf5
·
verified ·
1 Parent(s): 19c1d68

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "/model/mistralai/Mixtral-8x7B-Instruct-v0.1",
3
  "architectures": [
4
  "MixtralForCausalLM"
5
  ],
@@ -30,69 +30,37 @@
30
  "weight_merge_groups": null
31
  },
32
  "ignored_layers": [
33
- "model.layers.0.self_attn.o_proj",
34
  "model.layers.0.block_sparse_moe.gate",
35
- "model.layers.1.self_attn.o_proj",
36
  "model.layers.1.block_sparse_moe.gate",
37
- "model.layers.2.self_attn.o_proj",
38
  "model.layers.2.block_sparse_moe.gate",
39
- "model.layers.3.self_attn.o_proj",
40
  "model.layers.3.block_sparse_moe.gate",
41
- "model.layers.4.self_attn.o_proj",
42
  "model.layers.4.block_sparse_moe.gate",
43
- "model.layers.5.self_attn.o_proj",
44
  "model.layers.5.block_sparse_moe.gate",
45
- "model.layers.6.self_attn.o_proj",
46
  "model.layers.6.block_sparse_moe.gate",
47
- "model.layers.7.self_attn.o_proj",
48
  "model.layers.7.block_sparse_moe.gate",
49
- "model.layers.8.self_attn.o_proj",
50
  "model.layers.8.block_sparse_moe.gate",
51
- "model.layers.9.self_attn.o_proj",
52
  "model.layers.9.block_sparse_moe.gate",
53
- "model.layers.10.self_attn.o_proj",
54
  "model.layers.10.block_sparse_moe.gate",
55
- "model.layers.11.self_attn.o_proj",
56
  "model.layers.11.block_sparse_moe.gate",
57
- "model.layers.12.self_attn.o_proj",
58
  "model.layers.12.block_sparse_moe.gate",
59
- "model.layers.13.self_attn.o_proj",
60
  "model.layers.13.block_sparse_moe.gate",
61
- "model.layers.14.self_attn.o_proj",
62
  "model.layers.14.block_sparse_moe.gate",
63
- "model.layers.15.self_attn.o_proj",
64
  "model.layers.15.block_sparse_moe.gate",
65
- "model.layers.16.self_attn.o_proj",
66
  "model.layers.16.block_sparse_moe.gate",
67
- "model.layers.17.self_attn.o_proj",
68
  "model.layers.17.block_sparse_moe.gate",
69
- "model.layers.18.self_attn.o_proj",
70
  "model.layers.18.block_sparse_moe.gate",
71
- "model.layers.19.self_attn.o_proj",
72
  "model.layers.19.block_sparse_moe.gate",
73
- "model.layers.20.self_attn.o_proj",
74
  "model.layers.20.block_sparse_moe.gate",
75
- "model.layers.21.self_attn.o_proj",
76
  "model.layers.21.block_sparse_moe.gate",
77
- "model.layers.22.self_attn.o_proj",
78
  "model.layers.22.block_sparse_moe.gate",
79
- "model.layers.23.self_attn.o_proj",
80
  "model.layers.23.block_sparse_moe.gate",
81
- "model.layers.24.self_attn.o_proj",
82
  "model.layers.24.block_sparse_moe.gate",
83
- "model.layers.25.self_attn.o_proj",
84
  "model.layers.25.block_sparse_moe.gate",
85
- "model.layers.26.self_attn.o_proj",
86
  "model.layers.26.block_sparse_moe.gate",
87
- "model.layers.27.self_attn.o_proj",
88
  "model.layers.27.block_sparse_moe.gate",
89
- "model.layers.28.self_attn.o_proj",
90
  "model.layers.28.block_sparse_moe.gate",
91
- "model.layers.29.self_attn.o_proj",
92
  "model.layers.29.block_sparse_moe.gate",
93
- "model.layers.30.self_attn.o_proj",
94
  "model.layers.30.block_sparse_moe.gate",
95
- "model.layers.31.self_attn.o_proj",
96
  "model.layers.31.block_sparse_moe.gate",
97
  "lm_head"
98
  ],
 
1
  {
2
+ "_name_or_path": "/model/mistralai/Mixtral-8x7B-Instruct-v0.1-MLCommons",
3
  "architectures": [
4
  "MixtralForCausalLM"
5
  ],
 
30
  "weight_merge_groups": null
31
  },
32
  "ignored_layers": [
 
33
  "model.layers.0.block_sparse_moe.gate",
 
34
  "model.layers.1.block_sparse_moe.gate",
 
35
  "model.layers.2.block_sparse_moe.gate",
 
36
  "model.layers.3.block_sparse_moe.gate",
 
37
  "model.layers.4.block_sparse_moe.gate",
 
38
  "model.layers.5.block_sparse_moe.gate",
 
39
  "model.layers.6.block_sparse_moe.gate",
 
40
  "model.layers.7.block_sparse_moe.gate",
 
41
  "model.layers.8.block_sparse_moe.gate",
 
42
  "model.layers.9.block_sparse_moe.gate",
 
43
  "model.layers.10.block_sparse_moe.gate",
 
44
  "model.layers.11.block_sparse_moe.gate",
 
45
  "model.layers.12.block_sparse_moe.gate",
 
46
  "model.layers.13.block_sparse_moe.gate",
 
47
  "model.layers.14.block_sparse_moe.gate",
 
48
  "model.layers.15.block_sparse_moe.gate",
 
49
  "model.layers.16.block_sparse_moe.gate",
 
50
  "model.layers.17.block_sparse_moe.gate",
 
51
  "model.layers.18.block_sparse_moe.gate",
 
52
  "model.layers.19.block_sparse_moe.gate",
 
53
  "model.layers.20.block_sparse_moe.gate",
 
54
  "model.layers.21.block_sparse_moe.gate",
 
55
  "model.layers.22.block_sparse_moe.gate",
 
56
  "model.layers.23.block_sparse_moe.gate",
 
57
  "model.layers.24.block_sparse_moe.gate",
 
58
  "model.layers.25.block_sparse_moe.gate",
 
59
  "model.layers.26.block_sparse_moe.gate",
 
60
  "model.layers.27.block_sparse_moe.gate",
 
61
  "model.layers.28.block_sparse_moe.gate",
 
62
  "model.layers.29.block_sparse_moe.gate",
 
63
  "model.layers.30.block_sparse_moe.gate",
 
64
  "model.layers.31.block_sparse_moe.gate",
65
  "lm_head"
66
  ],
model-00001-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcb950c9534c37a1a23de2cbc4cee35a1d8190f2e85aa2102b8e33242afef858
3
- size 4960111600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ecd14ec5d85f2c066dbb7f51ef8be30e0b08a203265450b45eed2a63acb1ac8
3
+ size 4951724404
model-00002-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d681596f0d5034a261fa558709e8901cc5a5898c5aa9417a8307a47d7c0242d6
3
- size 4991502916
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa6b0680351be68dcee66544094e9ca1639bb00809a922e1eb278d2190b23ae
3
+ size 4999892088
model-00003-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f00b5e2ce10654caf281d8ff1e641b53dd3cc7917bf6f4d8b578d8c6d50bc986
3
- size 4991519940
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d66cf16d3f70e368d03a4f14adde83680aab0eb8fa77cf9855df6f7c056b72
3
+ size 4983198624
model-00004-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2292011a9f7aae4705836f87599d765b4b34cf44330ab4ce2ee007a07683f2b9
3
- size 4991569892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68a5200086c3543ead430588b88bd9ee6bd733204d2cdeb4a9ab33e216cb00d8
3
+ size 4999892344
model-00005-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9056294f3a654d736cb87a7f6e96e2637d2081891855a83be34c8a69d36a9a7c
3
- size 4991503324
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b073cb23401fa7aed985e96fca66b69cc33f463d52afb9c7a4027bbd6b07ccf6
3
+ size 4999909308
model-00006-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df81de3e2a20de00ac63c9e12707b4e2caf099a3325b582e3a350056d73ba1ae
3
- size 4991586796
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02325fcd4cbeead017c342bc0dffc84611bcc1db2d174672e4534efeea72a929
3
+ size 4983181812
model-00007-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e325f381fc0a185a3506e34dd066bab362ecfddaa3626e80c5ec37283f8404d3
3
- size 4991503204
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2420bbfc5134419522aefe880a730c69ac82bcb7a754cfe63857347debd4f769
3
+ size 4999892472
model-00008-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef36f466071df886a0cac5ea8f338c4699de89cd6d9ff608576bb5a5b1b5d383
3
- size 4991520228
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d7e3cb3975f378758fb6a74c41674d694dc60ef62ed7220494b0ef04d4ffe20
3
+ size 4983198840
model-00009-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8bf4c7d74cf9125239b7b32587e4b2fefb7a99e3bd71728cae5c4e80a551f84
3
- size 4991569892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d881a918e94bb35844a2373251643d2c75940078ec9bb7fc345f0361e2434c0
3
+ size 4999892424
model-00010-of-00010.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35ef3bf088cb3893f98692dbc3504ae848ba3591274d124b3314d6936b718f7c
3
- size 2611077968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:874b967850a292cc1c0acbbcce7fb48286bea2346ed46567aead9b47948e9ad9
3
+ size 2065815212
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 47503126144
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00010-of-00010.safetensors",
@@ -84,7 +84,9 @@
84
  "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
85
  "model.layers.0.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
86
  "model.layers.0.self_attn.k_scale": "model-00001-of-00010.safetensors",
 
87
  "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
 
88
  "model.layers.0.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
89
  "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
90
  "model.layers.0.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
@@ -171,7 +173,9 @@
171
  "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
172
  "model.layers.1.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
173
  "model.layers.1.self_attn.k_scale": "model-00001-of-00010.safetensors",
 
174
  "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
 
175
  "model.layers.1.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
176
  "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
177
  "model.layers.1.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
@@ -179,12 +183,12 @@
179
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
180
  "model.layers.1.self_attn.v_proj.weight_scale": "model-00001-of-00010.safetensors",
181
  "model.layers.1.self_attn.v_scale": "model-00001-of-00010.safetensors",
182
- "model.layers.10.block_sparse_moe.experts.0.w1.input_scale": "model-00004-of-00010.safetensors",
183
- "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00010.safetensors",
184
- "model.layers.10.block_sparse_moe.experts.0.w1.weight_scale": "model-00004-of-00010.safetensors",
185
- "model.layers.10.block_sparse_moe.experts.0.w2.input_scale": "model-00004-of-00010.safetensors",
186
- "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00004-of-00010.safetensors",
187
- "model.layers.10.block_sparse_moe.experts.0.w2.weight_scale": "model-00004-of-00010.safetensors",
188
  "model.layers.10.block_sparse_moe.experts.0.w3.input_scale": "model-00004-of-00010.safetensors",
189
  "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00010.safetensors",
190
  "model.layers.10.block_sparse_moe.experts.0.w3.weight_scale": "model-00004-of-00010.safetensors",
@@ -251,20 +255,22 @@
251
  "model.layers.10.block_sparse_moe.experts.7.w3.input_scale": "model-00004-of-00010.safetensors",
252
  "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00010.safetensors",
253
  "model.layers.10.block_sparse_moe.experts.7.w3.weight_scale": "model-00004-of-00010.safetensors",
254
- "model.layers.10.block_sparse_moe.gate.weight": "model-00004-of-00010.safetensors",
255
  "model.layers.10.input_layernorm.weight": "model-00004-of-00010.safetensors",
256
  "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
257
- "model.layers.10.self_attn.k_proj.input_scale": "model-00004-of-00010.safetensors",
258
- "model.layers.10.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
259
- "model.layers.10.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
260
  "model.layers.10.self_attn.k_scale": "model-00003-of-00010.safetensors",
261
- "model.layers.10.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
262
- "model.layers.10.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
263
- "model.layers.10.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
264
- "model.layers.10.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
265
- "model.layers.10.self_attn.v_proj.input_scale": "model-00004-of-00010.safetensors",
266
- "model.layers.10.self_attn.v_proj.weight": "model-00004-of-00010.safetensors",
267
- "model.layers.10.self_attn.v_proj.weight_scale": "model-00004-of-00010.safetensors",
 
 
268
  "model.layers.10.self_attn.v_scale": "model-00003-of-00010.safetensors",
269
  "model.layers.11.block_sparse_moe.experts.0.w1.input_scale": "model-00004-of-00010.safetensors",
270
  "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00010.safetensors",
@@ -345,7 +351,9 @@
345
  "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
346
  "model.layers.11.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
347
  "model.layers.11.self_attn.k_scale": "model-00004-of-00010.safetensors",
 
348
  "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
 
349
  "model.layers.11.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
350
  "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
351
  "model.layers.11.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
@@ -432,7 +440,9 @@
432
  "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
433
  "model.layers.12.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
434
  "model.layers.12.self_attn.k_scale": "model-00004-of-00010.safetensors",
 
435
  "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
 
436
  "model.layers.12.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
437
  "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
438
  "model.layers.12.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
@@ -467,18 +477,18 @@
467
  "model.layers.13.block_sparse_moe.experts.2.w3.input_scale": "model-00004-of-00010.safetensors",
468
  "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00010.safetensors",
469
  "model.layers.13.block_sparse_moe.experts.2.w3.weight_scale": "model-00004-of-00010.safetensors",
470
- "model.layers.13.block_sparse_moe.experts.3.w1.input_scale": "model-00005-of-00010.safetensors",
471
- "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00005-of-00010.safetensors",
472
- "model.layers.13.block_sparse_moe.experts.3.w1.weight_scale": "model-00005-of-00010.safetensors",
473
- "model.layers.13.block_sparse_moe.experts.3.w2.input_scale": "model-00005-of-00010.safetensors",
474
- "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00005-of-00010.safetensors",
475
- "model.layers.13.block_sparse_moe.experts.3.w2.weight_scale": "model-00005-of-00010.safetensors",
476
- "model.layers.13.block_sparse_moe.experts.3.w3.input_scale": "model-00005-of-00010.safetensors",
477
- "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00005-of-00010.safetensors",
478
- "model.layers.13.block_sparse_moe.experts.3.w3.weight_scale": "model-00005-of-00010.safetensors",
479
- "model.layers.13.block_sparse_moe.experts.4.w1.input_scale": "model-00005-of-00010.safetensors",
480
- "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00005-of-00010.safetensors",
481
- "model.layers.13.block_sparse_moe.experts.4.w1.weight_scale": "model-00005-of-00010.safetensors",
482
  "model.layers.13.block_sparse_moe.experts.4.w2.input_scale": "model-00005-of-00010.safetensors",
483
  "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00010.safetensors",
484
  "model.layers.13.block_sparse_moe.experts.4.w2.weight_scale": "model-00005-of-00010.safetensors",
@@ -519,7 +529,9 @@
519
  "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
520
  "model.layers.13.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
521
  "model.layers.13.self_attn.k_scale": "model-00004-of-00010.safetensors",
 
522
  "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
 
523
  "model.layers.13.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
524
  "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
525
  "model.layers.13.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
@@ -606,7 +618,9 @@
606
  "model.layers.14.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
607
  "model.layers.14.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
608
  "model.layers.14.self_attn.k_scale": "model-00005-of-00010.safetensors",
 
609
  "model.layers.14.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
 
610
  "model.layers.14.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
611
  "model.layers.14.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
612
  "model.layers.14.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
@@ -693,7 +707,9 @@
693
  "model.layers.15.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
694
  "model.layers.15.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
695
  "model.layers.15.self_attn.k_scale": "model-00005-of-00010.safetensors",
 
696
  "model.layers.15.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
 
697
  "model.layers.15.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
698
  "model.layers.15.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
699
  "model.layers.15.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
@@ -758,29 +774,31 @@
758
  "model.layers.16.block_sparse_moe.experts.6.w1.input_scale": "model-00005-of-00010.safetensors",
759
  "model.layers.16.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00010.safetensors",
760
  "model.layers.16.block_sparse_moe.experts.6.w1.weight_scale": "model-00005-of-00010.safetensors",
761
- "model.layers.16.block_sparse_moe.experts.6.w2.input_scale": "model-00006-of-00010.safetensors",
762
- "model.layers.16.block_sparse_moe.experts.6.w2.weight": "model-00006-of-00010.safetensors",
763
- "model.layers.16.block_sparse_moe.experts.6.w2.weight_scale": "model-00006-of-00010.safetensors",
764
- "model.layers.16.block_sparse_moe.experts.6.w3.input_scale": "model-00006-of-00010.safetensors",
765
- "model.layers.16.block_sparse_moe.experts.6.w3.weight": "model-00006-of-00010.safetensors",
766
- "model.layers.16.block_sparse_moe.experts.6.w3.weight_scale": "model-00006-of-00010.safetensors",
767
- "model.layers.16.block_sparse_moe.experts.7.w1.input_scale": "model-00006-of-00010.safetensors",
768
- "model.layers.16.block_sparse_moe.experts.7.w1.weight": "model-00006-of-00010.safetensors",
769
- "model.layers.16.block_sparse_moe.experts.7.w1.weight_scale": "model-00006-of-00010.safetensors",
770
- "model.layers.16.block_sparse_moe.experts.7.w2.input_scale": "model-00006-of-00010.safetensors",
771
- "model.layers.16.block_sparse_moe.experts.7.w2.weight": "model-00006-of-00010.safetensors",
772
- "model.layers.16.block_sparse_moe.experts.7.w2.weight_scale": "model-00006-of-00010.safetensors",
773
- "model.layers.16.block_sparse_moe.experts.7.w3.input_scale": "model-00006-of-00010.safetensors",
774
- "model.layers.16.block_sparse_moe.experts.7.w3.weight": "model-00006-of-00010.safetensors",
775
- "model.layers.16.block_sparse_moe.experts.7.w3.weight_scale": "model-00006-of-00010.safetensors",
776
  "model.layers.16.block_sparse_moe.gate.weight": "model-00005-of-00010.safetensors",
777
- "model.layers.16.input_layernorm.weight": "model-00006-of-00010.safetensors",
778
- "model.layers.16.post_attention_layernorm.weight": "model-00006-of-00010.safetensors",
779
  "model.layers.16.self_attn.k_proj.input_scale": "model-00005-of-00010.safetensors",
780
  "model.layers.16.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
781
  "model.layers.16.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
782
  "model.layers.16.self_attn.k_scale": "model-00005-of-00010.safetensors",
 
783
  "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
 
784
  "model.layers.16.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
785
  "model.layers.16.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
786
  "model.layers.16.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
@@ -866,15 +884,17 @@
866
  "model.layers.17.self_attn.k_proj.input_scale": "model-00006-of-00010.safetensors",
867
  "model.layers.17.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
868
  "model.layers.17.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
869
- "model.layers.17.self_attn.k_scale": "model-00006-of-00010.safetensors",
 
870
  "model.layers.17.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
 
871
  "model.layers.17.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
872
  "model.layers.17.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
873
  "model.layers.17.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
874
  "model.layers.17.self_attn.v_proj.input_scale": "model-00006-of-00010.safetensors",
875
  "model.layers.17.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
876
  "model.layers.17.self_attn.v_proj.weight_scale": "model-00006-of-00010.safetensors",
877
- "model.layers.17.self_attn.v_scale": "model-00006-of-00010.safetensors",
878
  "model.layers.18.block_sparse_moe.experts.0.w1.input_scale": "model-00006-of-00010.safetensors",
879
  "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00010.safetensors",
880
  "model.layers.18.block_sparse_moe.experts.0.w1.weight_scale": "model-00006-of-00010.safetensors",
@@ -954,7 +974,9 @@
954
  "model.layers.18.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
955
  "model.layers.18.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
956
  "model.layers.18.self_attn.k_scale": "model-00006-of-00010.safetensors",
 
957
  "model.layers.18.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
 
958
  "model.layers.18.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
959
  "model.layers.18.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
960
  "model.layers.18.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
@@ -1041,7 +1063,9 @@
1041
  "model.layers.19.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
1042
  "model.layers.19.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
1043
  "model.layers.19.self_attn.k_scale": "model-00006-of-00010.safetensors",
 
1044
  "model.layers.19.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
 
1045
  "model.layers.19.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
1046
  "model.layers.19.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
1047
  "model.layers.19.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
@@ -1128,7 +1152,9 @@
1128
  "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
1129
  "model.layers.2.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
1130
  "model.layers.2.self_attn.k_scale": "model-00001-of-00010.safetensors",
 
1131
  "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
 
1132
  "model.layers.2.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
1133
  "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
1134
  "model.layers.2.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
@@ -1148,24 +1174,24 @@
1148
  "model.layers.20.block_sparse_moe.experts.1.w1.input_scale": "model-00006-of-00010.safetensors",
1149
  "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00010.safetensors",
1150
  "model.layers.20.block_sparse_moe.experts.1.w1.weight_scale": "model-00006-of-00010.safetensors",
1151
- "model.layers.20.block_sparse_moe.experts.1.w2.input_scale": "model-00007-of-00010.safetensors",
1152
- "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00007-of-00010.safetensors",
1153
- "model.layers.20.block_sparse_moe.experts.1.w2.weight_scale": "model-00007-of-00010.safetensors",
1154
- "model.layers.20.block_sparse_moe.experts.1.w3.input_scale": "model-00007-of-00010.safetensors",
1155
- "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00007-of-00010.safetensors",
1156
- "model.layers.20.block_sparse_moe.experts.1.w3.weight_scale": "model-00007-of-00010.safetensors",
1157
- "model.layers.20.block_sparse_moe.experts.2.w1.input_scale": "model-00007-of-00010.safetensors",
1158
- "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00007-of-00010.safetensors",
1159
- "model.layers.20.block_sparse_moe.experts.2.w1.weight_scale": "model-00007-of-00010.safetensors",
1160
- "model.layers.20.block_sparse_moe.experts.2.w2.input_scale": "model-00007-of-00010.safetensors",
1161
- "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00007-of-00010.safetensors",
1162
- "model.layers.20.block_sparse_moe.experts.2.w2.weight_scale": "model-00007-of-00010.safetensors",
1163
- "model.layers.20.block_sparse_moe.experts.2.w3.input_scale": "model-00007-of-00010.safetensors",
1164
- "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00007-of-00010.safetensors",
1165
- "model.layers.20.block_sparse_moe.experts.2.w3.weight_scale": "model-00007-of-00010.safetensors",
1166
- "model.layers.20.block_sparse_moe.experts.3.w1.input_scale": "model-00007-of-00010.safetensors",
1167
- "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00007-of-00010.safetensors",
1168
- "model.layers.20.block_sparse_moe.experts.3.w1.weight_scale": "model-00007-of-00010.safetensors",
1169
  "model.layers.20.block_sparse_moe.experts.3.w2.input_scale": "model-00007-of-00010.safetensors",
1170
  "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00010.safetensors",
1171
  "model.layers.20.block_sparse_moe.experts.3.w2.weight_scale": "model-00007-of-00010.safetensors",
@@ -1215,7 +1241,9 @@
1215
  "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
1216
  "model.layers.20.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
1217
  "model.layers.20.self_attn.k_scale": "model-00006-of-00010.safetensors",
 
1218
  "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
 
1219
  "model.layers.20.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
1220
  "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
1221
  "model.layers.20.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
@@ -1302,7 +1330,9 @@
1302
  "model.layers.21.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1303
  "model.layers.21.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1304
  "model.layers.21.self_attn.k_scale": "model-00007-of-00010.safetensors",
 
1305
  "model.layers.21.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
 
1306
  "model.layers.21.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1307
  "model.layers.21.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1308
  "model.layers.21.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
@@ -1389,7 +1419,9 @@
1389
  "model.layers.22.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1390
  "model.layers.22.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1391
  "model.layers.22.self_attn.k_scale": "model-00007-of-00010.safetensors",
 
1392
  "model.layers.22.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
 
1393
  "model.layers.22.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1394
  "model.layers.22.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1395
  "model.layers.22.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
@@ -1439,27 +1471,27 @@
1439
  "model.layers.23.block_sparse_moe.experts.4.w2.input_scale": "model-00007-of-00010.safetensors",
1440
  "model.layers.23.block_sparse_moe.experts.4.w2.weight": "model-00007-of-00010.safetensors",
1441
  "model.layers.23.block_sparse_moe.experts.4.w2.weight_scale": "model-00007-of-00010.safetensors",
1442
- "model.layers.23.block_sparse_moe.experts.4.w3.input_scale": "model-00008-of-00010.safetensors",
1443
- "model.layers.23.block_sparse_moe.experts.4.w3.weight": "model-00008-of-00010.safetensors",
1444
- "model.layers.23.block_sparse_moe.experts.4.w3.weight_scale": "model-00008-of-00010.safetensors",
1445
- "model.layers.23.block_sparse_moe.experts.5.w1.input_scale": "model-00008-of-00010.safetensors",
1446
- "model.layers.23.block_sparse_moe.experts.5.w1.weight": "model-00008-of-00010.safetensors",
1447
- "model.layers.23.block_sparse_moe.experts.5.w1.weight_scale": "model-00008-of-00010.safetensors",
1448
- "model.layers.23.block_sparse_moe.experts.5.w2.input_scale": "model-00008-of-00010.safetensors",
1449
- "model.layers.23.block_sparse_moe.experts.5.w2.weight": "model-00008-of-00010.safetensors",
1450
- "model.layers.23.block_sparse_moe.experts.5.w2.weight_scale": "model-00008-of-00010.safetensors",
1451
- "model.layers.23.block_sparse_moe.experts.5.w3.input_scale": "model-00008-of-00010.safetensors",
1452
- "model.layers.23.block_sparse_moe.experts.5.w3.weight": "model-00008-of-00010.safetensors",
1453
- "model.layers.23.block_sparse_moe.experts.5.w3.weight_scale": "model-00008-of-00010.safetensors",
1454
- "model.layers.23.block_sparse_moe.experts.6.w1.input_scale": "model-00008-of-00010.safetensors",
1455
- "model.layers.23.block_sparse_moe.experts.6.w1.weight": "model-00008-of-00010.safetensors",
1456
- "model.layers.23.block_sparse_moe.experts.6.w1.weight_scale": "model-00008-of-00010.safetensors",
1457
- "model.layers.23.block_sparse_moe.experts.6.w2.input_scale": "model-00008-of-00010.safetensors",
1458
- "model.layers.23.block_sparse_moe.experts.6.w2.weight": "model-00008-of-00010.safetensors",
1459
- "model.layers.23.block_sparse_moe.experts.6.w2.weight_scale": "model-00008-of-00010.safetensors",
1460
- "model.layers.23.block_sparse_moe.experts.6.w3.input_scale": "model-00008-of-00010.safetensors",
1461
- "model.layers.23.block_sparse_moe.experts.6.w3.weight": "model-00008-of-00010.safetensors",
1462
- "model.layers.23.block_sparse_moe.experts.6.w3.weight_scale": "model-00008-of-00010.safetensors",
1463
  "model.layers.23.block_sparse_moe.experts.7.w1.input_scale": "model-00008-of-00010.safetensors",
1464
  "model.layers.23.block_sparse_moe.experts.7.w1.weight": "model-00008-of-00010.safetensors",
1465
  "model.layers.23.block_sparse_moe.experts.7.w1.weight_scale": "model-00008-of-00010.safetensors",
@@ -1476,7 +1508,9 @@
1476
  "model.layers.23.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1477
  "model.layers.23.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1478
  "model.layers.23.self_attn.k_scale": "model-00007-of-00010.safetensors",
 
1479
  "model.layers.23.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
 
1480
  "model.layers.23.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1481
  "model.layers.23.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1482
  "model.layers.23.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
@@ -1563,7 +1597,9 @@
1563
  "model.layers.24.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1564
  "model.layers.24.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1565
  "model.layers.24.self_attn.k_scale": "model-00008-of-00010.safetensors",
 
1566
  "model.layers.24.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
 
1567
  "model.layers.24.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1568
  "model.layers.24.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1569
  "model.layers.24.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
@@ -1650,7 +1686,9 @@
1650
  "model.layers.25.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1651
  "model.layers.25.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1652
  "model.layers.25.self_attn.k_scale": "model-00008-of-00010.safetensors",
 
1653
  "model.layers.25.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
 
1654
  "model.layers.25.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1655
  "model.layers.25.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1656
  "model.layers.25.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
@@ -1737,7 +1775,9 @@
1737
  "model.layers.26.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1738
  "model.layers.26.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1739
  "model.layers.26.self_attn.k_scale": "model-00008-of-00010.safetensors",
 
1740
  "model.layers.26.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
 
1741
  "model.layers.26.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1742
  "model.layers.26.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1743
  "model.layers.26.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
@@ -1745,27 +1785,27 @@
1745
  "model.layers.26.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
1746
  "model.layers.26.self_attn.v_proj.weight_scale": "model-00008-of-00010.safetensors",
1747
  "model.layers.26.self_attn.v_scale": "model-00008-of-00010.safetensors",
1748
- "model.layers.27.block_sparse_moe.experts.0.w1.input_scale": "model-00009-of-00010.safetensors",
1749
- "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00010.safetensors",
1750
- "model.layers.27.block_sparse_moe.experts.0.w1.weight_scale": "model-00009-of-00010.safetensors",
1751
- "model.layers.27.block_sparse_moe.experts.0.w2.input_scale": "model-00009-of-00010.safetensors",
1752
- "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00009-of-00010.safetensors",
1753
- "model.layers.27.block_sparse_moe.experts.0.w2.weight_scale": "model-00009-of-00010.safetensors",
1754
- "model.layers.27.block_sparse_moe.experts.0.w3.input_scale": "model-00009-of-00010.safetensors",
1755
- "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00009-of-00010.safetensors",
1756
- "model.layers.27.block_sparse_moe.experts.0.w3.weight_scale": "model-00009-of-00010.safetensors",
1757
- "model.layers.27.block_sparse_moe.experts.1.w1.input_scale": "model-00009-of-00010.safetensors",
1758
- "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00009-of-00010.safetensors",
1759
- "model.layers.27.block_sparse_moe.experts.1.w1.weight_scale": "model-00009-of-00010.safetensors",
1760
- "model.layers.27.block_sparse_moe.experts.1.w2.input_scale": "model-00009-of-00010.safetensors",
1761
- "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00009-of-00010.safetensors",
1762
- "model.layers.27.block_sparse_moe.experts.1.w2.weight_scale": "model-00009-of-00010.safetensors",
1763
- "model.layers.27.block_sparse_moe.experts.1.w3.input_scale": "model-00009-of-00010.safetensors",
1764
- "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00009-of-00010.safetensors",
1765
- "model.layers.27.block_sparse_moe.experts.1.w3.weight_scale": "model-00009-of-00010.safetensors",
1766
- "model.layers.27.block_sparse_moe.experts.2.w1.input_scale": "model-00009-of-00010.safetensors",
1767
- "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00009-of-00010.safetensors",
1768
- "model.layers.27.block_sparse_moe.experts.2.w1.weight_scale": "model-00009-of-00010.safetensors",
1769
  "model.layers.27.block_sparse_moe.experts.2.w2.input_scale": "model-00009-of-00010.safetensors",
1770
  "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00010.safetensors",
1771
  "model.layers.27.block_sparse_moe.experts.2.w2.weight_scale": "model-00009-of-00010.safetensors",
@@ -1817,20 +1857,22 @@
1817
  "model.layers.27.block_sparse_moe.experts.7.w3.input_scale": "model-00009-of-00010.safetensors",
1818
  "model.layers.27.block_sparse_moe.experts.7.w3.weight": "model-00009-of-00010.safetensors",
1819
  "model.layers.27.block_sparse_moe.experts.7.w3.weight_scale": "model-00009-of-00010.safetensors",
1820
- "model.layers.27.block_sparse_moe.gate.weight": "model-00009-of-00010.safetensors",
1821
  "model.layers.27.input_layernorm.weight": "model-00009-of-00010.safetensors",
1822
  "model.layers.27.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
1823
- "model.layers.27.self_attn.k_proj.input_scale": "model-00009-of-00010.safetensors",
1824
- "model.layers.27.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
1825
- "model.layers.27.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
1826
  "model.layers.27.self_attn.k_scale": "model-00008-of-00010.safetensors",
1827
- "model.layers.27.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
1828
- "model.layers.27.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
1829
- "model.layers.27.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
1830
- "model.layers.27.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
1831
- "model.layers.27.self_attn.v_proj.input_scale": "model-00009-of-00010.safetensors",
1832
- "model.layers.27.self_attn.v_proj.weight": "model-00009-of-00010.safetensors",
1833
- "model.layers.27.self_attn.v_proj.weight_scale": "model-00009-of-00010.safetensors",
 
 
1834
  "model.layers.27.self_attn.v_scale": "model-00008-of-00010.safetensors",
1835
  "model.layers.28.block_sparse_moe.experts.0.w1.input_scale": "model-00009-of-00010.safetensors",
1836
  "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00010.safetensors",
@@ -1911,7 +1953,9 @@
1911
  "model.layers.28.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
1912
  "model.layers.28.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
1913
  "model.layers.28.self_attn.k_scale": "model-00009-of-00010.safetensors",
 
1914
  "model.layers.28.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
 
1915
  "model.layers.28.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
1916
  "model.layers.28.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
1917
  "model.layers.28.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
@@ -1998,7 +2042,9 @@
1998
  "model.layers.29.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
1999
  "model.layers.29.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
2000
  "model.layers.29.self_attn.k_scale": "model-00009-of-00010.safetensors",
 
2001
  "model.layers.29.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
 
2002
  "model.layers.29.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
2003
  "model.layers.29.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
2004
  "model.layers.29.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
@@ -2018,9 +2064,9 @@
2018
  "model.layers.3.block_sparse_moe.experts.1.w1.input_scale": "model-00001-of-00010.safetensors",
2019
  "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00010.safetensors",
2020
  "model.layers.3.block_sparse_moe.experts.1.w1.weight_scale": "model-00001-of-00010.safetensors",
2021
- "model.layers.3.block_sparse_moe.experts.1.w2.input_scale": "model-00002-of-00010.safetensors",
2022
- "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00002-of-00010.safetensors",
2023
- "model.layers.3.block_sparse_moe.experts.1.w2.weight_scale": "model-00002-of-00010.safetensors",
2024
  "model.layers.3.block_sparse_moe.experts.1.w3.input_scale": "model-00002-of-00010.safetensors",
2025
  "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00010.safetensors",
2026
  "model.layers.3.block_sparse_moe.experts.1.w3.weight_scale": "model-00002-of-00010.safetensors",
@@ -2085,7 +2131,9 @@
2085
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
2086
  "model.layers.3.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
2087
  "model.layers.3.self_attn.k_scale": "model-00001-of-00010.safetensors",
 
2088
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
 
2089
  "model.layers.3.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
2090
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
2091
  "model.layers.3.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
@@ -2120,33 +2168,33 @@
2120
  "model.layers.30.block_sparse_moe.experts.2.w3.input_scale": "model-00009-of-00010.safetensors",
2121
  "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00010.safetensors",
2122
  "model.layers.30.block_sparse_moe.experts.2.w3.weight_scale": "model-00009-of-00010.safetensors",
2123
- "model.layers.30.block_sparse_moe.experts.3.w1.input_scale": "model-00010-of-00010.safetensors",
2124
- "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00010-of-00010.safetensors",
2125
- "model.layers.30.block_sparse_moe.experts.3.w1.weight_scale": "model-00010-of-00010.safetensors",
2126
- "model.layers.30.block_sparse_moe.experts.3.w2.input_scale": "model-00010-of-00010.safetensors",
2127
- "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00010-of-00010.safetensors",
2128
- "model.layers.30.block_sparse_moe.experts.3.w2.weight_scale": "model-00010-of-00010.safetensors",
2129
- "model.layers.30.block_sparse_moe.experts.3.w3.input_scale": "model-00010-of-00010.safetensors",
2130
- "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00010-of-00010.safetensors",
2131
- "model.layers.30.block_sparse_moe.experts.3.w3.weight_scale": "model-00010-of-00010.safetensors",
2132
- "model.layers.30.block_sparse_moe.experts.4.w1.input_scale": "model-00010-of-00010.safetensors",
2133
- "model.layers.30.block_sparse_moe.experts.4.w1.weight": "model-00010-of-00010.safetensors",
2134
- "model.layers.30.block_sparse_moe.experts.4.w1.weight_scale": "model-00010-of-00010.safetensors",
2135
- "model.layers.30.block_sparse_moe.experts.4.w2.input_scale": "model-00010-of-00010.safetensors",
2136
- "model.layers.30.block_sparse_moe.experts.4.w2.weight": "model-00010-of-00010.safetensors",
2137
- "model.layers.30.block_sparse_moe.experts.4.w2.weight_scale": "model-00010-of-00010.safetensors",
2138
- "model.layers.30.block_sparse_moe.experts.4.w3.input_scale": "model-00010-of-00010.safetensors",
2139
- "model.layers.30.block_sparse_moe.experts.4.w3.weight": "model-00010-of-00010.safetensors",
2140
- "model.layers.30.block_sparse_moe.experts.4.w3.weight_scale": "model-00010-of-00010.safetensors",
2141
- "model.layers.30.block_sparse_moe.experts.5.w1.input_scale": "model-00010-of-00010.safetensors",
2142
- "model.layers.30.block_sparse_moe.experts.5.w1.weight": "model-00010-of-00010.safetensors",
2143
- "model.layers.30.block_sparse_moe.experts.5.w1.weight_scale": "model-00010-of-00010.safetensors",
2144
- "model.layers.30.block_sparse_moe.experts.5.w2.input_scale": "model-00010-of-00010.safetensors",
2145
- "model.layers.30.block_sparse_moe.experts.5.w2.weight": "model-00010-of-00010.safetensors",
2146
- "model.layers.30.block_sparse_moe.experts.5.w2.weight_scale": "model-00010-of-00010.safetensors",
2147
- "model.layers.30.block_sparse_moe.experts.5.w3.input_scale": "model-00010-of-00010.safetensors",
2148
- "model.layers.30.block_sparse_moe.experts.5.w3.weight": "model-00010-of-00010.safetensors",
2149
- "model.layers.30.block_sparse_moe.experts.5.w3.weight_scale": "model-00010-of-00010.safetensors",
2150
  "model.layers.30.block_sparse_moe.experts.6.w1.input_scale": "model-00010-of-00010.safetensors",
2151
  "model.layers.30.block_sparse_moe.experts.6.w1.weight": "model-00010-of-00010.safetensors",
2152
  "model.layers.30.block_sparse_moe.experts.6.w1.weight_scale": "model-00010-of-00010.safetensors",
@@ -2172,7 +2220,9 @@
2172
  "model.layers.30.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
2173
  "model.layers.30.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
2174
  "model.layers.30.self_attn.k_scale": "model-00009-of-00010.safetensors",
 
2175
  "model.layers.30.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
 
2176
  "model.layers.30.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
2177
  "model.layers.30.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
2178
  "model.layers.30.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
@@ -2259,7 +2309,9 @@
2259
  "model.layers.31.self_attn.k_proj.weight": "model-00010-of-00010.safetensors",
2260
  "model.layers.31.self_attn.k_proj.weight_scale": "model-00010-of-00010.safetensors",
2261
  "model.layers.31.self_attn.k_scale": "model-00010-of-00010.safetensors",
 
2262
  "model.layers.31.self_attn.o_proj.weight": "model-00010-of-00010.safetensors",
 
2263
  "model.layers.31.self_attn.q_proj.input_scale": "model-00010-of-00010.safetensors",
2264
  "model.layers.31.self_attn.q_proj.weight": "model-00010-of-00010.safetensors",
2265
  "model.layers.31.self_attn.q_proj.weight_scale": "model-00010-of-00010.safetensors",
@@ -2346,7 +2398,9 @@
2346
  "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2347
  "model.layers.4.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2348
  "model.layers.4.self_attn.k_scale": "model-00002-of-00010.safetensors",
 
2349
  "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
 
2350
  "model.layers.4.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2351
  "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2352
  "model.layers.4.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
@@ -2433,7 +2487,9 @@
2433
  "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2434
  "model.layers.5.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2435
  "model.layers.5.self_attn.k_scale": "model-00002-of-00010.safetensors",
 
2436
  "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
 
2437
  "model.layers.5.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2438
  "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2439
  "model.layers.5.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
@@ -2483,12 +2539,12 @@
2483
  "model.layers.6.block_sparse_moe.experts.4.w2.input_scale": "model-00002-of-00010.safetensors",
2484
  "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00010.safetensors",
2485
  "model.layers.6.block_sparse_moe.experts.4.w2.weight_scale": "model-00002-of-00010.safetensors",
2486
- "model.layers.6.block_sparse_moe.experts.4.w3.input_scale": "model-00003-of-00010.safetensors",
2487
- "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00003-of-00010.safetensors",
2488
- "model.layers.6.block_sparse_moe.experts.4.w3.weight_scale": "model-00003-of-00010.safetensors",
2489
- "model.layers.6.block_sparse_moe.experts.5.w1.input_scale": "model-00003-of-00010.safetensors",
2490
- "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00003-of-00010.safetensors",
2491
- "model.layers.6.block_sparse_moe.experts.5.w1.weight_scale": "model-00003-of-00010.safetensors",
2492
  "model.layers.6.block_sparse_moe.experts.5.w2.input_scale": "model-00003-of-00010.safetensors",
2493
  "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00010.safetensors",
2494
  "model.layers.6.block_sparse_moe.experts.5.w2.weight_scale": "model-00003-of-00010.safetensors",
@@ -2520,7 +2576,9 @@
2520
  "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2521
  "model.layers.6.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2522
  "model.layers.6.self_attn.k_scale": "model-00002-of-00010.safetensors",
 
2523
  "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
 
2524
  "model.layers.6.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2525
  "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2526
  "model.layers.6.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
@@ -2607,7 +2665,9 @@
2607
  "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2608
  "model.layers.7.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2609
  "model.layers.7.self_attn.k_scale": "model-00003-of-00010.safetensors",
 
2610
  "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
 
2611
  "model.layers.7.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2612
  "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2613
  "model.layers.7.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
@@ -2694,7 +2754,9 @@
2694
  "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2695
  "model.layers.8.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2696
  "model.layers.8.self_attn.k_scale": "model-00003-of-00010.safetensors",
 
2697
  "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
 
2698
  "model.layers.8.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2699
  "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2700
  "model.layers.8.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
@@ -2781,7 +2843,9 @@
2781
  "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2782
  "model.layers.9.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2783
  "model.layers.9.self_attn.k_scale": "model-00003-of-00010.safetensors",
 
2784
  "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
 
2785
  "model.layers.9.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2786
  "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2787
  "model.layers.9.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 46966255232
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00010-of-00010.safetensors",
 
84
  "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
85
  "model.layers.0.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
86
  "model.layers.0.self_attn.k_scale": "model-00001-of-00010.safetensors",
87
+ "model.layers.0.self_attn.o_proj.input_scale": "model-00001-of-00010.safetensors",
88
  "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
89
+ "model.layers.0.self_attn.o_proj.weight_scale": "model-00001-of-00010.safetensors",
90
  "model.layers.0.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
91
  "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
92
  "model.layers.0.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
 
173
  "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
174
  "model.layers.1.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
175
  "model.layers.1.self_attn.k_scale": "model-00001-of-00010.safetensors",
176
+ "model.layers.1.self_attn.o_proj.input_scale": "model-00001-of-00010.safetensors",
177
  "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
178
+ "model.layers.1.self_attn.o_proj.weight_scale": "model-00001-of-00010.safetensors",
179
  "model.layers.1.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
180
  "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
181
  "model.layers.1.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
 
183
  "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00010.safetensors",
184
  "model.layers.1.self_attn.v_proj.weight_scale": "model-00001-of-00010.safetensors",
185
  "model.layers.1.self_attn.v_scale": "model-00001-of-00010.safetensors",
186
+ "model.layers.10.block_sparse_moe.experts.0.w1.input_scale": "model-00003-of-00010.safetensors",
187
+ "model.layers.10.block_sparse_moe.experts.0.w1.weight": "model-00003-of-00010.safetensors",
188
+ "model.layers.10.block_sparse_moe.experts.0.w1.weight_scale": "model-00003-of-00010.safetensors",
189
+ "model.layers.10.block_sparse_moe.experts.0.w2.input_scale": "model-00003-of-00010.safetensors",
190
+ "model.layers.10.block_sparse_moe.experts.0.w2.weight": "model-00003-of-00010.safetensors",
191
+ "model.layers.10.block_sparse_moe.experts.0.w2.weight_scale": "model-00003-of-00010.safetensors",
192
  "model.layers.10.block_sparse_moe.experts.0.w3.input_scale": "model-00004-of-00010.safetensors",
193
  "model.layers.10.block_sparse_moe.experts.0.w3.weight": "model-00004-of-00010.safetensors",
194
  "model.layers.10.block_sparse_moe.experts.0.w3.weight_scale": "model-00004-of-00010.safetensors",
 
255
  "model.layers.10.block_sparse_moe.experts.7.w3.input_scale": "model-00004-of-00010.safetensors",
256
  "model.layers.10.block_sparse_moe.experts.7.w3.weight": "model-00004-of-00010.safetensors",
257
  "model.layers.10.block_sparse_moe.experts.7.w3.weight_scale": "model-00004-of-00010.safetensors",
258
+ "model.layers.10.block_sparse_moe.gate.weight": "model-00003-of-00010.safetensors",
259
  "model.layers.10.input_layernorm.weight": "model-00004-of-00010.safetensors",
260
  "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00010.safetensors",
261
+ "model.layers.10.self_attn.k_proj.input_scale": "model-00003-of-00010.safetensors",
262
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
263
+ "model.layers.10.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
264
  "model.layers.10.self_attn.k_scale": "model-00003-of-00010.safetensors",
265
+ "model.layers.10.self_attn.o_proj.input_scale": "model-00003-of-00010.safetensors",
266
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
267
+ "model.layers.10.self_attn.o_proj.weight_scale": "model-00003-of-00010.safetensors",
268
+ "model.layers.10.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
269
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
270
+ "model.layers.10.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
271
+ "model.layers.10.self_attn.v_proj.input_scale": "model-00003-of-00010.safetensors",
272
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00010.safetensors",
273
+ "model.layers.10.self_attn.v_proj.weight_scale": "model-00003-of-00010.safetensors",
274
  "model.layers.10.self_attn.v_scale": "model-00003-of-00010.safetensors",
275
  "model.layers.11.block_sparse_moe.experts.0.w1.input_scale": "model-00004-of-00010.safetensors",
276
  "model.layers.11.block_sparse_moe.experts.0.w1.weight": "model-00004-of-00010.safetensors",
 
351
  "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
352
  "model.layers.11.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
353
  "model.layers.11.self_attn.k_scale": "model-00004-of-00010.safetensors",
354
+ "model.layers.11.self_attn.o_proj.input_scale": "model-00004-of-00010.safetensors",
355
  "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
356
+ "model.layers.11.self_attn.o_proj.weight_scale": "model-00004-of-00010.safetensors",
357
  "model.layers.11.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
358
  "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
359
  "model.layers.11.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
 
440
  "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
441
  "model.layers.12.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
442
  "model.layers.12.self_attn.k_scale": "model-00004-of-00010.safetensors",
443
+ "model.layers.12.self_attn.o_proj.input_scale": "model-00004-of-00010.safetensors",
444
  "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
445
+ "model.layers.12.self_attn.o_proj.weight_scale": "model-00004-of-00010.safetensors",
446
  "model.layers.12.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
447
  "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
448
  "model.layers.12.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
 
477
  "model.layers.13.block_sparse_moe.experts.2.w3.input_scale": "model-00004-of-00010.safetensors",
478
  "model.layers.13.block_sparse_moe.experts.2.w3.weight": "model-00004-of-00010.safetensors",
479
  "model.layers.13.block_sparse_moe.experts.2.w3.weight_scale": "model-00004-of-00010.safetensors",
480
+ "model.layers.13.block_sparse_moe.experts.3.w1.input_scale": "model-00004-of-00010.safetensors",
481
+ "model.layers.13.block_sparse_moe.experts.3.w1.weight": "model-00004-of-00010.safetensors",
482
+ "model.layers.13.block_sparse_moe.experts.3.w1.weight_scale": "model-00004-of-00010.safetensors",
483
+ "model.layers.13.block_sparse_moe.experts.3.w2.input_scale": "model-00004-of-00010.safetensors",
484
+ "model.layers.13.block_sparse_moe.experts.3.w2.weight": "model-00004-of-00010.safetensors",
485
+ "model.layers.13.block_sparse_moe.experts.3.w2.weight_scale": "model-00004-of-00010.safetensors",
486
+ "model.layers.13.block_sparse_moe.experts.3.w3.input_scale": "model-00004-of-00010.safetensors",
487
+ "model.layers.13.block_sparse_moe.experts.3.w3.weight": "model-00004-of-00010.safetensors",
488
+ "model.layers.13.block_sparse_moe.experts.3.w3.weight_scale": "model-00004-of-00010.safetensors",
489
+ "model.layers.13.block_sparse_moe.experts.4.w1.input_scale": "model-00004-of-00010.safetensors",
490
+ "model.layers.13.block_sparse_moe.experts.4.w1.weight": "model-00004-of-00010.safetensors",
491
+ "model.layers.13.block_sparse_moe.experts.4.w1.weight_scale": "model-00004-of-00010.safetensors",
492
  "model.layers.13.block_sparse_moe.experts.4.w2.input_scale": "model-00005-of-00010.safetensors",
493
  "model.layers.13.block_sparse_moe.experts.4.w2.weight": "model-00005-of-00010.safetensors",
494
  "model.layers.13.block_sparse_moe.experts.4.w2.weight_scale": "model-00005-of-00010.safetensors",
 
529
  "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00010.safetensors",
530
  "model.layers.13.self_attn.k_proj.weight_scale": "model-00004-of-00010.safetensors",
531
  "model.layers.13.self_attn.k_scale": "model-00004-of-00010.safetensors",
532
+ "model.layers.13.self_attn.o_proj.input_scale": "model-00004-of-00010.safetensors",
533
  "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00010.safetensors",
534
+ "model.layers.13.self_attn.o_proj.weight_scale": "model-00004-of-00010.safetensors",
535
  "model.layers.13.self_attn.q_proj.input_scale": "model-00004-of-00010.safetensors",
536
  "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00010.safetensors",
537
  "model.layers.13.self_attn.q_proj.weight_scale": "model-00004-of-00010.safetensors",
 
618
  "model.layers.14.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
619
  "model.layers.14.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
620
  "model.layers.14.self_attn.k_scale": "model-00005-of-00010.safetensors",
621
+ "model.layers.14.self_attn.o_proj.input_scale": "model-00005-of-00010.safetensors",
622
  "model.layers.14.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
623
+ "model.layers.14.self_attn.o_proj.weight_scale": "model-00005-of-00010.safetensors",
624
  "model.layers.14.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
625
  "model.layers.14.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
626
  "model.layers.14.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
 
707
  "model.layers.15.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
708
  "model.layers.15.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
709
  "model.layers.15.self_attn.k_scale": "model-00005-of-00010.safetensors",
710
+ "model.layers.15.self_attn.o_proj.input_scale": "model-00005-of-00010.safetensors",
711
  "model.layers.15.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
712
+ "model.layers.15.self_attn.o_proj.weight_scale": "model-00005-of-00010.safetensors",
713
  "model.layers.15.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
714
  "model.layers.15.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
715
  "model.layers.15.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
 
774
  "model.layers.16.block_sparse_moe.experts.6.w1.input_scale": "model-00005-of-00010.safetensors",
775
  "model.layers.16.block_sparse_moe.experts.6.w1.weight": "model-00005-of-00010.safetensors",
776
  "model.layers.16.block_sparse_moe.experts.6.w1.weight_scale": "model-00005-of-00010.safetensors",
777
+ "model.layers.16.block_sparse_moe.experts.6.w2.input_scale": "model-00005-of-00010.safetensors",
778
+ "model.layers.16.block_sparse_moe.experts.6.w2.weight": "model-00005-of-00010.safetensors",
779
+ "model.layers.16.block_sparse_moe.experts.6.w2.weight_scale": "model-00005-of-00010.safetensors",
780
+ "model.layers.16.block_sparse_moe.experts.6.w3.input_scale": "model-00005-of-00010.safetensors",
781
+ "model.layers.16.block_sparse_moe.experts.6.w3.weight": "model-00005-of-00010.safetensors",
782
+ "model.layers.16.block_sparse_moe.experts.6.w3.weight_scale": "model-00005-of-00010.safetensors",
783
+ "model.layers.16.block_sparse_moe.experts.7.w1.input_scale": "model-00005-of-00010.safetensors",
784
+ "model.layers.16.block_sparse_moe.experts.7.w1.weight": "model-00005-of-00010.safetensors",
785
+ "model.layers.16.block_sparse_moe.experts.7.w1.weight_scale": "model-00005-of-00010.safetensors",
786
+ "model.layers.16.block_sparse_moe.experts.7.w2.input_scale": "model-00005-of-00010.safetensors",
787
+ "model.layers.16.block_sparse_moe.experts.7.w2.weight": "model-00005-of-00010.safetensors",
788
+ "model.layers.16.block_sparse_moe.experts.7.w2.weight_scale": "model-00005-of-00010.safetensors",
789
+ "model.layers.16.block_sparse_moe.experts.7.w3.input_scale": "model-00005-of-00010.safetensors",
790
+ "model.layers.16.block_sparse_moe.experts.7.w3.weight": "model-00005-of-00010.safetensors",
791
+ "model.layers.16.block_sparse_moe.experts.7.w3.weight_scale": "model-00005-of-00010.safetensors",
792
  "model.layers.16.block_sparse_moe.gate.weight": "model-00005-of-00010.safetensors",
793
+ "model.layers.16.input_layernorm.weight": "model-00005-of-00010.safetensors",
794
+ "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00010.safetensors",
795
  "model.layers.16.self_attn.k_proj.input_scale": "model-00005-of-00010.safetensors",
796
  "model.layers.16.self_attn.k_proj.weight": "model-00005-of-00010.safetensors",
797
  "model.layers.16.self_attn.k_proj.weight_scale": "model-00005-of-00010.safetensors",
798
  "model.layers.16.self_attn.k_scale": "model-00005-of-00010.safetensors",
799
+ "model.layers.16.self_attn.o_proj.input_scale": "model-00005-of-00010.safetensors",
800
  "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00010.safetensors",
801
+ "model.layers.16.self_attn.o_proj.weight_scale": "model-00005-of-00010.safetensors",
802
  "model.layers.16.self_attn.q_proj.input_scale": "model-00005-of-00010.safetensors",
803
  "model.layers.16.self_attn.q_proj.weight": "model-00005-of-00010.safetensors",
804
  "model.layers.16.self_attn.q_proj.weight_scale": "model-00005-of-00010.safetensors",
 
884
  "model.layers.17.self_attn.k_proj.input_scale": "model-00006-of-00010.safetensors",
885
  "model.layers.17.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
886
  "model.layers.17.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
887
+ "model.layers.17.self_attn.k_scale": "model-00005-of-00010.safetensors",
888
+ "model.layers.17.self_attn.o_proj.input_scale": "model-00006-of-00010.safetensors",
889
  "model.layers.17.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
890
+ "model.layers.17.self_attn.o_proj.weight_scale": "model-00006-of-00010.safetensors",
891
  "model.layers.17.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
892
  "model.layers.17.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
893
  "model.layers.17.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
894
  "model.layers.17.self_attn.v_proj.input_scale": "model-00006-of-00010.safetensors",
895
  "model.layers.17.self_attn.v_proj.weight": "model-00006-of-00010.safetensors",
896
  "model.layers.17.self_attn.v_proj.weight_scale": "model-00006-of-00010.safetensors",
897
+ "model.layers.17.self_attn.v_scale": "model-00005-of-00010.safetensors",
898
  "model.layers.18.block_sparse_moe.experts.0.w1.input_scale": "model-00006-of-00010.safetensors",
899
  "model.layers.18.block_sparse_moe.experts.0.w1.weight": "model-00006-of-00010.safetensors",
900
  "model.layers.18.block_sparse_moe.experts.0.w1.weight_scale": "model-00006-of-00010.safetensors",
 
974
  "model.layers.18.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
975
  "model.layers.18.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
976
  "model.layers.18.self_attn.k_scale": "model-00006-of-00010.safetensors",
977
+ "model.layers.18.self_attn.o_proj.input_scale": "model-00006-of-00010.safetensors",
978
  "model.layers.18.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
979
+ "model.layers.18.self_attn.o_proj.weight_scale": "model-00006-of-00010.safetensors",
980
  "model.layers.18.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
981
  "model.layers.18.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
982
  "model.layers.18.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
 
1063
  "model.layers.19.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
1064
  "model.layers.19.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
1065
  "model.layers.19.self_attn.k_scale": "model-00006-of-00010.safetensors",
1066
+ "model.layers.19.self_attn.o_proj.input_scale": "model-00006-of-00010.safetensors",
1067
  "model.layers.19.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
1068
+ "model.layers.19.self_attn.o_proj.weight_scale": "model-00006-of-00010.safetensors",
1069
  "model.layers.19.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
1070
  "model.layers.19.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
1071
  "model.layers.19.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
 
1152
  "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
1153
  "model.layers.2.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
1154
  "model.layers.2.self_attn.k_scale": "model-00001-of-00010.safetensors",
1155
+ "model.layers.2.self_attn.o_proj.input_scale": "model-00001-of-00010.safetensors",
1156
  "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
1157
+ "model.layers.2.self_attn.o_proj.weight_scale": "model-00001-of-00010.safetensors",
1158
  "model.layers.2.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
1159
  "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
1160
  "model.layers.2.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
 
1174
  "model.layers.20.block_sparse_moe.experts.1.w1.input_scale": "model-00006-of-00010.safetensors",
1175
  "model.layers.20.block_sparse_moe.experts.1.w1.weight": "model-00006-of-00010.safetensors",
1176
  "model.layers.20.block_sparse_moe.experts.1.w1.weight_scale": "model-00006-of-00010.safetensors",
1177
+ "model.layers.20.block_sparse_moe.experts.1.w2.input_scale": "model-00006-of-00010.safetensors",
1178
+ "model.layers.20.block_sparse_moe.experts.1.w2.weight": "model-00006-of-00010.safetensors",
1179
+ "model.layers.20.block_sparse_moe.experts.1.w2.weight_scale": "model-00006-of-00010.safetensors",
1180
+ "model.layers.20.block_sparse_moe.experts.1.w3.input_scale": "model-00006-of-00010.safetensors",
1181
+ "model.layers.20.block_sparse_moe.experts.1.w3.weight": "model-00006-of-00010.safetensors",
1182
+ "model.layers.20.block_sparse_moe.experts.1.w3.weight_scale": "model-00006-of-00010.safetensors",
1183
+ "model.layers.20.block_sparse_moe.experts.2.w1.input_scale": "model-00006-of-00010.safetensors",
1184
+ "model.layers.20.block_sparse_moe.experts.2.w1.weight": "model-00006-of-00010.safetensors",
1185
+ "model.layers.20.block_sparse_moe.experts.2.w1.weight_scale": "model-00006-of-00010.safetensors",
1186
+ "model.layers.20.block_sparse_moe.experts.2.w2.input_scale": "model-00006-of-00010.safetensors",
1187
+ "model.layers.20.block_sparse_moe.experts.2.w2.weight": "model-00006-of-00010.safetensors",
1188
+ "model.layers.20.block_sparse_moe.experts.2.w2.weight_scale": "model-00006-of-00010.safetensors",
1189
+ "model.layers.20.block_sparse_moe.experts.2.w3.input_scale": "model-00006-of-00010.safetensors",
1190
+ "model.layers.20.block_sparse_moe.experts.2.w3.weight": "model-00006-of-00010.safetensors",
1191
+ "model.layers.20.block_sparse_moe.experts.2.w3.weight_scale": "model-00006-of-00010.safetensors",
1192
+ "model.layers.20.block_sparse_moe.experts.3.w1.input_scale": "model-00006-of-00010.safetensors",
1193
+ "model.layers.20.block_sparse_moe.experts.3.w1.weight": "model-00006-of-00010.safetensors",
1194
+ "model.layers.20.block_sparse_moe.experts.3.w1.weight_scale": "model-00006-of-00010.safetensors",
1195
  "model.layers.20.block_sparse_moe.experts.3.w2.input_scale": "model-00007-of-00010.safetensors",
1196
  "model.layers.20.block_sparse_moe.experts.3.w2.weight": "model-00007-of-00010.safetensors",
1197
  "model.layers.20.block_sparse_moe.experts.3.w2.weight_scale": "model-00007-of-00010.safetensors",
 
1241
  "model.layers.20.self_attn.k_proj.weight": "model-00006-of-00010.safetensors",
1242
  "model.layers.20.self_attn.k_proj.weight_scale": "model-00006-of-00010.safetensors",
1243
  "model.layers.20.self_attn.k_scale": "model-00006-of-00010.safetensors",
1244
+ "model.layers.20.self_attn.o_proj.input_scale": "model-00006-of-00010.safetensors",
1245
  "model.layers.20.self_attn.o_proj.weight": "model-00006-of-00010.safetensors",
1246
+ "model.layers.20.self_attn.o_proj.weight_scale": "model-00006-of-00010.safetensors",
1247
  "model.layers.20.self_attn.q_proj.input_scale": "model-00006-of-00010.safetensors",
1248
  "model.layers.20.self_attn.q_proj.weight": "model-00006-of-00010.safetensors",
1249
  "model.layers.20.self_attn.q_proj.weight_scale": "model-00006-of-00010.safetensors",
 
1330
  "model.layers.21.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1331
  "model.layers.21.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1332
  "model.layers.21.self_attn.k_scale": "model-00007-of-00010.safetensors",
1333
+ "model.layers.21.self_attn.o_proj.input_scale": "model-00007-of-00010.safetensors",
1334
  "model.layers.21.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
1335
+ "model.layers.21.self_attn.o_proj.weight_scale": "model-00007-of-00010.safetensors",
1336
  "model.layers.21.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1337
  "model.layers.21.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1338
  "model.layers.21.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
 
1419
  "model.layers.22.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1420
  "model.layers.22.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1421
  "model.layers.22.self_attn.k_scale": "model-00007-of-00010.safetensors",
1422
+ "model.layers.22.self_attn.o_proj.input_scale": "model-00007-of-00010.safetensors",
1423
  "model.layers.22.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
1424
+ "model.layers.22.self_attn.o_proj.weight_scale": "model-00007-of-00010.safetensors",
1425
  "model.layers.22.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1426
  "model.layers.22.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1427
  "model.layers.22.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
 
1471
  "model.layers.23.block_sparse_moe.experts.4.w2.input_scale": "model-00007-of-00010.safetensors",
1472
  "model.layers.23.block_sparse_moe.experts.4.w2.weight": "model-00007-of-00010.safetensors",
1473
  "model.layers.23.block_sparse_moe.experts.4.w2.weight_scale": "model-00007-of-00010.safetensors",
1474
+ "model.layers.23.block_sparse_moe.experts.4.w3.input_scale": "model-00007-of-00010.safetensors",
1475
+ "model.layers.23.block_sparse_moe.experts.4.w3.weight": "model-00007-of-00010.safetensors",
1476
+ "model.layers.23.block_sparse_moe.experts.4.w3.weight_scale": "model-00007-of-00010.safetensors",
1477
+ "model.layers.23.block_sparse_moe.experts.5.w1.input_scale": "model-00007-of-00010.safetensors",
1478
+ "model.layers.23.block_sparse_moe.experts.5.w1.weight": "model-00007-of-00010.safetensors",
1479
+ "model.layers.23.block_sparse_moe.experts.5.w1.weight_scale": "model-00007-of-00010.safetensors",
1480
+ "model.layers.23.block_sparse_moe.experts.5.w2.input_scale": "model-00007-of-00010.safetensors",
1481
+ "model.layers.23.block_sparse_moe.experts.5.w2.weight": "model-00007-of-00010.safetensors",
1482
+ "model.layers.23.block_sparse_moe.experts.5.w2.weight_scale": "model-00007-of-00010.safetensors",
1483
+ "model.layers.23.block_sparse_moe.experts.5.w3.input_scale": "model-00007-of-00010.safetensors",
1484
+ "model.layers.23.block_sparse_moe.experts.5.w3.weight": "model-00007-of-00010.safetensors",
1485
+ "model.layers.23.block_sparse_moe.experts.5.w3.weight_scale": "model-00007-of-00010.safetensors",
1486
+ "model.layers.23.block_sparse_moe.experts.6.w1.input_scale": "model-00007-of-00010.safetensors",
1487
+ "model.layers.23.block_sparse_moe.experts.6.w1.weight": "model-00007-of-00010.safetensors",
1488
+ "model.layers.23.block_sparse_moe.experts.6.w1.weight_scale": "model-00007-of-00010.safetensors",
1489
+ "model.layers.23.block_sparse_moe.experts.6.w2.input_scale": "model-00007-of-00010.safetensors",
1490
+ "model.layers.23.block_sparse_moe.experts.6.w2.weight": "model-00007-of-00010.safetensors",
1491
+ "model.layers.23.block_sparse_moe.experts.6.w2.weight_scale": "model-00007-of-00010.safetensors",
1492
+ "model.layers.23.block_sparse_moe.experts.6.w3.input_scale": "model-00007-of-00010.safetensors",
1493
+ "model.layers.23.block_sparse_moe.experts.6.w3.weight": "model-00007-of-00010.safetensors",
1494
+ "model.layers.23.block_sparse_moe.experts.6.w3.weight_scale": "model-00007-of-00010.safetensors",
1495
  "model.layers.23.block_sparse_moe.experts.7.w1.input_scale": "model-00008-of-00010.safetensors",
1496
  "model.layers.23.block_sparse_moe.experts.7.w1.weight": "model-00008-of-00010.safetensors",
1497
  "model.layers.23.block_sparse_moe.experts.7.w1.weight_scale": "model-00008-of-00010.safetensors",
 
1508
  "model.layers.23.self_attn.k_proj.weight": "model-00007-of-00010.safetensors",
1509
  "model.layers.23.self_attn.k_proj.weight_scale": "model-00007-of-00010.safetensors",
1510
  "model.layers.23.self_attn.k_scale": "model-00007-of-00010.safetensors",
1511
+ "model.layers.23.self_attn.o_proj.input_scale": "model-00007-of-00010.safetensors",
1512
  "model.layers.23.self_attn.o_proj.weight": "model-00007-of-00010.safetensors",
1513
+ "model.layers.23.self_attn.o_proj.weight_scale": "model-00007-of-00010.safetensors",
1514
  "model.layers.23.self_attn.q_proj.input_scale": "model-00007-of-00010.safetensors",
1515
  "model.layers.23.self_attn.q_proj.weight": "model-00007-of-00010.safetensors",
1516
  "model.layers.23.self_attn.q_proj.weight_scale": "model-00007-of-00010.safetensors",
 
1597
  "model.layers.24.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1598
  "model.layers.24.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1599
  "model.layers.24.self_attn.k_scale": "model-00008-of-00010.safetensors",
1600
+ "model.layers.24.self_attn.o_proj.input_scale": "model-00008-of-00010.safetensors",
1601
  "model.layers.24.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
1602
+ "model.layers.24.self_attn.o_proj.weight_scale": "model-00008-of-00010.safetensors",
1603
  "model.layers.24.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1604
  "model.layers.24.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1605
  "model.layers.24.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
 
1686
  "model.layers.25.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1687
  "model.layers.25.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1688
  "model.layers.25.self_attn.k_scale": "model-00008-of-00010.safetensors",
1689
+ "model.layers.25.self_attn.o_proj.input_scale": "model-00008-of-00010.safetensors",
1690
  "model.layers.25.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
1691
+ "model.layers.25.self_attn.o_proj.weight_scale": "model-00008-of-00010.safetensors",
1692
  "model.layers.25.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1693
  "model.layers.25.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1694
  "model.layers.25.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
 
1775
  "model.layers.26.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1776
  "model.layers.26.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1777
  "model.layers.26.self_attn.k_scale": "model-00008-of-00010.safetensors",
1778
+ "model.layers.26.self_attn.o_proj.input_scale": "model-00008-of-00010.safetensors",
1779
  "model.layers.26.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
1780
+ "model.layers.26.self_attn.o_proj.weight_scale": "model-00008-of-00010.safetensors",
1781
  "model.layers.26.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1782
  "model.layers.26.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1783
  "model.layers.26.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
 
1785
  "model.layers.26.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
1786
  "model.layers.26.self_attn.v_proj.weight_scale": "model-00008-of-00010.safetensors",
1787
  "model.layers.26.self_attn.v_scale": "model-00008-of-00010.safetensors",
1788
+ "model.layers.27.block_sparse_moe.experts.0.w1.input_scale": "model-00008-of-00010.safetensors",
1789
+ "model.layers.27.block_sparse_moe.experts.0.w1.weight": "model-00008-of-00010.safetensors",
1790
+ "model.layers.27.block_sparse_moe.experts.0.w1.weight_scale": "model-00008-of-00010.safetensors",
1791
+ "model.layers.27.block_sparse_moe.experts.0.w2.input_scale": "model-00008-of-00010.safetensors",
1792
+ "model.layers.27.block_sparse_moe.experts.0.w2.weight": "model-00008-of-00010.safetensors",
1793
+ "model.layers.27.block_sparse_moe.experts.0.w2.weight_scale": "model-00008-of-00010.safetensors",
1794
+ "model.layers.27.block_sparse_moe.experts.0.w3.input_scale": "model-00008-of-00010.safetensors",
1795
+ "model.layers.27.block_sparse_moe.experts.0.w3.weight": "model-00008-of-00010.safetensors",
1796
+ "model.layers.27.block_sparse_moe.experts.0.w3.weight_scale": "model-00008-of-00010.safetensors",
1797
+ "model.layers.27.block_sparse_moe.experts.1.w1.input_scale": "model-00008-of-00010.safetensors",
1798
+ "model.layers.27.block_sparse_moe.experts.1.w1.weight": "model-00008-of-00010.safetensors",
1799
+ "model.layers.27.block_sparse_moe.experts.1.w1.weight_scale": "model-00008-of-00010.safetensors",
1800
+ "model.layers.27.block_sparse_moe.experts.1.w2.input_scale": "model-00008-of-00010.safetensors",
1801
+ "model.layers.27.block_sparse_moe.experts.1.w2.weight": "model-00008-of-00010.safetensors",
1802
+ "model.layers.27.block_sparse_moe.experts.1.w2.weight_scale": "model-00008-of-00010.safetensors",
1803
+ "model.layers.27.block_sparse_moe.experts.1.w3.input_scale": "model-00008-of-00010.safetensors",
1804
+ "model.layers.27.block_sparse_moe.experts.1.w3.weight": "model-00008-of-00010.safetensors",
1805
+ "model.layers.27.block_sparse_moe.experts.1.w3.weight_scale": "model-00008-of-00010.safetensors",
1806
+ "model.layers.27.block_sparse_moe.experts.2.w1.input_scale": "model-00008-of-00010.safetensors",
1807
+ "model.layers.27.block_sparse_moe.experts.2.w1.weight": "model-00008-of-00010.safetensors",
1808
+ "model.layers.27.block_sparse_moe.experts.2.w1.weight_scale": "model-00008-of-00010.safetensors",
1809
  "model.layers.27.block_sparse_moe.experts.2.w2.input_scale": "model-00009-of-00010.safetensors",
1810
  "model.layers.27.block_sparse_moe.experts.2.w2.weight": "model-00009-of-00010.safetensors",
1811
  "model.layers.27.block_sparse_moe.experts.2.w2.weight_scale": "model-00009-of-00010.safetensors",
 
1857
  "model.layers.27.block_sparse_moe.experts.7.w3.input_scale": "model-00009-of-00010.safetensors",
1858
  "model.layers.27.block_sparse_moe.experts.7.w3.weight": "model-00009-of-00010.safetensors",
1859
  "model.layers.27.block_sparse_moe.experts.7.w3.weight_scale": "model-00009-of-00010.safetensors",
1860
+ "model.layers.27.block_sparse_moe.gate.weight": "model-00008-of-00010.safetensors",
1861
  "model.layers.27.input_layernorm.weight": "model-00009-of-00010.safetensors",
1862
  "model.layers.27.post_attention_layernorm.weight": "model-00009-of-00010.safetensors",
1863
+ "model.layers.27.self_attn.k_proj.input_scale": "model-00008-of-00010.safetensors",
1864
+ "model.layers.27.self_attn.k_proj.weight": "model-00008-of-00010.safetensors",
1865
+ "model.layers.27.self_attn.k_proj.weight_scale": "model-00008-of-00010.safetensors",
1866
  "model.layers.27.self_attn.k_scale": "model-00008-of-00010.safetensors",
1867
+ "model.layers.27.self_attn.o_proj.input_scale": "model-00008-of-00010.safetensors",
1868
+ "model.layers.27.self_attn.o_proj.weight": "model-00008-of-00010.safetensors",
1869
+ "model.layers.27.self_attn.o_proj.weight_scale": "model-00008-of-00010.safetensors",
1870
+ "model.layers.27.self_attn.q_proj.input_scale": "model-00008-of-00010.safetensors",
1871
+ "model.layers.27.self_attn.q_proj.weight": "model-00008-of-00010.safetensors",
1872
+ "model.layers.27.self_attn.q_proj.weight_scale": "model-00008-of-00010.safetensors",
1873
+ "model.layers.27.self_attn.v_proj.input_scale": "model-00008-of-00010.safetensors",
1874
+ "model.layers.27.self_attn.v_proj.weight": "model-00008-of-00010.safetensors",
1875
+ "model.layers.27.self_attn.v_proj.weight_scale": "model-00008-of-00010.safetensors",
1876
  "model.layers.27.self_attn.v_scale": "model-00008-of-00010.safetensors",
1877
  "model.layers.28.block_sparse_moe.experts.0.w1.input_scale": "model-00009-of-00010.safetensors",
1878
  "model.layers.28.block_sparse_moe.experts.0.w1.weight": "model-00009-of-00010.safetensors",
 
1953
  "model.layers.28.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
1954
  "model.layers.28.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
1955
  "model.layers.28.self_attn.k_scale": "model-00009-of-00010.safetensors",
1956
+ "model.layers.28.self_attn.o_proj.input_scale": "model-00009-of-00010.safetensors",
1957
  "model.layers.28.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
1958
+ "model.layers.28.self_attn.o_proj.weight_scale": "model-00009-of-00010.safetensors",
1959
  "model.layers.28.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
1960
  "model.layers.28.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
1961
  "model.layers.28.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
 
2042
  "model.layers.29.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
2043
  "model.layers.29.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
2044
  "model.layers.29.self_attn.k_scale": "model-00009-of-00010.safetensors",
2045
+ "model.layers.29.self_attn.o_proj.input_scale": "model-00009-of-00010.safetensors",
2046
  "model.layers.29.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
2047
+ "model.layers.29.self_attn.o_proj.weight_scale": "model-00009-of-00010.safetensors",
2048
  "model.layers.29.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
2049
  "model.layers.29.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
2050
  "model.layers.29.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
 
2064
  "model.layers.3.block_sparse_moe.experts.1.w1.input_scale": "model-00001-of-00010.safetensors",
2065
  "model.layers.3.block_sparse_moe.experts.1.w1.weight": "model-00001-of-00010.safetensors",
2066
  "model.layers.3.block_sparse_moe.experts.1.w1.weight_scale": "model-00001-of-00010.safetensors",
2067
+ "model.layers.3.block_sparse_moe.experts.1.w2.input_scale": "model-00001-of-00010.safetensors",
2068
+ "model.layers.3.block_sparse_moe.experts.1.w2.weight": "model-00001-of-00010.safetensors",
2069
+ "model.layers.3.block_sparse_moe.experts.1.w2.weight_scale": "model-00001-of-00010.safetensors",
2070
  "model.layers.3.block_sparse_moe.experts.1.w3.input_scale": "model-00002-of-00010.safetensors",
2071
  "model.layers.3.block_sparse_moe.experts.1.w3.weight": "model-00002-of-00010.safetensors",
2072
  "model.layers.3.block_sparse_moe.experts.1.w3.weight_scale": "model-00002-of-00010.safetensors",
 
2131
  "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00010.safetensors",
2132
  "model.layers.3.self_attn.k_proj.weight_scale": "model-00001-of-00010.safetensors",
2133
  "model.layers.3.self_attn.k_scale": "model-00001-of-00010.safetensors",
2134
+ "model.layers.3.self_attn.o_proj.input_scale": "model-00001-of-00010.safetensors",
2135
  "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00010.safetensors",
2136
+ "model.layers.3.self_attn.o_proj.weight_scale": "model-00001-of-00010.safetensors",
2137
  "model.layers.3.self_attn.q_proj.input_scale": "model-00001-of-00010.safetensors",
2138
  "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00010.safetensors",
2139
  "model.layers.3.self_attn.q_proj.weight_scale": "model-00001-of-00010.safetensors",
 
2168
  "model.layers.30.block_sparse_moe.experts.2.w3.input_scale": "model-00009-of-00010.safetensors",
2169
  "model.layers.30.block_sparse_moe.experts.2.w3.weight": "model-00009-of-00010.safetensors",
2170
  "model.layers.30.block_sparse_moe.experts.2.w3.weight_scale": "model-00009-of-00010.safetensors",
2171
+ "model.layers.30.block_sparse_moe.experts.3.w1.input_scale": "model-00009-of-00010.safetensors",
2172
+ "model.layers.30.block_sparse_moe.experts.3.w1.weight": "model-00009-of-00010.safetensors",
2173
+ "model.layers.30.block_sparse_moe.experts.3.w1.weight_scale": "model-00009-of-00010.safetensors",
2174
+ "model.layers.30.block_sparse_moe.experts.3.w2.input_scale": "model-00009-of-00010.safetensors",
2175
+ "model.layers.30.block_sparse_moe.experts.3.w2.weight": "model-00009-of-00010.safetensors",
2176
+ "model.layers.30.block_sparse_moe.experts.3.w2.weight_scale": "model-00009-of-00010.safetensors",
2177
+ "model.layers.30.block_sparse_moe.experts.3.w3.input_scale": "model-00009-of-00010.safetensors",
2178
+ "model.layers.30.block_sparse_moe.experts.3.w3.weight": "model-00009-of-00010.safetensors",
2179
+ "model.layers.30.block_sparse_moe.experts.3.w3.weight_scale": "model-00009-of-00010.safetensors",
2180
+ "model.layers.30.block_sparse_moe.experts.4.w1.input_scale": "model-00009-of-00010.safetensors",
2181
+ "model.layers.30.block_sparse_moe.experts.4.w1.weight": "model-00009-of-00010.safetensors",
2182
+ "model.layers.30.block_sparse_moe.experts.4.w1.weight_scale": "model-00009-of-00010.safetensors",
2183
+ "model.layers.30.block_sparse_moe.experts.4.w2.input_scale": "model-00009-of-00010.safetensors",
2184
+ "model.layers.30.block_sparse_moe.experts.4.w2.weight": "model-00009-of-00010.safetensors",
2185
+ "model.layers.30.block_sparse_moe.experts.4.w2.weight_scale": "model-00009-of-00010.safetensors",
2186
+ "model.layers.30.block_sparse_moe.experts.4.w3.input_scale": "model-00009-of-00010.safetensors",
2187
+ "model.layers.30.block_sparse_moe.experts.4.w3.weight": "model-00009-of-00010.safetensors",
2188
+ "model.layers.30.block_sparse_moe.experts.4.w3.weight_scale": "model-00009-of-00010.safetensors",
2189
+ "model.layers.30.block_sparse_moe.experts.5.w1.input_scale": "model-00009-of-00010.safetensors",
2190
+ "model.layers.30.block_sparse_moe.experts.5.w1.weight": "model-00009-of-00010.safetensors",
2191
+ "model.layers.30.block_sparse_moe.experts.5.w1.weight_scale": "model-00009-of-00010.safetensors",
2192
+ "model.layers.30.block_sparse_moe.experts.5.w2.input_scale": "model-00009-of-00010.safetensors",
2193
+ "model.layers.30.block_sparse_moe.experts.5.w2.weight": "model-00009-of-00010.safetensors",
2194
+ "model.layers.30.block_sparse_moe.experts.5.w2.weight_scale": "model-00009-of-00010.safetensors",
2195
+ "model.layers.30.block_sparse_moe.experts.5.w3.input_scale": "model-00009-of-00010.safetensors",
2196
+ "model.layers.30.block_sparse_moe.experts.5.w3.weight": "model-00009-of-00010.safetensors",
2197
+ "model.layers.30.block_sparse_moe.experts.5.w3.weight_scale": "model-00009-of-00010.safetensors",
2198
  "model.layers.30.block_sparse_moe.experts.6.w1.input_scale": "model-00010-of-00010.safetensors",
2199
  "model.layers.30.block_sparse_moe.experts.6.w1.weight": "model-00010-of-00010.safetensors",
2200
  "model.layers.30.block_sparse_moe.experts.6.w1.weight_scale": "model-00010-of-00010.safetensors",
 
2220
  "model.layers.30.self_attn.k_proj.weight": "model-00009-of-00010.safetensors",
2221
  "model.layers.30.self_attn.k_proj.weight_scale": "model-00009-of-00010.safetensors",
2222
  "model.layers.30.self_attn.k_scale": "model-00009-of-00010.safetensors",
2223
+ "model.layers.30.self_attn.o_proj.input_scale": "model-00009-of-00010.safetensors",
2224
  "model.layers.30.self_attn.o_proj.weight": "model-00009-of-00010.safetensors",
2225
+ "model.layers.30.self_attn.o_proj.weight_scale": "model-00009-of-00010.safetensors",
2226
  "model.layers.30.self_attn.q_proj.input_scale": "model-00009-of-00010.safetensors",
2227
  "model.layers.30.self_attn.q_proj.weight": "model-00009-of-00010.safetensors",
2228
  "model.layers.30.self_attn.q_proj.weight_scale": "model-00009-of-00010.safetensors",
 
2309
  "model.layers.31.self_attn.k_proj.weight": "model-00010-of-00010.safetensors",
2310
  "model.layers.31.self_attn.k_proj.weight_scale": "model-00010-of-00010.safetensors",
2311
  "model.layers.31.self_attn.k_scale": "model-00010-of-00010.safetensors",
2312
+ "model.layers.31.self_attn.o_proj.input_scale": "model-00010-of-00010.safetensors",
2313
  "model.layers.31.self_attn.o_proj.weight": "model-00010-of-00010.safetensors",
2314
+ "model.layers.31.self_attn.o_proj.weight_scale": "model-00010-of-00010.safetensors",
2315
  "model.layers.31.self_attn.q_proj.input_scale": "model-00010-of-00010.safetensors",
2316
  "model.layers.31.self_attn.q_proj.weight": "model-00010-of-00010.safetensors",
2317
  "model.layers.31.self_attn.q_proj.weight_scale": "model-00010-of-00010.safetensors",
 
2398
  "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2399
  "model.layers.4.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2400
  "model.layers.4.self_attn.k_scale": "model-00002-of-00010.safetensors",
2401
+ "model.layers.4.self_attn.o_proj.input_scale": "model-00002-of-00010.safetensors",
2402
  "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
2403
+ "model.layers.4.self_attn.o_proj.weight_scale": "model-00002-of-00010.safetensors",
2404
  "model.layers.4.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2405
  "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2406
  "model.layers.4.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
 
2487
  "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2488
  "model.layers.5.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2489
  "model.layers.5.self_attn.k_scale": "model-00002-of-00010.safetensors",
2490
+ "model.layers.5.self_attn.o_proj.input_scale": "model-00002-of-00010.safetensors",
2491
  "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
2492
+ "model.layers.5.self_attn.o_proj.weight_scale": "model-00002-of-00010.safetensors",
2493
  "model.layers.5.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2494
  "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2495
  "model.layers.5.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
 
2539
  "model.layers.6.block_sparse_moe.experts.4.w2.input_scale": "model-00002-of-00010.safetensors",
2540
  "model.layers.6.block_sparse_moe.experts.4.w2.weight": "model-00002-of-00010.safetensors",
2541
  "model.layers.6.block_sparse_moe.experts.4.w2.weight_scale": "model-00002-of-00010.safetensors",
2542
+ "model.layers.6.block_sparse_moe.experts.4.w3.input_scale": "model-00002-of-00010.safetensors",
2543
+ "model.layers.6.block_sparse_moe.experts.4.w3.weight": "model-00002-of-00010.safetensors",
2544
+ "model.layers.6.block_sparse_moe.experts.4.w3.weight_scale": "model-00002-of-00010.safetensors",
2545
+ "model.layers.6.block_sparse_moe.experts.5.w1.input_scale": "model-00002-of-00010.safetensors",
2546
+ "model.layers.6.block_sparse_moe.experts.5.w1.weight": "model-00002-of-00010.safetensors",
2547
+ "model.layers.6.block_sparse_moe.experts.5.w1.weight_scale": "model-00002-of-00010.safetensors",
2548
  "model.layers.6.block_sparse_moe.experts.5.w2.input_scale": "model-00003-of-00010.safetensors",
2549
  "model.layers.6.block_sparse_moe.experts.5.w2.weight": "model-00003-of-00010.safetensors",
2550
  "model.layers.6.block_sparse_moe.experts.5.w2.weight_scale": "model-00003-of-00010.safetensors",
 
2576
  "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00010.safetensors",
2577
  "model.layers.6.self_attn.k_proj.weight_scale": "model-00002-of-00010.safetensors",
2578
  "model.layers.6.self_attn.k_scale": "model-00002-of-00010.safetensors",
2579
+ "model.layers.6.self_attn.o_proj.input_scale": "model-00002-of-00010.safetensors",
2580
  "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00010.safetensors",
2581
+ "model.layers.6.self_attn.o_proj.weight_scale": "model-00002-of-00010.safetensors",
2582
  "model.layers.6.self_attn.q_proj.input_scale": "model-00002-of-00010.safetensors",
2583
  "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00010.safetensors",
2584
  "model.layers.6.self_attn.q_proj.weight_scale": "model-00002-of-00010.safetensors",
 
2665
  "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2666
  "model.layers.7.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2667
  "model.layers.7.self_attn.k_scale": "model-00003-of-00010.safetensors",
2668
+ "model.layers.7.self_attn.o_proj.input_scale": "model-00003-of-00010.safetensors",
2669
  "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
2670
+ "model.layers.7.self_attn.o_proj.weight_scale": "model-00003-of-00010.safetensors",
2671
  "model.layers.7.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2672
  "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2673
  "model.layers.7.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
 
2754
  "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2755
  "model.layers.8.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2756
  "model.layers.8.self_attn.k_scale": "model-00003-of-00010.safetensors",
2757
+ "model.layers.8.self_attn.o_proj.input_scale": "model-00003-of-00010.safetensors",
2758
  "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
2759
+ "model.layers.8.self_attn.o_proj.weight_scale": "model-00003-of-00010.safetensors",
2760
  "model.layers.8.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2761
  "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2762
  "model.layers.8.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
 
2843
  "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00010.safetensors",
2844
  "model.layers.9.self_attn.k_proj.weight_scale": "model-00003-of-00010.safetensors",
2845
  "model.layers.9.self_attn.k_scale": "model-00003-of-00010.safetensors",
2846
+ "model.layers.9.self_attn.o_proj.input_scale": "model-00003-of-00010.safetensors",
2847
  "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00010.safetensors",
2848
+ "model.layers.9.self_attn.o_proj.weight_scale": "model-00003-of-00010.safetensors",
2849
  "model.layers.9.self_attn.q_proj.input_scale": "model-00003-of-00010.safetensors",
2850
  "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00010.safetensors",
2851
  "model.layers.9.self_attn.q_proj.weight_scale": "model-00003-of-00010.safetensors",
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "add_bos_token": true,
3
  "add_eos_token": false,
4
- "add_prefix_space": null,
5
  "added_tokens_decoder": {
6
  "0": {
7
  "content": "<unk>",
@@ -30,10 +30,10 @@
30
  },
31
  "additional_special_tokens": [],
32
  "bos_token": "<s>",
33
- "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.first and system_message is defined %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token}}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n{%- endfor %}\n",
34
  "clean_up_tokenization_spaces": false,
35
  "eos_token": "</s>",
36
- "legacy": false,
37
  "model_max_length": 1000000000000000019884624838656,
38
  "pad_token": null,
39
  "sp_model_kwargs": {},
 
1
  {
2
  "add_bos_token": true,
3
  "add_eos_token": false,
4
+ "add_prefix_space": true,
5
  "added_tokens_decoder": {
6
  "0": {
7
  "content": "<unk>",
 
30
  },
31
  "additional_special_tokens": [],
32
  "bos_token": "<s>",
33
+ "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
34
  "clean_up_tokenization_spaces": false,
35
  "eos_token": "</s>",
36
+ "legacy": true,
37
  "model_max_length": 1000000000000000019884624838656,
38
  "pad_token": null,
39
  "sp_model_kwargs": {},