alexmarques committed
Commit 9233f20 · verified · 1 Parent(s): 7fa8878

Update README.md

Files changed (1): README.md (+152 -31)
README.md CHANGED
@@ -135,96 +135,133 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challen

Before (old lines 135-230):

### Accuracy

#### Open LLM Leaderboard evaluation scores

<table>
  <tr><td><strong>Benchmark</strong></td><td><strong>Meta-Llama-3.1-8B-Instruct</strong></td><td><strong>Meta-Llama-3.1-8B-Instruct-quantized.w8a16 (this model)</strong></td><td><strong>Recovery</strong></td></tr>
  <tr><td>MMLU (5-shot)</td><td>68.32</td><td>68.26</td><td>99.9%</td></tr>
  <tr><td>MMLU (CoT, 0-shot)</td><td>72.83</td><td>72.44</td><td>99.5%</td></tr>
  <tr><td>ARC Challenge (0-shot)</td><td>81.40</td><td>81.40</td><td>100.0%</td></tr>
  <tr><td>GSM-8K (CoT, 8-shot, strict-match)</td><td>82.79</td><td>84.31</td><td>101.8%</td></tr>
  <tr><td>Hellaswag (10-shot)</td><td>80.47</td><td>80.48</td><td>100.0%</td></tr>
  <tr><td>Winogrande (5-shot)</td><td>78.06</td><td>77.51</td><td>99.3%</td></tr>
  <tr><td>TruthfulQA (0-shot, mc2)</td><td>54.48</td><td>54.41</td><td>99.9%</td></tr>
  <tr><td><strong>Average</strong></td><td><strong>74.05</strong></td><td><strong>74.12</strong></td><td><strong>100.1%</strong></td></tr>
</table>
@@ -307,4 +344,88 @@ lm_eval \

Before (old lines 307-310):

  --tasks truthfulqa \
  --num_fewshot 0 \
  --batch_size auto
```
 
After (new lines 135-267):

### Accuracy

<table>
  <tr><td><strong>Category</strong></td><td><strong>Benchmark</strong></td><td><strong>Meta-Llama-3.1-8B-Instruct</strong></td><td><strong>Meta-Llama-3.1-8B-Instruct-quantized.w8a8 (this model)</strong></td><td><strong>Recovery</strong></td></tr>
  <tr><td rowspan="5"><strong>OpenLLM v1</strong></td></tr>
  <tr><td>Hellaswag (10-shot)</td><td>80.47</td><td>80.48</td><td>100.0%</td></tr>
  <tr><td>Winogrande (5-shot)</td><td>78.06</td><td>77.51</td><td>99.3%</td></tr>
  <tr><td>TruthfulQA (0-shot, mc2)</td><td>54.48</td><td>54.41</td><td>99.9%</td></tr>
  <tr><td><strong>Average</strong></td><td><strong>74.05</strong></td><td><strong>74.12</strong></td><td><strong>100.1%</strong></td></tr>
  <tr><td rowspan="9"><strong>Multilingual</strong></td><td>Portuguese MMLU (5-shot)</td><td>59.96</td><td>59.79</td><td>99.8%</td></tr>
  <tr><td>Spanish MMLU (5-shot)</td><td>60.25</td><td>59.92</td><td>99.4%</td></tr>
  <tr><td>Italian MMLU (5-shot)</td><td>59.23</td><td>59.25</td><td>100.0%</td></tr>
  <tr><td>German MMLU (5-shot)</td><td>58.63</td><td>58.31</td><td>99.5%</td></tr>
  <tr><td>French MMLU (5-shot)</td><td>59.65</td><td>59.57</td><td>99.9%</td></tr>
  <tr><td>Hindi MMLU (5-shot)</td><td>50.10</td><td>49.97</td><td>99.7%</td></tr>
  <tr><td>Thai MMLU (5-shot)</td><td>49.12</td><td>49.09</td><td>99.9%</td></tr>
</table>
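
For reference, the Recovery column reports the quantized model's score as a percentage of the unquantized baseline's score, which the rows above bear out; a quick check (a sketch added here, not part of the commit):

```
# Recovery = 100 * quantized / baseline, e.g. the TruthfulQA and Average rows
awk 'BEGIN {
  printf "TruthfulQA: %.1f%%\n", 100 * 54.41 / 54.48   # -> 99.9%
  printf "Average:    %.1f%%\n", 100 * 74.12 / 74.05   # -> 100.1%
}'
```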
 
After (new lines 344-431):

  --tasks truthfulqa \
  --num_fewshot 0 \
  --batch_size auto
```

#### MMLU Portuguese
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_pt_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Spanish
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_es_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Italian
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_it_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU German
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_de_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU French
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_fr_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Hindi
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_hi_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Thai
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_th_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```
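
The seven commands above differ only in the task suffix, so they can be scripted; a minimal convenience sketch (not part of the commit, and assuming the `mmlu_<lang>_llama_3.1_instruct` task names above are registered in the installed lm-evaluation-harness):

```
# Run all seven multilingual MMLU evaluations in sequence;
# the loop substitutes each language code into the task name.
for lang in pt es it de fr hi th; do
  lm_eval \
    --model vllm \
    --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
    --tasks "mmlu_${lang}_llama_3.1_instruct" \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5 \
    --batch_size auto
done
```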