Update README.md

README.md

@@ -135,96 +135,133 @@ This version of the lm-evaluation-harness includes versions of MMLU, ARC-Challenge…

### Accuracy

<table>
  <tr>
    <td><strong>Category</strong></td>
    <td><strong>Benchmark</strong></td>
    <td><strong>Meta-Llama-3.1-8B-Instruct</strong></td>
    <td><strong>Meta-Llama-3.1-8B-Instruct-quantized.w8a8 (this model)</strong></td>
    <td><strong>Recovery</strong></td>
  </tr>
  <tr>
    <td rowspan="5"><strong>OpenLLM v1</strong></td>
  </tr>
  <tr>
    <td>Hellaswag (10-shot)</td>
    <td>80.47</td>
    <td>80.48</td>
    <td>100.0%</td>
  </tr>
  <tr>
    <td>Winogrande (5-shot)</td>
    <td>78.06</td>
    <td>77.51</td>
    <td>99.3%</td>
  </tr>
  <tr>
    <td>TruthfulQA (0-shot, mc2)</td>
    <td>54.48</td>
    <td>54.41</td>
    <td>99.9%</td>
  </tr>
  <tr>
    <td><strong>Average</strong></td>
    <td><strong>74.05</strong></td>
    <td><strong>74.12</strong></td>
    <td><strong>100.1%</strong></td>
  </tr>
  <tr>
    <td rowspan="7"><strong>Multilingual</strong></td>
    <td>Portuguese MMLU (5-shot)</td>
    <td>59.96</td>
    <td>59.79</td>
    <td>99.8%</td>
  </tr>
  <tr>
    <td>Spanish MMLU (5-shot)</td>
    <td>60.25</td>
    <td>59.92</td>
    <td>99.4%</td>
  </tr>
  <tr>
    <td>Italian MMLU (5-shot)</td>
    <td>59.23</td>
    <td>59.25</td>
    <td>100.0%</td>
  </tr>
  <tr>
    <td>German MMLU (5-shot)</td>
    <td>58.63</td>
    <td>58.31</td>
    <td>99.5%</td>
  </tr>
  <tr>
    <td>French MMLU (5-shot)</td>
    <td>59.65</td>
    <td>59.57</td>
    <td>99.9%</td>
  </tr>
  <tr>
    <td>Hindi MMLU (5-shot)</td>
    <td>50.10</td>
    <td>49.97</td>
    <td>99.7%</td>
  </tr>
  <tr>
    <td>Thai MMLU (5-shot)</td>
    <td>49.12</td>
    <td>49.09</td>
    <td>99.9%</td>
  </tr>
</table>
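
The Recovery column appears to be the quantized model's score divided by the unquantized baseline's score, expressed as a percentage (an interpretation inferred from the figures, which it matches to within rounding; the diff itself does not define it). A quick spot-check of the Winogrande row:

```
# Winogrande: 77.51 (quantized) / 78.06 (baseline) -> prints "99.3%"
awk 'BEGIN { printf "%.1f%%\n", 77.51 / 78.06 * 100 }'
```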

@@ -307,4 +344,88 @@ lm_eval \

```
  --tasks truthfulqa \
  --num_fewshot 0 \
  --batch_size auto
```

#### MMLU Portuguese
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_pt_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Spanish
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_es_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Italian
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_it_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU German
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_de_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU French
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_fr_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Hindi
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_hi_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```

#### MMLU Thai
```
lm_eval \
  --model vllm \
  --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
  --tasks mmlu_th_llama_3.1_instruct \
  --fewshot_as_multiturn \
  --apply_chat_template \
  --num_fewshot 5 \
  --batch_size auto
```
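
The seven multilingual commands are identical except for the language code embedded in the task name. A minimal shell loop (a convenience sketch, not part of the card; it assumes `lm_eval` with the vLLM backend and the `mmlu_*_llama_3.1_instruct` tasks from the evaluation fork above are installed) reproduces all of them:

```
# Run every multilingual MMLU evaluation with the shared settings above.
for lang in pt es it de fr hi th; do
  lm_eval \
    --model vllm \
    --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
    --tasks "mmlu_${lang}_llama_3.1_instruct" \
    --fewshot_as_multiturn \
    --apply_chat_template \
    --num_fewshot 5 \
    --batch_size auto
done
```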