dataset_path: ConBench/ConBench_D
dataset_kwargs:
  token: True
task: "ConBench"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.conbench_doc_to_visual
doc_to_text: !function utils.conbench_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 1024
  temperature: 0.2
  top_p: 0
  num_beams: 1
  do_sample: True
# The return value of process_results will be used by metrics
process_results: !function utils.conbench_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: ConScore_D
    aggregation: !function utils.conbench_aggregate_results
    higher_is_better: true
metadata:
  - version: 0.0