import json

roles_map = {
    'system': 'system',
    'user': 'user',
    'human': 'user',
    'assistant': 'assistant',
    'gpt': 'assistant',
    'AI': 'assistant',
    # assumption: tool/function-call result turns; some ShareGPT-style sets
    # (e.g. NousResearch/hermes-function-calling-v1) tag them as 'tool'
    'tool': 'tool',
}
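
# A minimal usage sketch (the message dict `m` is hypothetical): ShareGPT-style
# records tag each turn with a 'from' key, which roles_map normalizes to a
# standard chat role:
#   m = {'from': 'human', 'value': 'Hi'}
#   {'role': roles_map[m['from']], 'content': m['value']}
#   # -> {'role': 'user', 'content': 'Hi'}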

pretrain_instruct_datasets = [
    #
    # general instructs
    #
    # 138 MB, 205,568
    {'kind': 'instruct', 'path': 'CohereForAI/aya_dataset', 'transform': lambda r: [
        {'role': 'user', 'content': r['inputs']},
        {'role': 'assistant', 'content': r['targets']},
    ]},
    # ~3 GB, 4,976,850
    # NOTE: the data_dir strings below look odd but appear to match the
    # upstream repo's directory names verbatim, including the space before
    # the slash and the 'multilinugal' spelling.
    *[
        {'kind': 'instruct', 'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'transform': lambda r: [
            {'role': 'system', 'content': r['instruction']},
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['output']},
        ]}
        for name in [
            'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
            'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
        ]
    ],
    # 1.48 GB, 1,420,909
    # mlabonne/open-perfectblend
    #   meta-math/MetaMathQA 395,000
    #   openbmb/UltraInteract_sft 288,579
    #   HuggingFaceH4/ultrachat_200k 207,865
    #   microsoft/orca-math-word-problems-200k 200,035
    #   HuggingFaceH4/ultrafeedback_binarized 187,405
    #   theblackcat102/evol-codealpaca-v1 111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
    *[
        {'kind': 'instruct', 'path': 'mlabonne/open-perfectblend', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ]}
        for i in range(0, 100, 10)
    ],
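
    # The f'train[{i}%:{i + 10}%]' splits above and below use the Hugging Face
    # `datasets` split-slicing syntax; for i in range(0, 100, 10) each
    # comprehension expands to ten 10% shards,
    #   'train[0%:10%]', 'train[10%:20%]', ..., 'train[90%:100%]',
    # so a large dataset can be loaded and processed piecewise.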
    # 4.58 GB, 1,752,473
    # arcee-ai/The-Tome
    # - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
    # - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
    # - jondurbin/airoboros-3.2
    # - gardner/glaive-function-calling-v2-sharegpt
    # - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
    # - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
    # - cognitivecomputations/ultrainteract_trajectories_sharegpt
    # - cognitivecomputations/SystemChat-2.0
    # - arcee-ai/qwen2-72b-magpie-en
    *[
        {'kind': 'instruct', 'path': 'arcee-ai/The-Tome', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ]}
        for i in range(0, 100, 10)
    ],
    # 2.48 GB, 5,808,694
    # rombodawg/Everything_Instruct_Multilingual
    # Science:
    #   antiven0m/physical-reasoning-dpo
    #   LawalAfeez/science-dataset
    # Social media:
    #   Kyle1668/AG-Tweets
    #   euclaise/reddit-instruct-curated
    # General Knowledge:
    #   NousResearch/CharacterCodex_Characters
    #   jstet/quotes-500k
    #   FronkonGames/steam-games-dataset
    #   totuta/youtube_subs_howto100M
    # Multi-lingual:
    #   Amani27/massive_translation_dataset
    #   udmurtNLP/udmurt-russian-english-labse
    #   grosenthal/latin_english
    #   msarmi9/korean-english-multitarget-ted-talks-task
    #   HaiderSultanArc/MT-Urdu-English
    #   Garsa3112/ChineseEnglishTranslationDataset
    # Cooking:
    #   andrewsiah/se_cooking_preference_sft
    #   Hieu-Pham/kaggle_food_recipes
    # Writing:
    #   shahules786/PoetryFoundationData
    #   euclaise/writingprompts
    #   qwedsacf/ivypanda-essays
    # Medicine:
    #   keivalya/MedQuad-MedicalQnADataset
    #   nuvocare/MSD
    # History:
    #   ambrosfitz10k/history_data_v4
    # Law:
    #   dzunggg/legal-qa-v1
    # Role-Play:
    #   roleplay4fun/CoupleRP
    #   andrijdavid/roleplay-conversation-sharegpt
    # News:
    #   RealTimeData/bbc_news_alltime
    # Coding: (rombodawg/code_bagel)
    #   layoric/tiny-codes-alpaca
    #   glaiveai/glaive-code-assistant-v3
    #   ajibawa-2023/Code-290k-ShareGPT
    #   chargoddard/commitpack-ft-instruct-rated
    #   iamtarun/code_instructions_120k_alpaca
    #   ise-uiuc/Magicoder-Evol-Instruct-110K
    #   cognitivecomputations/dolphin-coder
    #   nickrosh/Evol-Instruct-Code-80k-v1
    #   coseal/CodeUltraFeedback_binarized
    #   CyberNative/Code_Vulnerability_Security_DPO
    # Math: (rombodawg/code_bagel)
    #   TIGER-Lab/MathInstruct
    # Function calling: (rombodawg/code_bagel)
    #   glaiveai/glaive-function-calling-v2
    # General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #   teknium/OpenHermes-2.5
    *[
        {'kind': 'instruct', 'path': 'rombodawg/Everything_Instruct_Multilingual', 'split': f'train[{i}%:{i + 10}%]', 'transform': lambda r: [
            {'role': 'system', 'content': r['instruction']},
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['output']},
        ]}
        for i in range(0, 100, 10)
    ],
    # 1.41 GB, 939,343
    # allenai/tulu-3-sft-mixture
    #   CoCoNot (ODC-BY-1.0), 10,983 prompts (Brahman et al., 2024)
    #   FLAN v2 via ai2-adapt-dev/flan_v2_converted, 89,982 prompts (Longpre et al., 2023)
    #   No Robots (CC-BY-NC-4.0), 9,500 prompts (Rajani et al., 2023)
    #   OpenAssistant Guanaco (Apache 2.0), 7,132 prompts (Köpf et al., 2024)
    #   Tulu 3 Persona MATH (ODC-BY-1.0), 149,960 prompts
    #   Tulu 3 Persona GSM (ODC-BY-1.0), 49,980 prompts
    #   Tulu 3 Persona Python (ODC-BY-1.0), 34,999 prompts
    #   Tulu 3 Persona Algebra (ODC-BY-1.0), 20,000 prompts
    #   Tulu 3 Persona IF (ODC-BY-1.0), 29,980 prompts
    #   NuminaMath-TIR (Apache 2.0), 64,312 prompts (Beeching et al., 2024)
    #   Tulu 3 WildGuardMix (Apache 2.0), 50,000 prompts (Han et al., 2024)
    #   Tulu 3 WildJailbreak (ODC-BY-1.0), 50,000 prompts (Jiang et al., 2024)
    #   Tulu 3 Hardcoded (CC-BY-4.0), 240 prompts
    #   Aya (Apache 2.0), 100,000 prompts (Singh et al., 2024)
    #   WildChat GPT-4 (ODC-BY-1.0), 100,000 prompts (Zhao et al., 2024)
    #   TableGPT (MIT), 5,000 prompts (Zha et al., 2023)
    #   SciRIFF (ODC-BY-1.0), 10,000 prompts (Wadden et al., 2024)
    #   Evol CodeAlpaca (Apache 2.0), 107,276 prompts (Luo et al., 2023)
    *[
        {'kind': 'instruct', 'path': 'allenai/tulu-3-sft-mixture', 'split': f'train[{i}%:{i + 10}%]', 'field': 'messages'}
        for i in range(0, 100, 10)
    ],
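
    # Entries that specify a 'field' but no 'transform' (tulu-3 above, the
    # 'messages' datasets below) are assumed to already store chat-format
    # {'role': ..., 'content': ...} message lists, so they are used as-is.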
    #
    # tool/function calling
    #
    # 65.7 MB, 11,578
    {'kind': 'instruct', 'path': 'NousResearch/hermes-function-calling-v1', 'field': 'conversations', 'transform': lambda msgs: [
        {'role': roles_map[m['from']], 'content': m['value']}
        for m in msgs
    ]},
    #
    # agent
    #
    # 1.51 GB, 485,874
    *[
        {'kind': 'instruct', 'path': 'arcee-ai/agent-data', 'split': f'train[{i}%:{i + 10}%]', 'field': 'conversations', 'transform': lambda msgs: [
            {'role': roles_map[m['from']], 'content': m['value']}
            for m in msgs
        ]}
        for i in range(0, 100, 10)
    ],
    # 2.21 GB, 1,046,410
    # The 'messages' field is stored as a JSON-encoded string, hence json.loads;
    # 'code_' (with the trailing underscore) is the dataset's literal split name.
    *[
        {'kind': 'instruct', 'path': 'microsoft/orca-agentinstruct-1M-v1', 'split': split, 'field': 'messages', 'transform': json.loads}
        for split in [
            'creative_content', 'text_modification', 'struct2text_flow', 'rc', 'rag',
            'text_extraction', 'mcq', 'follow_up', 'analytical_reasoning', 'fermi', 'fs_cot_flow',
            'code_', 'brain_teaser', 'text_classification', 'open_domain_qa',
        ]
    ],
    #
    # general instructs
    #
    # 1.52 GB, ~214,000 rows (the nonreasoning file; full dataset: 3.98 GB, 814,334)
    {'kind': 'instruct', 'path': 'cognitivecomputations/dolphin-r1', 'data_files': 'dolphin-r1-nonreasoning.jsonl', 'split': 'train', 'field': 'messages'},
    # 4.15 GB, 2,197,730
    {'kind': 'instruct', 'path': 'HuggingFaceTB/smoltalk', 'name': 'all', 'field': 'messages'},
]
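

# A minimal sketch of how these entries might be consumed (an assumption about
# the surrounding pipeline, not part of this config): each entry is passed to
# Hugging Face `datasets.load_dataset`, the optional 'field' column is pulled
# out, and the optional 'transform' normalizes rows into chat messages. The
# helper name `iter_conversations` is hypothetical.
from datasets import load_dataset


def iter_conversations(cfg):
    dataset = load_dataset(
        cfg['path'],
        name=cfg.get('name'),
        data_dir=cfg.get('data_dir'),
        data_files=cfg.get('data_files'),
        split=cfg.get('split', 'train'),
    )
    field = cfg.get('field')
    transform = cfg.get('transform')
    for row in dataset:
        value = row[field] if field else row  # one column vs. the whole row
        yield transform(value) if transform else value


# Hypothetical usage:
#   for cfg in pretrain_instruct_datasets:
#       for messages in iter_conversations(cfg):
#           ...  # apply chat template, tokenize, pack into training batches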