{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974847287100251, "eval_steps": 100, "global_step": 347, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 763.18623046875, "epoch": 0.01437297879985627, "grad_norm": 0.0765276625752449, "kl": -6.394833326339721e-06, "learning_rate": 2.8571428571428573e-06, "loss": 0.0157, "reward": 0.17431640625, "reward_std": 0.23442449774593116, "rewards/accuracy_reward": 0.08994140625, "rewards/format_reward": 0.084375, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 698.62314453125, "epoch": 0.02874595759971254, "grad_norm": 0.11506624519824982, "kl": 0.00981593132019043, "learning_rate": 5.7142857142857145e-06, "loss": 0.0584, "reward": 0.6732421875, "reward_std": 0.3674958860501647, "rewards/accuracy_reward": 0.07646484375, "rewards/format_reward": 0.59677734375, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 541.0048828125, "epoch": 0.04311893639956881, "grad_norm": 0.050542764365673065, "kl": 0.02561187744140625, "learning_rate": 8.571428571428571e-06, "loss": 0.0354, "reward": 1.036328125, "reward_std": 0.2127559134736657, "rewards/accuracy_reward": 0.102734375, "rewards/format_reward": 0.93359375, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 632.689453125, "epoch": 0.05749191519942508, "grad_norm": 0.03859843313694, "kl": 0.0311004638671875, "learning_rate": 1.1428571428571429e-05, "loss": 0.0249, "reward": 1.1552734375, "reward_std": 0.23164508808404208, "rewards/accuracy_reward": 0.20224609375, "rewards/format_reward": 0.95302734375, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 628.31357421875, "epoch": 0.07186489399928135, "grad_norm": 0.046529632061719894, "kl": 0.0368988037109375, "learning_rate": 1.4285714285714287e-05, "loss": 0.0151, "reward": 1.157421875, "reward_std": 0.20364541225135327, "rewards/accuracy_reward": 0.188671875, "rewards/format_reward": 0.96875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 671.5869140625, "epoch": 0.08623787279913762, "grad_norm": 0.037584338337183, "kl": 0.03684234619140625, "learning_rate": 1.7142857142857142e-05, "loss": 0.0213, "reward": 1.165234375, "reward_std": 0.24268896747380495, "rewards/accuracy_reward": 0.2177734375, "rewards/format_reward": 0.9474609375, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 646.013671875, "epoch": 0.1006108515989939, "grad_norm": 0.34336549043655396, "kl": 0.151519775390625, "learning_rate": 2e-05, "loss": 0.0359, "reward": 1.14091796875, "reward_std": 0.2653762998059392, "rewards/accuracy_reward": 0.2052734375, "rewards/format_reward": 0.93564453125, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 624.95908203125, "epoch": 0.11498383039885016, "grad_norm": 0.052160657942295074, "kl": 0.595263671875, "learning_rate": 1.9987329060020616e-05, "loss": 0.0668, "reward": 1.081640625, "reward_std": 0.3258050443604589, "rewards/accuracy_reward": 0.20439453125, "rewards/format_reward": 0.87724609375, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 691.025390625, "epoch": 0.12935680919870643, "grad_norm": 0.27032357454299927, "kl": 0.1677734375, "learning_rate": 1.9949348350626456e-05, "loss": 0.034, "reward": 0.9642578125, "reward_std": 0.4391048148274422, "rewards/accuracy_reward": 0.18740234375, "rewards/format_reward": 0.77685546875, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 440.34853515625, "epoch": 0.1437297879985627, "grad_norm": 0.6052369475364685, "kl": 0.800189208984375, "learning_rate": 1.9886154122075344e-05, "loss": 0.0919, "reward": 0.89814453125, "reward_std": 0.38281605690717696, "rewards/accuracy_reward": 0.11865234375, "rewards/format_reward": 0.7794921875, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 487.60771484375, "epoch": 0.15810276679841898, "grad_norm": 0.28784340620040894, "kl": 2.12225341796875, "learning_rate": 1.979790652042268e-05, "loss": 0.1039, "reward": 0.85263671875, "reward_std": 0.4635654494166374, "rewards/accuracy_reward": 0.13447265625, "rewards/format_reward": 0.7181640625, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 718.88359375, "epoch": 0.17247574559827525, "grad_norm": 0.38119208812713623, "kl": 0.38172607421875, "learning_rate": 1.9684829181681236e-05, "loss": 0.0502, "reward": 1.06494140625, "reward_std": 0.3414448471739888, "rewards/accuracy_reward": 0.21650390625, "rewards/format_reward": 0.8484375, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 621.63818359375, "epoch": 0.18684872439813152, "grad_norm": 0.3849119246006012, "kl": 1.819970703125, "learning_rate": 1.954720866508546e-05, "loss": 0.1892, "reward": 0.9689453125, "reward_std": 0.4041255243122578, "rewards/accuracy_reward": 0.16826171875, "rewards/format_reward": 0.80068359375, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 502.92744140625, "epoch": 0.2012217031979878, "grad_norm": 0.16367273032665253, "kl": 0.688922119140625, "learning_rate": 1.9385393726896492e-05, "loss": 0.0581, "reward": 1.1560546875, "reward_std": 0.22550129257142543, "rewards/accuracy_reward": 0.19248046875, "rewards/format_reward": 0.96357421875, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 563.1029296875, "epoch": 0.21559468199784404, "grad_norm": 0.1713869571685791, "kl": 0.0900238037109375, "learning_rate": 1.9199794436588244e-05, "loss": 0.0071, "reward": 1.1892578125, "reward_std": 0.2032089052721858, "rewards/accuracy_reward": 0.21513671875, "rewards/format_reward": 0.97412109375, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 622.5634765625, "epoch": 0.2299676607977003, "grad_norm": 0.2464917004108429, "kl": 0.144158935546875, "learning_rate": 1.899088113765426e-05, "loss": 0.0189, "reward": 1.1546875, "reward_std": 0.2610320156440139, "rewards/accuracy_reward": 0.21083984375, "rewards/format_reward": 0.94384765625, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 652.05927734375, "epoch": 0.24434063959755659, "grad_norm": 0.2248377948999405, "kl": 0.716436767578125, "learning_rate": 1.875918325566888e-05, "loss": 0.0578, "reward": 1.06005859375, "reward_std": 0.33321408815681935, "rewards/accuracy_reward": 0.171484375, "rewards/format_reward": 0.88857421875, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 669.3861328125, "epoch": 0.25871361839741286, "grad_norm": 0.27829509973526, "kl": 0.617529296875, "learning_rate": 1.8505287956623298e-05, "loss": 0.0585, "reward": 1.14755859375, "reward_std": 0.2751380069181323, "rewards/accuracy_reward": 0.20859375, "rewards/format_reward": 0.93896484375, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 662.1236328125, "epoch": 0.27308659719726913, "grad_norm": 0.2939702868461609, "kl": 0.5397705078125, "learning_rate": 1.8229838658936566e-05, "loss": 0.0555, "reward": 1.137890625, "reward_std": 0.2469838338904083, "rewards/accuracy_reward": 0.1900390625, "rewards/format_reward": 0.9478515625, "step": 95 }, { "epoch": 0.2874595759971254, "grad_norm": 0.1728806495666504, "learning_rate": 1.7933533402912354e-05, "loss": 0.103, "step": 100 }, { "epoch": 0.2874595759971254, "eval_clip_ratio": 0.0, "eval_completion_length": 611.2384828951579, "eval_kl": 0.50033329778157, "eval_loss": 0.06100574508309364, "eval_reward": 1.1420381825938566, "eval_reward_std": 0.27033696519433437, "eval_rewards/accuracy_reward": 0.2020051194539249, "eval_rewards/format_reward": 0.9400330631399317, "eval_runtime": 16336.0108, "eval_samples_per_second": 0.287, "eval_steps_per_second": 0.002, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 629.018017578125, "epoch": 0.3018325547969817, "grad_norm": 0.1207083985209465, "kl": 1.06016845703125, "learning_rate": 1.761712308177359e-05, "loss": 0.1074, "reward": 1.059326171875, "reward_std": 0.35213989242911337, "rewards/accuracy_reward": 0.18974609375, "rewards/format_reward": 0.869580078125, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 622.68330078125, "epoch": 0.31620553359683795, "grad_norm": 0.12369602918624878, "kl": 2.13466796875, "learning_rate": 1.7281409538757886e-05, "loss": 0.1546, "reward": 1.06484375, "reward_std": 0.3502559883520007, "rewards/accuracy_reward": 0.1806640625, "rewards/format_reward": 0.8841796875, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 619.0666015625, "epoch": 0.3305785123966942, "grad_norm": 0.13101035356521606, "kl": 0.932763671875, "learning_rate": 1.6927243535095995e-05, "loss": 0.0856, "reward": 1.14521484375, "reward_std": 0.2656426582485437, "rewards/accuracy_reward": 0.20322265625, "rewards/format_reward": 0.9419921875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 633.12802734375, "epoch": 0.3449514911965505, "grad_norm": 0.13193248212337494, "kl": 0.9656982421875, "learning_rate": 1.655552259402295e-05, "loss": 0.0881, "reward": 1.14560546875, "reward_std": 0.27462361557409165, "rewards/accuracy_reward": 0.21337890625, "rewards/format_reward": 0.9322265625, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 653.04599609375, "epoch": 0.35932446999640677, "grad_norm": 0.3534374535083771, "kl": 1.867626953125, "learning_rate": 1.6167188726285433e-05, "loss": 0.1558, "reward": 1.05126953125, "reward_std": 0.36074890177696944, "rewards/accuracy_reward": 0.18544921875, "rewards/format_reward": 0.8658203125, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 630.14736328125, "epoch": 0.37369744879626304, "grad_norm": 2.0081052780151367, "kl": 1.8935546875, "learning_rate": 1.5763226042909455e-05, "loss": 0.1105, "reward": 1.0998046875, "reward_std": 0.3096121703274548, "rewards/accuracy_reward": 0.18486328125, "rewards/format_reward": 0.91494140625, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 614.62197265625, "epoch": 0.3880704275961193, "grad_norm": 0.1118120476603508, "kl": 0.59337158203125, "learning_rate": 1.5344658261278013e-05, "loss": 0.031, "reward": 1.16611328125, "reward_std": 0.24496497269719839, "rewards/accuracy_reward": 0.21005859375, "rewards/format_reward": 0.9560546875, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 610.4009765625, "epoch": 0.4024434063959756, "grad_norm": 0.18786092102527618, "kl": 0.7201416015625, "learning_rate": 1.4912546110838775e-05, "loss": 0.0608, "reward": 1.1451171875, "reward_std": 0.2563774929381907, "rewards/accuracy_reward": 0.2021484375, "rewards/format_reward": 0.94296875, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 604.14072265625, "epoch": 0.41681638519583186, "grad_norm": 0.12442336976528168, "kl": 0.96689453125, "learning_rate": 1.4467984645016259e-05, "loss": 0.0834, "reward": 1.13984375, "reward_std": 0.2728093104436994, "rewards/accuracy_reward": 0.2001953125, "rewards/format_reward": 0.9396484375, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 612.0369140625, "epoch": 0.4311893639956881, "grad_norm": 0.17537765204906464, "kl": 0.687255859375, "learning_rate": 1.4012100466140579e-05, "loss": 0.0628, "reward": 1.12919921875, "reward_std": 0.24853361072018743, "rewards/accuracy_reward": 0.17646484375, "rewards/format_reward": 0.952734375, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 609.915234375, "epoch": 0.44556234279554435, "grad_norm": 0.11783521622419357, "kl": 0.83641357421875, "learning_rate": 1.3546048870425356e-05, "loss": 0.0734, "reward": 1.12666015625, "reward_std": 0.264958731085062, "rewards/accuracy_reward": 0.18427734375, "rewards/format_reward": 0.9423828125, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 614.70517578125, "epoch": 0.4599353215954006, "grad_norm": 0.13742466270923615, "kl": 0.7468505859375, "learning_rate": 1.3071010920229909e-05, "loss": 0.0682, "reward": 1.122265625, "reward_std": 0.2766525615006685, "rewards/accuracy_reward": 0.18798828125, "rewards/format_reward": 0.93427734375, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 625.36552734375, "epoch": 0.4743083003952569, "grad_norm": 0.4238876700401306, "kl": 1.381640625, "learning_rate": 1.2588190451025209e-05, "loss": 0.1039, "reward": 1.13544921875, "reward_std": 0.31343956142663953, "rewards/accuracy_reward": 0.2201171875, "rewards/format_reward": 0.91533203125, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 672.3779296875, "epoch": 0.48868127919511317, "grad_norm": 0.13015827536582947, "kl": 1.4199462890625, "learning_rate": 1.2098811020648475e-05, "loss": 0.0989, "reward": 1.11416015625, "reward_std": 0.3195471292361617, "rewards/accuracy_reward": 0.208203125, "rewards/format_reward": 0.90595703125, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 631.84326171875, "epoch": 0.5030542579949695, "grad_norm": 0.2257327437400818, "kl": 1.1652099609375, "learning_rate": 1.1604112808577603e-05, "loss": 0.101, "reward": 1.1236328125, "reward_std": 0.30357036273926497, "rewards/accuracy_reward": 0.211328125, "rewards/format_reward": 0.9123046875, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 622.3265625, "epoch": 0.5174272367948257, "grad_norm": 0.11806362867355347, "kl": 0.7406005859375, "learning_rate": 1.11053494730832e-05, "loss": 0.0699, "reward": 1.1373046875, "reward_std": 0.25564199751242994, "rewards/accuracy_reward": 0.19658203125, "rewards/format_reward": 0.94072265625, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 652.4876953125, "epoch": 0.531800215594682, "grad_norm": 0.12807710468769073, "kl": 0.58621826171875, "learning_rate": 1.0603784974222862e-05, "loss": 0.0587, "reward": 1.173046875, "reward_std": 0.26026681158691645, "rewards/accuracy_reward": 0.2248046875, "rewards/format_reward": 0.9482421875, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 630.433203125, "epoch": 0.5461731943945383, "grad_norm": 0.10217402130365372, "kl": 0.9344970703125, "learning_rate": 1.0100690370728756e-05, "loss": 0.0809, "reward": 1.1609375, "reward_std": 0.2667428271844983, "rewards/accuracy_reward": 0.2150390625, "rewards/format_reward": 0.9458984375, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 617.68701171875, "epoch": 0.5605461731943946, "grad_norm": 0.13498954474925995, "kl": 0.67510986328125, "learning_rate": 9.597340598905851e-06, "loss": 0.0603, "reward": 1.1654296875, "reward_std": 0.25683426298201084, "rewards/accuracy_reward": 0.21796875, "rewards/format_reward": 0.9474609375, "step": 195 }, { "epoch": 0.5749191519942508, "grad_norm": 0.1882268339395523, "learning_rate": 9.095011241703623e-06, "loss": 0.0719, "step": 200 }, { "epoch": 0.5749191519942508, "eval_clip_ratio": 0.0, "eval_completion_length": 658.4643835907503, "eval_kl": 0.7806433980375427, "eval_loss": 0.06092459335923195, "eval_reward": 1.149637372013652, "eval_reward_std": 0.27955490747409467, "eval_rewards/accuracy_reward": 0.2150170648464164, "eval_rewards/format_reward": 0.9346203071672355, "eval_runtime": 16414.3395, "eval_samples_per_second": 0.286, "eval_steps_per_second": 0.002, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 652.56220703125, "epoch": 0.589292130794107, "grad_norm": 0.1547040194272995, "kl": 0.93699951171875, "learning_rate": 8.594975296149076e-06, "loss": 0.0647, "reward": 1.1623046875, "reward_std": 0.28741056518629193, "rewards/accuracy_reward": 0.23125, "rewards/format_reward": 0.9310546875, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 674.5626953125, "epoch": 0.6036651095939634, "grad_norm": 0.25151509046554565, "kl": 0.9999267578125, "learning_rate": 8.098499947332935e-06, "loss": 0.0775, "reward": 1.1466796875, "reward_std": 0.30369703844189644, "rewards/accuracy_reward": 0.22509765625, "rewards/format_reward": 0.92158203125, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 676.977734375, "epoch": 0.6180380883938196, "grad_norm": 0.20043928921222687, "kl": 0.7748779296875, "learning_rate": 7.606843357124426e-06, "loss": 0.0573, "reward": 1.15302734375, "reward_std": 0.28829708844423296, "rewards/accuracy_reward": 0.2244140625, "rewards/format_reward": 0.92861328125, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 655.955078125, "epoch": 0.6324110671936759, "grad_norm": 0.12682239711284637, "kl": 0.7095947265625, "learning_rate": 7.12125147575254e-06, "loss": 0.0548, "reward": 1.1763671875, "reward_std": 0.25821941047906877, "rewards/accuracy_reward": 0.23046875, "rewards/format_reward": 0.9458984375, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 647.839453125, "epoch": 0.6467840459935321, "grad_norm": 0.13890360295772552, "kl": 0.63245849609375, "learning_rate": 6.6429548843339554e-06, "loss": 0.0502, "reward": 1.1654296875, "reward_std": 0.2512395134195685, "rewards/accuracy_reward": 0.21337890625, "rewards/format_reward": 0.95205078125, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 632.53623046875, "epoch": 0.6611570247933884, "grad_norm": 0.15598197281360626, "kl": 0.87559814453125, "learning_rate": 6.173165676349103e-06, "loss": 0.0703, "reward": 1.155078125, "reward_std": 0.2729664742946625, "rewards/accuracy_reward": 0.213671875, "rewards/format_reward": 0.94140625, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 641.5361328125, "epoch": 0.6755300035932447, "grad_norm": 0.15446113049983978, "kl": 0.77437744140625, "learning_rate": 5.713074385969457e-06, "loss": 0.0688, "reward": 1.16953125, "reward_std": 0.28331395238637924, "rewards/accuracy_reward": 0.2296875, "rewards/format_reward": 0.93984375, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 647.3654296875, "epoch": 0.689902982393101, "grad_norm": 0.2089157998561859, "kl": 1.21328125, "learning_rate": 5.263846971020108e-06, "loss": 0.1016, "reward": 1.116796875, "reward_std": 0.31174491699784995, "rewards/accuracy_reward": 0.2029296875, "rewards/format_reward": 0.9138671875, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 645.523828125, "epoch": 0.7042759611929572, "grad_norm": 0.16784484684467316, "kl": 0.791552734375, "learning_rate": 4.826621858223431e-06, "loss": 0.0734, "reward": 1.143359375, "reward_std": 0.28859285488724706, "rewards/accuracy_reward": 0.2154296875, "rewards/format_reward": 0.9279296875, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 619.52958984375, "epoch": 0.7186489399928135, "grad_norm": 0.1753949671983719, "kl": 0.98125, "learning_rate": 4.40250705821178e-06, "loss": 0.0812, "reward": 1.1546875, "reward_std": 0.2736880548298359, "rewards/accuracy_reward": 0.2154296875, "rewards/format_reward": 0.9392578125, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 635.1673828125, "epoch": 0.7330219187926698, "grad_norm": 0.20336733758449554, "kl": 0.55863037109375, "learning_rate": 3.99257735762021e-06, "loss": 0.0458, "reward": 1.17392578125, "reward_std": 0.23981231823563576, "rewards/accuracy_reward": 0.21728515625, "rewards/format_reward": 0.956640625, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 630.9087890625, "epoch": 0.7473948975925261, "grad_norm": 0.16080701351165771, "kl": 0.696923828125, "learning_rate": 3.5978715953751207e-06, "loss": 0.0567, "reward": 1.1685546875, "reward_std": 0.24907034020870925, "rewards/accuracy_reward": 0.21376953125, "rewards/format_reward": 0.95478515625, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 623.18583984375, "epoch": 0.7617678763923823, "grad_norm": 0.18338614702224731, "kl": 1.0648681640625, "learning_rate": 3.2193900300810908e-06, "loss": 0.0778, "reward": 1.151953125, "reward_std": 0.26931764371693134, "rewards/accuracy_reward": 0.210546875, "rewards/format_reward": 0.94140625, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 634.9572265625, "epoch": 0.7761408551922386, "grad_norm": 0.13022945821285248, "kl": 0.7796142578125, "learning_rate": 2.8580918051775542e-06, "loss": 0.065, "reward": 1.165625, "reward_std": 0.27459610607475043, "rewards/accuracy_reward": 0.2244140625, "rewards/format_reward": 0.9412109375, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 633.88232421875, "epoch": 0.7905138339920948, "grad_norm": 0.1719103306531906, "kl": 0.8088623046875, "learning_rate": 2.514892518288988e-06, "loss": 0.0696, "reward": 1.15087890625, "reward_std": 0.2822716049849987, "rewards/accuracy_reward": 0.21640625, "rewards/format_reward": 0.93447265625, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 636.853125, "epoch": 0.8048868127919512, "grad_norm": 0.21337589621543884, "kl": 0.9040283203125, "learning_rate": 2.190661900928426e-06, "loss": 0.0753, "reward": 1.1412109375, "reward_std": 0.2784146698191762, "rewards/accuracy_reward": 0.2029296875, "rewards/format_reward": 0.93828125, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 639.5638671875, "epoch": 0.8192597915918074, "grad_norm": 0.1362425535917282, "kl": 0.95645751953125, "learning_rate": 1.8862216144342692e-06, "loss": 0.0749, "reward": 1.14130859375, "reward_std": 0.2679125562310219, "rewards/accuracy_reward": 0.20546875, "rewards/format_reward": 0.93583984375, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 639.32958984375, "epoch": 0.8336327703916637, "grad_norm": 0.13494881987571716, "kl": 0.8051513671875, "learning_rate": 1.6023431677260215e-06, "loss": 0.0684, "reward": 1.16240234375, "reward_std": 0.26225354727357625, "rewards/accuracy_reward": 0.21396484375, "rewards/format_reward": 0.9484375, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 636.16083984375, "epoch": 0.8480057491915199, "grad_norm": 0.16026277840137482, "kl": 0.8697265625, "learning_rate": 1.339745962155613e-06, "loss": 0.0712, "reward": 1.15966796875, "reward_std": 0.2733839010819793, "rewards/accuracy_reward": 0.21552734375, "rewards/format_reward": 0.944140625, "step": 295 }, { "epoch": 0.8623787279913762, "grad_norm": 0.155064195394516, "learning_rate": 1.099095468409156e-06, "loss": 0.0785, "step": 300 }, { "epoch": 0.8623787279913762, "eval_clip_ratio": 0.0, "eval_completion_length": 628.6954391531569, "eval_kl": 0.8880319432593856, "eval_loss": 0.07323075085878372, "eval_reward": 1.1617160836177474, "eval_reward_std": 0.2670084892838888, "eval_rewards/accuracy_reward": 0.21819005972696245, "eval_rewards/format_reward": 0.943526023890785, "eval_runtime": 16336.911, "eval_samples_per_second": 0.287, "eval_steps_per_second": 0.002, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 636.594287109375, "epoch": 0.8767517067912325, "grad_norm": 0.1458193063735962, "kl": 0.95950927734375, "learning_rate": 8.810015400790994e-07, "loss": 0.0809, "reward": 1.16162109375, "reward_std": 0.26864673662930727, "rewards/accuracy_reward": 0.2203125, "rewards/format_reward": 0.94130859375, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 642.09326171875, "epoch": 0.8911246855910887, "grad_norm": 0.14581456780433655, "kl": 0.82933349609375, "learning_rate": 6.860168681805946e-07, "loss": 0.0661, "reward": 1.16982421875, "reward_std": 0.26240854635834693, "rewards/accuracy_reward": 0.2216796875, "rewards/format_reward": 0.94814453125, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 642.76943359375, "epoch": 0.905497664390945, "grad_norm": 0.16072359681129456, "kl": 0.80216064453125, "learning_rate": 5.146355805285452e-07, "loss": 0.0637, "reward": 1.17431640625, "reward_std": 0.2672739554196596, "rewards/accuracy_reward": 0.22734375, "rewards/format_reward": 0.94697265625, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 644.221484375, "epoch": 0.9198706431908013, "grad_norm": 0.1719951331615448, "kl": 0.84737548828125, "learning_rate": 3.6729198952483725e-07, "loss": 0.0748, "reward": 1.158203125, "reward_std": 0.2642348381690681, "rewards/accuracy_reward": 0.2169921875, "rewards/format_reward": 0.9412109375, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 649.33896484375, "epoch": 0.9342436219906576, "grad_norm": 0.20107921957969666, "kl": 0.87275390625, "learning_rate": 2.4435949152906144e-07, "loss": 0.0757, "reward": 1.15966796875, "reward_std": 0.27580115627497437, "rewards/accuracy_reward": 0.22021484375, "rewards/format_reward": 0.939453125, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 643.75712890625, "epoch": 0.9486166007905138, "grad_norm": 0.14510348439216614, "kl": 0.821826171875, "learning_rate": 1.4614962060194303e-07, "loss": 0.0658, "reward": 1.140625, "reward_std": 0.2549537133425474, "rewards/accuracy_reward": 0.1978515625, "rewards/format_reward": 0.9427734375, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 643.530078125, "epoch": 0.9629895795903701, "grad_norm": 0.14030759036540985, "kl": 0.77998046875, "learning_rate": 7.291125901946027e-08, "loss": 0.0701, "reward": 1.1693359375, "reward_std": 0.2593334957957268, "rewards/accuracy_reward": 0.22529296875, "rewards/format_reward": 0.94404296875, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 646.19736328125, "epoch": 0.9773625583902263, "grad_norm": 0.16929689049720764, "kl": 0.835546875, "learning_rate": 2.4830006558373975e-08, "loss": 0.0697, "reward": 1.162109375, "reward_std": 0.26842295806854966, "rewards/accuracy_reward": 0.2173828125, "rewards/format_reward": 0.9447265625, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 642.66806640625, "epoch": 0.9917355371900827, "grad_norm": 0.17319317162036896, "kl": 0.85125732421875, "learning_rate": 2.0277101514987184e-09, "loss": 0.0724, "reward": 1.15966796875, "reward_std": 0.2777851399034262, "rewards/accuracy_reward": 0.21982421875, "rewards/format_reward": 0.93984375, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 643.1118812561035, "epoch": 0.9974847287100251, "kl": 0.8223876953125, "reward": 1.182861328125, "reward_std": 0.2758036791346967, "rewards/accuracy_reward": 0.25390625, "rewards/format_reward": 0.928955078125, "step": 347, "total_flos": 0.0, "train_loss": 0.0704507840104852, "train_runtime": 435678.5475, "train_samples_per_second": 0.204, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 347, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }