jahidhasan committed on Jun 29

Commit

8d03952

verified ·

1 Parent(s): ad3f789

Upload OS Reasoning model

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +107 -0
added_tokens.json +8 -0
checkpoint-1500/added_tokens.json +8 -0
checkpoint-1500/config.json +45 -0
checkpoint-1500/generation_config.json +6 -0
checkpoint-1500/merges.txt +0 -0
checkpoint-1500/model.safetensors +3 -0
checkpoint-1500/optimizer.pt +3 -0
checkpoint-1500/rng_state.pth +3 -0
checkpoint-1500/scheduler.pt +3 -0
checkpoint-1500/special_tokens_map.json +50 -0
checkpoint-1500/tokenizer.json +0 -0
checkpoint-1500/tokenizer_config.json +77 -0
checkpoint-1500/trainer_state.json +244 -0
checkpoint-1500/training_args.bin +3 -0
checkpoint-1500/vocab.json +0 -0
checkpoint-2000/added_tokens.json +8 -0
checkpoint-2000/config.json +45 -0
checkpoint-2000/generation_config.json +6 -0
checkpoint-2000/merges.txt +0 -0
checkpoint-2000/model.safetensors +3 -0
checkpoint-2000/optimizer.pt +3 -0
checkpoint-2000/rng_state.pth +3 -0
checkpoint-2000/scheduler.pt +3 -0
checkpoint-2000/special_tokens_map.json +50 -0
checkpoint-2000/tokenizer.json +0 -0
checkpoint-2000/tokenizer_config.json +77 -0
checkpoint-2000/trainer_state.json +314 -0
checkpoint-2000/training_args.bin +3 -0
checkpoint-2000/vocab.json +0 -0
checkpoint-2100/added_tokens.json +8 -0
checkpoint-2100/config.json +45 -0
checkpoint-2100/generation_config.json +6 -0
checkpoint-2100/merges.txt +0 -0
checkpoint-2100/model.safetensors +3 -0
checkpoint-2100/optimizer.pt +3 -0
checkpoint-2100/rng_state.pth +3 -0
checkpoint-2100/scheduler.pt +3 -0
checkpoint-2100/special_tokens_map.json +50 -0
checkpoint-2100/tokenizer.json +0 -0
checkpoint-2100/tokenizer_config.json +77 -0
checkpoint-2100/trainer_state.json +328 -0
checkpoint-2100/training_args.bin +3 -0
checkpoint-2100/vocab.json +0 -0
config.json +45 -0
generation_config.json +6 -0
merges.txt +0 -0
model.safetensors +3 -0
special_tokens_map.json +50 -0
tokenizer.json +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,107 @@

+---
+language: en
+tags:
+- operating-systems
+- reasoning
+- education
+- computer-science
+datasets:
+- custom
+metrics:
+- accuracy
+widget:
+- text: "Question: What is a process in operating systems? Reasoning:"
+  example_title: "Process Explanation"
+- text: "Question: How does virtual memory work? Reasoning:"
+  example_title: "Virtual Memory"
+---
+# Operating System Reasoning Model
+## Model Description
+This model is specifically fine-tuned for reasoning about Operating Systems concepts. It can:
+- Explain OS concepts with step-by-step reasoning
+- Solve OS-related problems
+- Compare different OS mechanisms
+- Provide educational explanations for students
+## Training Data
+The model was trained on content from multiple authoritative Operating Systems textbooks and resources:
+- **OSTEP (Operating Systems: Three Easy Pieces)** - 0 chapters
+- **xv6 Documentation** - System implementation details
+- **Academic OS Resources** - Additional educational content
+Total training examples: 3354
+## Usage
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+tokenizer = AutoTokenizer.from_pretrained("jahidhasan/os-reasoning-model")
+model = AutoModelForCausalLM.from_pretrained("jahidhasan/os-reasoning-model")
+# Generate reasoning
+question = "What is a deadlock in operating systems?"
+prompt = f"Question: {question}\nReasoning:"
+inputs = tokenizer(prompt, return_tensors="pt")
+outputs = model.generate(**inputs, max_length=200, temperature=0.7)
+response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(response)
+```
+## Model Architecture
+- **Base Model**: distilbert/distilgpt2
+- **Parameters**: 81,917,184
+- **Fine-tuning**: Specialized for OS domain reasoning
+## Performance
+The model demonstrates strong performance on:
+- Concept explanation tasks
+- Problem-solving scenarios
+- Comparative analysis
+- Educational Q&A
+## Limitations
+- Focused specifically on Operating Systems domain
+- May not perform well on general reasoning tasks
+- Requires clear, structured questions for best results
+## Citation
+```bibtex
+@misc{os-reasoning-model,
+  author = {Jahid Hasan},
+  title = {Operating System Reasoning Model},
+  year = {2025},
+  publisher = {Hugging Face},
+  howpublished = {\url{https://huggingface.co/jahidhasan/os-reasoning-model}}
+}
+```
+## Training Details
+- **Training Epochs**: 5
+- **Learning Rate**: 3e-5
+- **Batch Size**: 16
+- **Training Time**: Unknown
+## Educational Use
+This model is particularly useful for:
+- Computer Science students learning OS concepts
+- Educators creating OS curriculum
+- Self-study and review sessions
+- Assignment and project assistance
+---
+*Trained with ❤️ for OS education*

added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "[/ANSWER]": 50262,
+  "[/OS_CONCEPT]": 50258,
+  "[/REASONING]": 50260,
+  "[ANSWER]": 50261,
+  "[OS_CONCEPT]": 50257,
+  "[REASONING]": 50259
+}

checkpoint-1500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "[/ANSWER]": 50262,
+  "[/OS_CONCEPT]": 50258,
+  "[/REASONING]": 50260,
+  "[ANSWER]": 50261,
+  "[OS_CONCEPT]": 50257,
+  "[REASONING]": 50259
+}

checkpoint-1500/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0",
+  "use_cache": true,
+  "vocab_size": 50263
+}

checkpoint-1500/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.0"
+}

checkpoint-1500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9737f4c51743981920356dc5d6ba50018754a9ee7db8fd946f658a5d23206d
+size 327676360

checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b1f54cc2ca8d601f9985395c8998663489c1480eb7f2ed80f27d1d9181ec306
+size 655401338

checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ca045602373bd80718d033f5ec56475d8a0d328f986673ad9e00da790d9601d
+size 14244

checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48cda1885c98b0cead5fdea48b78a1926a5cdc27a238e5f4d6bb6fc551ae46d9
+size 1064

checkpoint-1500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-1500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50258": {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50259": {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50260": {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50261": {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50262": {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "[OS_CONCEPT]",
+    "[/OS_CONCEPT]",
+    "[REASONING]",
+    "[/REASONING]",
+    "[ANSWER]",
+    "[/ANSWER]"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,244 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.5724508050089447,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.11926058437686345,
+      "grad_norm": 8.05271053314209,
+      "learning_rate": 7.35e-06,
+      "loss": 5.4953,
+      "step": 50
+    },
+    {
+      "epoch": 0.2385211687537269,
+      "grad_norm": 2.7420473098754883,
+      "learning_rate": 1.485e-05,
+      "loss": 1.2318,
+      "step": 100
+    },
+    {
+      "epoch": 0.35778175313059035,
+      "grad_norm": 1.7430616617202759,
+      "learning_rate": 2.235e-05,
+      "loss": 0.7734,
+      "step": 150
+    },
+    {
+      "epoch": 0.4770423375074538,
+      "grad_norm": 1.5376101732254028,
+      "learning_rate": 2.985e-05,
+      "loss": 0.6646,
+      "step": 200
+    },
+    {
+      "epoch": 0.5963029218843172,
+      "grad_norm": 1.6270289421081543,
+      "learning_rate": 2.9226315789473687e-05,
+      "loss": 0.6334,
+      "step": 250
+    },
+    {
+      "epoch": 0.7155635062611807,
+      "grad_norm": 1.8452305793762207,
+      "learning_rate": 2.8436842105263156e-05,
+      "loss": 0.6103,
+      "step": 300
+    },
+    {
+      "epoch": 0.8348240906380441,
+      "grad_norm": 1.8059115409851074,
+      "learning_rate": 2.7647368421052632e-05,
+      "loss": 0.6186,
+      "step": 350
+    },
+    {
+      "epoch": 0.9540846750149076,
+      "grad_norm": 1.6886439323425293,
+      "learning_rate": 2.6857894736842105e-05,
+      "loss": 0.6045,
+      "step": 400
+    },
+    {
+      "epoch": 1.071556350626118,
+      "grad_norm": 1.4987449645996094,
+      "learning_rate": 2.6068421052631578e-05,
+      "loss": 0.5941,
+      "step": 450
+    },
+    {
+      "epoch": 1.1908169350029816,
+      "grad_norm": 1.5386848449707031,
+      "learning_rate": 2.527894736842105e-05,
+      "loss": 0.5678,
+      "step": 500
+    },
+    {
+      "epoch": 1.310077519379845,
+      "grad_norm": 1.5747556686401367,
+      "learning_rate": 2.4489473684210527e-05,
+      "loss": 0.5671,
+      "step": 550
+    },
+    {
+      "epoch": 1.4293381037567083,
+      "grad_norm": 1.5854023694992065,
+      "learning_rate": 2.37e-05,
+      "loss": 0.5376,
+      "step": 600
+    },
+    {
+      "epoch": 1.5485986881335718,
+      "grad_norm": 1.4520505666732788,
+      "learning_rate": 2.2910526315789473e-05,
+      "loss": 0.5632,
+      "step": 650
+    },
+    {
+      "epoch": 1.6678592725104353,
+      "grad_norm": 1.4415792226791382,
+      "learning_rate": 2.2121052631578946e-05,
+      "loss": 0.5629,
+      "step": 700
+    },
+    {
+      "epoch": 1.7871198568872988,
+      "grad_norm": 1.4224036931991577,
+      "learning_rate": 2.1331578947368422e-05,
+      "loss": 0.5647,
+      "step": 750
+    },
+    {
+      "epoch": 1.906380441264162,
+      "grad_norm": 1.4895819425582886,
+      "learning_rate": 2.0542105263157895e-05,
+      "loss": 0.54,
+      "step": 800
+    },
+    {
+      "epoch": 2.0238521168753727,
+      "grad_norm": 1.5508358478546143,
+      "learning_rate": 1.9752631578947368e-05,
+      "loss": 0.5695,
+      "step": 850
+    },
+    {
+      "epoch": 2.143112701252236,
+      "grad_norm": 1.5374252796173096,
+      "learning_rate": 1.896315789473684e-05,
+      "loss": 0.5228,
+      "step": 900
+    },
+    {
+      "epoch": 2.2623732856290997,
+      "grad_norm": 1.639708161354065,
+      "learning_rate": 1.8173684210526317e-05,
+      "loss": 0.5512,
+      "step": 950
+    },
+    {
+      "epoch": 2.381633870005963,
+      "grad_norm": 1.6390520334243774,
+      "learning_rate": 1.738421052631579e-05,
+      "loss": 0.5252,
+      "step": 1000
+    },
+    {
+      "epoch": 2.500894454382826,
+      "grad_norm": 1.4625619649887085,
+      "learning_rate": 1.6594736842105263e-05,
+      "loss": 0.5339,
+      "step": 1050
+    },
+    {
+      "epoch": 2.62015503875969,
+      "grad_norm": 1.3197258710861206,
+      "learning_rate": 1.5805263157894735e-05,
+      "loss": 0.5479,
+      "step": 1100
+    },
+    {
+      "epoch": 2.739415623136553,
+      "grad_norm": 1.4443845748901367,
+      "learning_rate": 1.5015789473684212e-05,
+      "loss": 0.5134,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8586762075134167,
+      "grad_norm": 1.7365626096725464,
+      "learning_rate": 1.4226315789473685e-05,
+      "loss": 0.5253,
+      "step": 1200
+    },
+    {
+      "epoch": 2.97793679189028,
+      "grad_norm": 1.7469673156738281,
+      "learning_rate": 1.343684210526316e-05,
+      "loss": 0.5452,
+      "step": 1250
+    },
+    {
+      "epoch": 3.095408467501491,
+      "grad_norm": 1.7984752655029297,
+      "learning_rate": 1.2647368421052632e-05,
+      "loss": 0.5178,
+      "step": 1300
+    },
+    {
+      "epoch": 3.2146690518783543,
+      "grad_norm": 1.5190192461013794,
+      "learning_rate": 1.1857894736842105e-05,
+      "loss": 0.5261,
+      "step": 1350
+    },
+    {
+      "epoch": 3.3339296362552178,
+      "grad_norm": 1.5508211851119995,
+      "learning_rate": 1.106842105263158e-05,
+      "loss": 0.5435,
+      "step": 1400
+    },
+    {
+      "epoch": 3.4531902206320813,
+      "grad_norm": 1.8733484745025635,
+      "learning_rate": 1.0278947368421052e-05,
+      "loss": 0.5001,
+      "step": 1450
+    },
+    {
+      "epoch": 3.5724508050089447,
+      "grad_norm": 1.6355196237564087,
+      "learning_rate": 9.489473684210527e-06,
+      "loss": 0.5005,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2100,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 782714416398336.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e70b6a91213c1b38da9f91a27fc85eeb5f1a0452422f750cf236f5949b7e75b0
+size 5304

checkpoint-1500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "[/ANSWER]": 50262,
+  "[/OS_CONCEPT]": 50258,
+  "[/REASONING]": 50260,
+  "[ANSWER]": 50261,
+  "[OS_CONCEPT]": 50257,
+  "[REASONING]": 50259
+}

checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0",
+  "use_cache": true,
+  "vocab_size": 50263
+}

checkpoint-2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.0"
+}

checkpoint-2000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b553b008fcb3d780475d258c5a19d7accb45dd8e0d6460927432e8cd4f4ba96
+size 327676360

checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:549bac7a1cea1d7337adac7451526fb1ca87b0df721e2eebf573b9e58e6c7e6c
+size 655401338

checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e5f761a538d3d5b7c9c6f99207b1ef11f5a408dad976547562d1ba4eabc058e
+size 14244

checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:770f1cbe19c3cd0e680868e46e72ffe46758d439e6cdc3fc2626f238cb58965b
+size 1064

checkpoint-2000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50258": {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50259": {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50260": {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50261": {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50262": {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "[OS_CONCEPT]",
+    "[/OS_CONCEPT]",
+    "[REASONING]",
+    "[/REASONING]",
+    "[ANSWER]",
+    "[/ANSWER]"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,314 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.763267740011926,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.11926058437686345,
+      "grad_norm": 8.05271053314209,
+      "learning_rate": 7.35e-06,
+      "loss": 5.4953,
+      "step": 50
+    },
+    {
+      "epoch": 0.2385211687537269,
+      "grad_norm": 2.7420473098754883,
+      "learning_rate": 1.485e-05,
+      "loss": 1.2318,
+      "step": 100
+    },
+    {
+      "epoch": 0.35778175313059035,
+      "grad_norm": 1.7430616617202759,
+      "learning_rate": 2.235e-05,
+      "loss": 0.7734,
+      "step": 150
+    },
+    {
+      "epoch": 0.4770423375074538,
+      "grad_norm": 1.5376101732254028,
+      "learning_rate": 2.985e-05,
+      "loss": 0.6646,
+      "step": 200
+    },
+    {
+      "epoch": 0.5963029218843172,
+      "grad_norm": 1.6270289421081543,
+      "learning_rate": 2.9226315789473687e-05,
+      "loss": 0.6334,
+      "step": 250
+    },
+    {
+      "epoch": 0.7155635062611807,
+      "grad_norm": 1.8452305793762207,
+      "learning_rate": 2.8436842105263156e-05,
+      "loss": 0.6103,
+      "step": 300
+    },
+    {
+      "epoch": 0.8348240906380441,
+      "grad_norm": 1.8059115409851074,
+      "learning_rate": 2.7647368421052632e-05,
+      "loss": 0.6186,
+      "step": 350
+    },
+    {
+      "epoch": 0.9540846750149076,
+      "grad_norm": 1.6886439323425293,
+      "learning_rate": 2.6857894736842105e-05,
+      "loss": 0.6045,
+      "step": 400
+    },
+    {
+      "epoch": 1.071556350626118,
+      "grad_norm": 1.4987449645996094,
+      "learning_rate": 2.6068421052631578e-05,
+      "loss": 0.5941,
+      "step": 450
+    },
+    {
+      "epoch": 1.1908169350029816,
+      "grad_norm": 1.5386848449707031,
+      "learning_rate": 2.527894736842105e-05,
+      "loss": 0.5678,
+      "step": 500
+    },
+    {
+      "epoch": 1.310077519379845,
+      "grad_norm": 1.5747556686401367,
+      "learning_rate": 2.4489473684210527e-05,
+      "loss": 0.5671,
+      "step": 550
+    },
+    {
+      "epoch": 1.4293381037567083,
+      "grad_norm": 1.5854023694992065,
+      "learning_rate": 2.37e-05,
+      "loss": 0.5376,
+      "step": 600
+    },
+    {
+      "epoch": 1.5485986881335718,
+      "grad_norm": 1.4520505666732788,
+      "learning_rate": 2.2910526315789473e-05,
+      "loss": 0.5632,
+      "step": 650
+    },
+    {
+      "epoch": 1.6678592725104353,
+      "grad_norm": 1.4415792226791382,
+      "learning_rate": 2.2121052631578946e-05,
+      "loss": 0.5629,
+      "step": 700
+    },
+    {
+      "epoch": 1.7871198568872988,
+      "grad_norm": 1.4224036931991577,
+      "learning_rate": 2.1331578947368422e-05,
+      "loss": 0.5647,
+      "step": 750
+    },
+    {
+      "epoch": 1.906380441264162,
+      "grad_norm": 1.4895819425582886,
+      "learning_rate": 2.0542105263157895e-05,
+      "loss": 0.54,
+      "step": 800
+    },
+    {
+      "epoch": 2.0238521168753727,
+      "grad_norm": 1.5508358478546143,
+      "learning_rate": 1.9752631578947368e-05,
+      "loss": 0.5695,
+      "step": 850
+    },
+    {
+      "epoch": 2.143112701252236,
+      "grad_norm": 1.5374252796173096,
+      "learning_rate": 1.896315789473684e-05,
+      "loss": 0.5228,
+      "step": 900
+    },
+    {
+      "epoch": 2.2623732856290997,
+      "grad_norm": 1.639708161354065,
+      "learning_rate": 1.8173684210526317e-05,
+      "loss": 0.5512,
+      "step": 950
+    },
+    {
+      "epoch": 2.381633870005963,
+      "grad_norm": 1.6390520334243774,
+      "learning_rate": 1.738421052631579e-05,
+      "loss": 0.5252,
+      "step": 1000
+    },
+    {
+      "epoch": 2.500894454382826,
+      "grad_norm": 1.4625619649887085,
+      "learning_rate": 1.6594736842105263e-05,
+      "loss": 0.5339,
+      "step": 1050
+    },
+    {
+      "epoch": 2.62015503875969,
+      "grad_norm": 1.3197258710861206,
+      "learning_rate": 1.5805263157894735e-05,
+      "loss": 0.5479,
+      "step": 1100
+    },
+    {
+      "epoch": 2.739415623136553,
+      "grad_norm": 1.4443845748901367,
+      "learning_rate": 1.5015789473684212e-05,
+      "loss": 0.5134,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8586762075134167,
+      "grad_norm": 1.7365626096725464,
+      "learning_rate": 1.4226315789473685e-05,
+      "loss": 0.5253,
+      "step": 1200
+    },
+    {
+      "epoch": 2.97793679189028,
+      "grad_norm": 1.7469673156738281,
+      "learning_rate": 1.343684210526316e-05,
+      "loss": 0.5452,
+      "step": 1250
+    },
+    {
+      "epoch": 3.095408467501491,
+      "grad_norm": 1.7984752655029297,
+      "learning_rate": 1.2647368421052632e-05,
+      "loss": 0.5178,
+      "step": 1300
+    },
+    {
+      "epoch": 3.2146690518783543,
+      "grad_norm": 1.5190192461013794,
+      "learning_rate": 1.1857894736842105e-05,
+      "loss": 0.5261,
+      "step": 1350
+    },
+    {
+      "epoch": 3.3339296362552178,
+      "grad_norm": 1.5508211851119995,
+      "learning_rate": 1.106842105263158e-05,
+      "loss": 0.5435,
+      "step": 1400
+    },
+    {
+      "epoch": 3.4531902206320813,
+      "grad_norm": 1.8733484745025635,
+      "learning_rate": 1.0278947368421052e-05,
+      "loss": 0.5001,
+      "step": 1450
+    },
+    {
+      "epoch": 3.5724508050089447,
+      "grad_norm": 1.6355196237564087,
+      "learning_rate": 9.489473684210527e-06,
+      "loss": 0.5005,
+      "step": 1500
+    },
+    {
+      "epoch": 3.691711389385808,
+      "grad_norm": 1.238028645515442,
+      "learning_rate": 8.7e-06,
+      "loss": 0.5164,
+      "step": 1550
+    },
+    {
+      "epoch": 3.8109719737626713,
+      "grad_norm": 1.8155537843704224,
+      "learning_rate": 7.910526315789474e-06,
+      "loss": 0.5049,
+      "step": 1600
+    },
+    {
+      "epoch": 3.9302325581395348,
+      "grad_norm": 1.6747583150863647,
+      "learning_rate": 7.121052631578948e-06,
+      "loss": 0.5099,
+      "step": 1650
+    },
+    {
+      "epoch": 4.047704233750745,
+      "grad_norm": 1.4803907871246338,
+      "learning_rate": 6.331578947368422e-06,
+      "loss": 0.5148,
+      "step": 1700
+    },
+    {
+      "epoch": 4.166964818127608,
+      "grad_norm": 1.571410059928894,
+      "learning_rate": 5.542105263157895e-06,
+      "loss": 0.5128,
+      "step": 1750
+    },
+    {
+      "epoch": 4.286225402504472,
+      "grad_norm": 1.606655478477478,
+      "learning_rate": 4.752631578947368e-06,
+      "loss": 0.5116,
+      "step": 1800
+    },
+    {
+      "epoch": 4.405485986881335,
+      "grad_norm": 1.6239967346191406,
+      "learning_rate": 3.963157894736842e-06,
+      "loss": 0.5068,
+      "step": 1850
+    },
+    {
+      "epoch": 4.524746571258199,
+      "grad_norm": 1.3790518045425415,
+      "learning_rate": 3.173684210526316e-06,
+      "loss": 0.4962,
+      "step": 1900
+    },
+    {
+      "epoch": 4.644007155635062,
+      "grad_norm": 1.2910724878311157,
+      "learning_rate": 2.38421052631579e-06,
+      "loss": 0.5172,
+      "step": 1950
+    },
+    {
+      "epoch": 4.763267740011926,
+      "grad_norm": 2.057995557785034,
+      "learning_rate": 1.5947368421052633e-06,
+      "loss": 0.5007,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2100,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1043619221864448.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e70b6a91213c1b38da9f91a27fc85eeb5f1a0452422f750cf236f5949b7e75b0
+size 5304

checkpoint-2000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2100/added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "[/ANSWER]": 50262,
+  "[/OS_CONCEPT]": 50258,
+  "[/REASONING]": 50260,
+  "[ANSWER]": 50261,
+  "[OS_CONCEPT]": 50257,
+  "[REASONING]": 50259
+}

checkpoint-2100/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0",
+  "use_cache": true,
+  "vocab_size": 50263
+}

checkpoint-2100/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.0"
+}

checkpoint-2100/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2100/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c173e96a46be22d9d0191ca38391ee5fc4d1326e59ceb072721c16bc5bbb0b98
+size 327676360

checkpoint-2100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9400d7aae5f3790d12002b9485fcc6def655b4ea4836b8fc1e60a1a1c75ecf6
+size 655401338

checkpoint-2100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6326a526ff8699f4ac51edcb6ba1aaaab54b1b963e88f0fff3fd251f6b7b78dc
+size 14244

checkpoint-2100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6483713128e4012a73332236f5e558536b9285dd2f47e8d45f1662059bbe467
+size 1064

checkpoint-2100/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2100/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2100/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,77 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50258": {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50259": {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50260": {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50261": {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50262": {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "[OS_CONCEPT]",
+    "[/OS_CONCEPT]",
+    "[REASONING]",
+    "[/REASONING]",
+    "[ANSWER]",
+    "[/ANSWER]"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

checkpoint-2100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,328 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 5.0,
+  "eval_steps": 500,
+  "global_step": 2100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.11926058437686345,
+      "grad_norm": 8.05271053314209,
+      "learning_rate": 7.35e-06,
+      "loss": 5.4953,
+      "step": 50
+    },
+    {
+      "epoch": 0.2385211687537269,
+      "grad_norm": 2.7420473098754883,
+      "learning_rate": 1.485e-05,
+      "loss": 1.2318,
+      "step": 100
+    },
+    {
+      "epoch": 0.35778175313059035,
+      "grad_norm": 1.7430616617202759,
+      "learning_rate": 2.235e-05,
+      "loss": 0.7734,
+      "step": 150
+    },
+    {
+      "epoch": 0.4770423375074538,
+      "grad_norm": 1.5376101732254028,
+      "learning_rate": 2.985e-05,
+      "loss": 0.6646,
+      "step": 200
+    },
+    {
+      "epoch": 0.5963029218843172,
+      "grad_norm": 1.6270289421081543,
+      "learning_rate": 2.9226315789473687e-05,
+      "loss": 0.6334,
+      "step": 250
+    },
+    {
+      "epoch": 0.7155635062611807,
+      "grad_norm": 1.8452305793762207,
+      "learning_rate": 2.8436842105263156e-05,
+      "loss": 0.6103,
+      "step": 300
+    },
+    {
+      "epoch": 0.8348240906380441,
+      "grad_norm": 1.8059115409851074,
+      "learning_rate": 2.7647368421052632e-05,
+      "loss": 0.6186,
+      "step": 350
+    },
+    {
+      "epoch": 0.9540846750149076,
+      "grad_norm": 1.6886439323425293,
+      "learning_rate": 2.6857894736842105e-05,
+      "loss": 0.6045,
+      "step": 400
+    },
+    {
+      "epoch": 1.071556350626118,
+      "grad_norm": 1.4987449645996094,
+      "learning_rate": 2.6068421052631578e-05,
+      "loss": 0.5941,
+      "step": 450
+    },
+    {
+      "epoch": 1.1908169350029816,
+      "grad_norm": 1.5386848449707031,
+      "learning_rate": 2.527894736842105e-05,
+      "loss": 0.5678,
+      "step": 500
+    },
+    {
+      "epoch": 1.310077519379845,
+      "grad_norm": 1.5747556686401367,
+      "learning_rate": 2.4489473684210527e-05,
+      "loss": 0.5671,
+      "step": 550
+    },
+    {
+      "epoch": 1.4293381037567083,
+      "grad_norm": 1.5854023694992065,
+      "learning_rate": 2.37e-05,
+      "loss": 0.5376,
+      "step": 600
+    },
+    {
+      "epoch": 1.5485986881335718,
+      "grad_norm": 1.4520505666732788,
+      "learning_rate": 2.2910526315789473e-05,
+      "loss": 0.5632,
+      "step": 650
+    },
+    {
+      "epoch": 1.6678592725104353,
+      "grad_norm": 1.4415792226791382,
+      "learning_rate": 2.2121052631578946e-05,
+      "loss": 0.5629,
+      "step": 700
+    },
+    {
+      "epoch": 1.7871198568872988,
+      "grad_norm": 1.4224036931991577,
+      "learning_rate": 2.1331578947368422e-05,
+      "loss": 0.5647,
+      "step": 750
+    },
+    {
+      "epoch": 1.906380441264162,
+      "grad_norm": 1.4895819425582886,
+      "learning_rate": 2.0542105263157895e-05,
+      "loss": 0.54,
+      "step": 800
+    },
+    {
+      "epoch": 2.0238521168753727,
+      "grad_norm": 1.5508358478546143,
+      "learning_rate": 1.9752631578947368e-05,
+      "loss": 0.5695,
+      "step": 850
+    },
+    {
+      "epoch": 2.143112701252236,
+      "grad_norm": 1.5374252796173096,
+      "learning_rate": 1.896315789473684e-05,
+      "loss": 0.5228,
+      "step": 900
+    },
+    {
+      "epoch": 2.2623732856290997,
+      "grad_norm": 1.639708161354065,
+      "learning_rate": 1.8173684210526317e-05,
+      "loss": 0.5512,
+      "step": 950
+    },
+    {
+      "epoch": 2.381633870005963,
+      "grad_norm": 1.6390520334243774,
+      "learning_rate": 1.738421052631579e-05,
+      "loss": 0.5252,
+      "step": 1000
+    },
+    {
+      "epoch": 2.500894454382826,
+      "grad_norm": 1.4625619649887085,
+      "learning_rate": 1.6594736842105263e-05,
+      "loss": 0.5339,
+      "step": 1050
+    },
+    {
+      "epoch": 2.62015503875969,
+      "grad_norm": 1.3197258710861206,
+      "learning_rate": 1.5805263157894735e-05,
+      "loss": 0.5479,
+      "step": 1100
+    },
+    {
+      "epoch": 2.739415623136553,
+      "grad_norm": 1.4443845748901367,
+      "learning_rate": 1.5015789473684212e-05,
+      "loss": 0.5134,
+      "step": 1150
+    },
+    {
+      "epoch": 2.8586762075134167,
+      "grad_norm": 1.7365626096725464,
+      "learning_rate": 1.4226315789473685e-05,
+      "loss": 0.5253,
+      "step": 1200
+    },
+    {
+      "epoch": 2.97793679189028,
+      "grad_norm": 1.7469673156738281,
+      "learning_rate": 1.343684210526316e-05,
+      "loss": 0.5452,
+      "step": 1250
+    },
+    {
+      "epoch": 3.095408467501491,
+      "grad_norm": 1.7984752655029297,
+      "learning_rate": 1.2647368421052632e-05,
+      "loss": 0.5178,
+      "step": 1300
+    },
+    {
+      "epoch": 3.2146690518783543,
+      "grad_norm": 1.5190192461013794,
+      "learning_rate": 1.1857894736842105e-05,
+      "loss": 0.5261,
+      "step": 1350
+    },
+    {
+      "epoch": 3.3339296362552178,
+      "grad_norm": 1.5508211851119995,
+      "learning_rate": 1.106842105263158e-05,
+      "loss": 0.5435,
+      "step": 1400
+    },
+    {
+      "epoch": 3.4531902206320813,
+      "grad_norm": 1.8733484745025635,
+      "learning_rate": 1.0278947368421052e-05,
+      "loss": 0.5001,
+      "step": 1450
+    },
+    {
+      "epoch": 3.5724508050089447,
+      "grad_norm": 1.6355196237564087,
+      "learning_rate": 9.489473684210527e-06,
+      "loss": 0.5005,
+      "step": 1500
+    },
+    {
+      "epoch": 3.691711389385808,
+      "grad_norm": 1.238028645515442,
+      "learning_rate": 8.7e-06,
+      "loss": 0.5164,
+      "step": 1550
+    },
+    {
+      "epoch": 3.8109719737626713,
+      "grad_norm": 1.8155537843704224,
+      "learning_rate": 7.910526315789474e-06,
+      "loss": 0.5049,
+      "step": 1600
+    },
+    {
+      "epoch": 3.9302325581395348,
+      "grad_norm": 1.6747583150863647,
+      "learning_rate": 7.121052631578948e-06,
+      "loss": 0.5099,
+      "step": 1650
+    },
+    {
+      "epoch": 4.047704233750745,
+      "grad_norm": 1.4803907871246338,
+      "learning_rate": 6.331578947368422e-06,
+      "loss": 0.5148,
+      "step": 1700
+    },
+    {
+      "epoch": 4.166964818127608,
+      "grad_norm": 1.571410059928894,
+      "learning_rate": 5.542105263157895e-06,
+      "loss": 0.5128,
+      "step": 1750
+    },
+    {
+      "epoch": 4.286225402504472,
+      "grad_norm": 1.606655478477478,
+      "learning_rate": 4.752631578947368e-06,
+      "loss": 0.5116,
+      "step": 1800
+    },
+    {
+      "epoch": 4.405485986881335,
+      "grad_norm": 1.6239967346191406,
+      "learning_rate": 3.963157894736842e-06,
+      "loss": 0.5068,
+      "step": 1850
+    },
+    {
+      "epoch": 4.524746571258199,
+      "grad_norm": 1.3790518045425415,
+      "learning_rate": 3.173684210526316e-06,
+      "loss": 0.4962,
+      "step": 1900
+    },
+    {
+      "epoch": 4.644007155635062,
+      "grad_norm": 1.2910724878311157,
+      "learning_rate": 2.38421052631579e-06,
+      "loss": 0.5172,
+      "step": 1950
+    },
+    {
+      "epoch": 4.763267740011926,
+      "grad_norm": 2.057995557785034,
+      "learning_rate": 1.5947368421052633e-06,
+      "loss": 0.5007,
+      "step": 2000
+    },
+    {
+      "epoch": 4.882528324388789,
+      "grad_norm": 1.6165767908096313,
+      "learning_rate": 8.052631578947369e-07,
+      "loss": 0.4959,
+      "step": 2050
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 3.790095329284668,
+      "learning_rate": 1.5789473684210525e-08,
+      "loss": 0.4978,
+      "step": 2100
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 2100,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1095486626856960.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e70b6a91213c1b38da9f91a27fc85eeb5f1a0452422f750cf236f5949b7e75b0
+size 5304

checkpoint-2100/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "_num_labels": 1,
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 6,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.53.0",
+  "use_cache": true,
+  "vocab_size": 50263
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.53.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c173e96a46be22d9d0191ca38391ee5fc4d1326e59ceb072721c16bc5bbb0b98
+size 327676360

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "[OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/OS_CONCEPT]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/REASONING]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "[/ANSWER]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "pad_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff