Vardaan98 commited on
Commit
a9dd270
·
verified ·
1 Parent(s): 5d37320

Upload 23 files

Browse files
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# --- Page config (must be the first Streamlit call in the script) ---
st.set_page_config(
    page_title="Natural Reasoning Bot",
    page_icon="🤖",
    layout="centered"
)

st.title("🤖 Natural Reasoning Bot")
st.markdown("Ask science questions and get answers from your fine-tuned model.")

# --- Sidebar for generation parameters ---
st.sidebar.header("⚙️ Generation Settings")
# NOTE: temperature must stay strictly > 0 while do_sample=True, otherwise
# model.generate() raises a ValueError — so the slider floor is 0.1, not 0.0.
temperature = st.sidebar.slider("Temperature", 0.1, 1.5, 1.0, 0.1)
top_k = st.sidebar.slider("Top-k", 0, 100, 50, 5)  # 0 disables top-k filtering
top_p = st.sidebar.slider("Top-p", 0.0, 1.0, 0.95, 0.05)

# --- Load model and tokenizer (cached for the lifetime of the server) ---
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the fine-tuned model and tokenizer once, in eval mode.

    Returns:
        tuple: (model, tokenizer) loaded from ./my_bot_model.
    """
    model = AutoModelForCausalLM.from_pretrained("./my_bot_model")
    tokenizer = AutoTokenizer.from_pretrained("./my_bot_model")
    model.eval()  # inference only; set once here instead of per request
    return model, tokenizer

model, tokenizer = load_model()

# --- Text input ---
question = st.text_area("🧠 Enter your science question:", height=100)

generate_btn = st.button("🔍 Generate Answer")

# --- Inference ---
if generate_btn and question.strip():  # reject whitespace-only input too
    input_text = f"### Question: {question}\n### Answer:"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            # max_new_tokens bounds only the generated continuation;
            # the old max_length=256 also counted the prompt, so a long
            # question could silently leave no room for an answer.
            max_new_tokens=256,
            do_sample=True,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Strip the prompt by token count rather than string replacement:
    # decode() does not always reproduce the input text byte-for-byte,
    # which made `response.replace(input_text, "")` fragile.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(
        output[0][prompt_len:], skip_special_tokens=True
    ).strip()

    st.markdown("---")
    st.subheader("📤 Model Answer")
    st.success(answer)

elif generate_btn:
    st.warning("Please enter a question to get an answer.")
main.ipynb ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "5d81bb13",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from datasets import load_dataset\n",
11
+ "\n",
12
+ "dataset = load_dataset(\"facebook/natural_reasoning\")\n",
13
+ "train_data = dataset[\"train\"].select(range(5000)) # Start with 5k examples\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 3,
19
+ "id": "5279c3c3",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "def format_for_training(example):\n",
24
+ " return {\n",
25
+ " \"prompt\": example[\"question\"],\n",
26
+ " \"completion\": example[\"reference_answer\"]\n",
27
+ " }\n",
28
+ "\n",
29
+ "train_data = train_data.map(format_for_training)\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 4,
35
+ "id": "d5f715b3",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "from transformers import AutoTokenizer\n",
40
+ "\n",
41
+ "model_checkpoint = \"distilgpt2\"\n",
42
+ "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
43
+ "tokenizer.pad_token = tokenizer.eos_token\n",
44
+ "max_seq_length = 512\n",
45
+ "\n",
46
+ "def tokenize(example):\n",
47
+ " input_text = f\"### Question: {example['prompt']}\\n### Answer: {example['completion']}{tokenizer.eos_token}\"\n",
48
+ " tokenized = tokenizer(\n",
49
+ " input_text,\n",
50
+ " padding=\"max_length\",\n",
51
+ " truncation=True,\n",
52
+ " max_length=max_seq_length\n",
53
+ " )\n",
54
+ " tokenized[\"labels\"] = tokenized[\"input_ids\"].copy()\n",
55
+ " return tokenized\n",
56
+ "\n",
57
+ "tokenized_data = train_data.map(tokenize)\n"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 5,
63
+ "id": "61cb619d",
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "c:\\Users\\shukl\\anaconda3\\Lib\\site-packages\\transformers\\training_args.py:1611: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
71
+ " warnings.warn(\n",
72
+ "C:\\Users\\shukl\\AppData\\Local\\Temp\\ipykernel_7600\\3538093026.py:16: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
73
+ " trainer = Trainer(\n",
74
+ "`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.\n"
75
+ ]
76
+ },
77
+ {
78
+ "data": {
79
+ "text/html": [
80
+ "\n",
81
+ " <div>\n",
82
+ " \n",
83
+ " <progress value='2500' max='2500' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
84
+ " [2500/2500 3:07:00, Epoch 1/1]\n",
85
+ " </div>\n",
86
+ " <table border=\"1\" class=\"dataframe\">\n",
87
+ " <thead>\n",
88
+ " <tr style=\"text-align: left;\">\n",
89
+ " <th>Step</th>\n",
90
+ " <th>Training Loss</th>\n",
91
+ " </tr>\n",
92
+ " </thead>\n",
93
+ " <tbody>\n",
94
+ " <tr>\n",
95
+ " <td>500</td>\n",
96
+ " <td>0.836400</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <td>1000</td>\n",
100
+ " <td>0.629200</td>\n",
101
+ " </tr>\n",
102
+ " <tr>\n",
103
+ " <td>1500</td>\n",
104
+ " <td>0.631400</td>\n",
105
+ " </tr>\n",
106
+ " <tr>\n",
107
+ " <td>2000</td>\n",
108
+ " <td>0.622300</td>\n",
109
+ " </tr>\n",
110
+ " <tr>\n",
111
+ " <td>2500</td>\n",
112
+ " <td>0.631600</td>\n",
113
+ " </tr>\n",
114
+ " </tbody>\n",
115
+ "</table><p>"
116
+ ],
117
+ "text/plain": [
118
+ "<IPython.core.display.HTML object>"
119
+ ]
120
+ },
121
+ "metadata": {},
122
+ "output_type": "display_data"
123
+ },
124
+ {
125
+ "data": {
126
+ "text/plain": [
127
+ "('./my_bot_model\\\\tokenizer_config.json',\n",
128
+ " './my_bot_model\\\\special_tokens_map.json',\n",
129
+ " './my_bot_model\\\\vocab.json',\n",
130
+ " './my_bot_model\\\\merges.txt',\n",
131
+ " './my_bot_model\\\\added_tokens.json',\n",
132
+ " './my_bot_model\\\\tokenizer.json')"
133
+ ]
134
+ },
135
+ "execution_count": 5,
136
+ "metadata": {},
137
+ "output_type": "execute_result"
138
+ }
139
+ ],
140
+ "source": [
141
+ "from transformers import AutoModelForCausalLM, TrainingArguments, Trainer\n",
142
+ "\n",
143
+ "model = AutoModelForCausalLM.from_pretrained(model_checkpoint)\n",
144
+ "\n",
145
+ "training_args = TrainingArguments(\n",
146
+ " output_dir=\"./my_bot_model\",\n",
147
+ " evaluation_strategy=\"no\",\n",
148
+ " learning_rate=2e-5,\n",
149
+ " per_device_train_batch_size=2,\n",
150
+ " num_train_epochs=1,\n",
151
+ " save_strategy=\"epoch\",\n",
152
+ " weight_decay=0.01,\n",
153
+ " fp16=True # You said you have 4GB GPU\n",
154
+ ")\n",
155
+ "\n",
156
+ "trainer = Trainer(\n",
157
+ " model=model,\n",
158
+ " args=training_args,\n",
159
+ " train_dataset=tokenized_data,\n",
160
+ " tokenizer=tokenizer\n",
161
+ ")\n",
162
+ "\n",
163
+ "trainer.train()\n",
164
+ "model.save_pretrained(\"./my_bot_model\")\n",
165
+ "tokenizer.save_pretrained(\"./my_bot_model\")\n"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": 9,
171
+ "id": "57b71657",
172
+ "metadata": {},
173
+ "outputs": [
174
+ {
175
+ "name": "stdout",
176
+ "output_type": "stream",
177
+ "text": [
178
+ "💬 Model Answer:\n",
179
+ " The total work done on an object when it is moved upwards against gravity is approximately 3.8x faster than the total work done on an object in a vacuum.\n"
180
+ ]
181
+ }
182
+ ],
183
+ "source": [
184
+ "import torch\n",
185
+ "\n",
186
+ "model = AutoModelForCausalLM.from_pretrained(\"./my_bot_model\")\n",
187
+ "tokenizer = AutoTokenizer.from_pretrained(\"./my_bot_model\")\n",
188
+ "\n",
189
+ "question = \"What is the total work done on an object when it is moved upwards against gravity?\"\n",
190
+ "input_text = f\"### Question: {question}\\n### Answer:\"\n",
191
+ "inputs = tokenizer(input_text, return_tensors=\"pt\").to(model.device)\n",
192
+ "\n",
193
+ "with torch.no_grad():\n",
194
+ " output = model.generate(\n",
195
+ " **inputs,\n",
196
+ " max_length=256,\n",
197
+ " do_sample=True,\n",
198
+ " temperature=0.7,\n",
199
+ " top_p=0.9,\n",
200
+ " top_k=50,\n",
201
+ " pad_token_id=tokenizer.eos_token_id\n",
202
+ " )\n",
203
+ "\n",
204
+ "response = tokenizer.decode(output[0], skip_special_tokens=True)\n",
205
+ "answer = response.replace(input_text, \"\").strip()\n",
206
+ "\n",
207
+ "print(\"💬 Model Answer:\\n\", answer)\n"
208
+ ]
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "base",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.12.3"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
results/checkpoint-750/config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_num_labels": 1,
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "id2label": {
12
+ "0": "LABEL_0"
13
+ },
14
+ "initializer_range": 0.02,
15
+ "label2id": {
16
+ "LABEL_0": 0
17
+ },
18
+ "layer_norm_epsilon": 1e-05,
19
+ "model_type": "gpt2",
20
+ "n_ctx": 1024,
21
+ "n_embd": 768,
22
+ "n_head": 12,
23
+ "n_inner": null,
24
+ "n_layer": 6,
25
+ "n_positions": 1024,
26
+ "reorder_and_upcast_attn": false,
27
+ "resid_pdrop": 0.1,
28
+ "scale_attn_by_inverse_layer_idx": false,
29
+ "scale_attn_weights": true,
30
+ "summary_activation": null,
31
+ "summary_first_dropout": 0.1,
32
+ "summary_proj_to_labels": true,
33
+ "summary_type": "cls_index",
34
+ "summary_use_proj": true,
35
+ "task_specific_params": {
36
+ "text-generation": {
37
+ "do_sample": true,
38
+ "max_length": 50
39
+ }
40
+ },
41
+ "torch_dtype": "float32",
42
+ "transformers_version": "4.50.3",
43
+ "use_cache": true,
44
+ "vocab_size": 50257
45
+ }
results/checkpoint-750/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.50.3"
6
+ }
results/checkpoint-750/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
results/checkpoint-750/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfbc9137f25bbd51376fae72c48f122ed62b2dae6d98c141264134d7427c33e3
3
+ size 327657928
results/checkpoint-750/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3d83538ff9acd8bf815f0b64295eb49302ff9866c13b633b2d7708d5c4cecd
3
+ size 655362362
results/checkpoint-750/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce3140fdd7584bc4b4d20e2036b0351379fb91b849b1e1af57d80a0cad8a3d56
3
+ size 13990
results/checkpoint-750/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c70636ba82956d9bcb1e9c3edb2fca8aa8e0bd7aa847ff3d6a8cbc20d70c912a
3
+ size 1064
results/checkpoint-750/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
results/checkpoint-750/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
results/checkpoint-750/tokenizer_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "tokenizer_class": "GPT2Tokenizer",
20
+ "unk_token": "<|endoftext|>"
21
+ }
results/checkpoint-750/trainer_state.json ADDED
@@ -0,0 +1,559 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 750,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.013333333333333334,
14
+ "grad_norm": 18.803022384643555,
15
+ "learning_rate": 1.9733333333333336e-05,
16
+ "loss": 4.8444,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.02666666666666667,
21
+ "grad_norm": 4.537410259246826,
22
+ "learning_rate": 1.9466666666666668e-05,
23
+ "loss": 1.6348,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.04,
28
+ "grad_norm": 4.052369117736816,
29
+ "learning_rate": 1.9200000000000003e-05,
30
+ "loss": 1.6249,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.05333333333333334,
35
+ "grad_norm": 3.1292710304260254,
36
+ "learning_rate": 1.8933333333333334e-05,
37
+ "loss": 1.525,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.06666666666666667,
42
+ "grad_norm": 3.1493794918060303,
43
+ "learning_rate": 1.866666666666667e-05,
44
+ "loss": 1.5233,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.08,
49
+ "grad_norm": 3.0254428386688232,
50
+ "learning_rate": 1.8400000000000003e-05,
51
+ "loss": 1.5487,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.09333333333333334,
56
+ "grad_norm": 3.110171318054199,
57
+ "learning_rate": 1.8133333333333335e-05,
58
+ "loss": 1.3042,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.10666666666666667,
63
+ "grad_norm": 3.023773431777954,
64
+ "learning_rate": 1.7866666666666666e-05,
65
+ "loss": 1.4056,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.12,
70
+ "grad_norm": 5.209704875946045,
71
+ "learning_rate": 1.76e-05,
72
+ "loss": 1.4335,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.13333333333333333,
77
+ "grad_norm": 2.825587034225464,
78
+ "learning_rate": 1.7333333333333336e-05,
79
+ "loss": 1.5363,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.14666666666666667,
84
+ "grad_norm": 3.2794153690338135,
85
+ "learning_rate": 1.706666666666667e-05,
86
+ "loss": 1.3614,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.16,
91
+ "grad_norm": 3.4573426246643066,
92
+ "learning_rate": 1.6800000000000002e-05,
93
+ "loss": 1.3278,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.17333333333333334,
98
+ "grad_norm": 3.3406155109405518,
99
+ "learning_rate": 1.6533333333333333e-05,
100
+ "loss": 1.2889,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.18666666666666668,
105
+ "grad_norm": 4.201858997344971,
106
+ "learning_rate": 1.6266666666666668e-05,
107
+ "loss": 1.3344,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.2,
112
+ "grad_norm": 2.768216848373413,
113
+ "learning_rate": 1.6000000000000003e-05,
114
+ "loss": 1.2979,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.21333333333333335,
119
+ "grad_norm": 3.158536911010742,
120
+ "learning_rate": 1.5733333333333334e-05,
121
+ "loss": 1.3151,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.22666666666666666,
126
+ "grad_norm": 2.6460344791412354,
127
+ "learning_rate": 1.546666666666667e-05,
128
+ "loss": 1.4425,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.24,
133
+ "grad_norm": 3.4400217533111572,
134
+ "learning_rate": 1.5200000000000002e-05,
135
+ "loss": 1.4769,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.25333333333333335,
140
+ "grad_norm": 3.023303985595703,
141
+ "learning_rate": 1.4933333333333335e-05,
142
+ "loss": 1.3712,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.26666666666666666,
147
+ "grad_norm": 3.0173838138580322,
148
+ "learning_rate": 1.4666666666666666e-05,
149
+ "loss": 1.4047,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.28,
154
+ "grad_norm": 2.7846803665161133,
155
+ "learning_rate": 1.4400000000000001e-05,
156
+ "loss": 1.264,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.29333333333333333,
161
+ "grad_norm": 3.157430648803711,
162
+ "learning_rate": 1.4133333333333334e-05,
163
+ "loss": 1.2673,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.30666666666666664,
168
+ "grad_norm": 3.6202878952026367,
169
+ "learning_rate": 1.3866666666666669e-05,
170
+ "loss": 1.2785,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.32,
175
+ "grad_norm": 3.181349515914917,
176
+ "learning_rate": 1.3600000000000002e-05,
177
+ "loss": 1.3585,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.3333333333333333,
182
+ "grad_norm": 3.8837106227874756,
183
+ "learning_rate": 1.3333333333333333e-05,
184
+ "loss": 1.4094,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.3466666666666667,
189
+ "grad_norm": 2.497514247894287,
190
+ "learning_rate": 1.3066666666666668e-05,
191
+ "loss": 1.248,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.36,
196
+ "grad_norm": 2.697995662689209,
197
+ "learning_rate": 1.2800000000000001e-05,
198
+ "loss": 1.2251,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.37333333333333335,
203
+ "grad_norm": 4.157288074493408,
204
+ "learning_rate": 1.2533333333333336e-05,
205
+ "loss": 1.4987,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.38666666666666666,
210
+ "grad_norm": 2.739837408065796,
211
+ "learning_rate": 1.2266666666666667e-05,
212
+ "loss": 1.4039,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.4,
217
+ "grad_norm": 3.4221503734588623,
218
+ "learning_rate": 1.2e-05,
219
+ "loss": 1.33,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.41333333333333333,
224
+ "grad_norm": 3.524386405944824,
225
+ "learning_rate": 1.1733333333333335e-05,
226
+ "loss": 1.4137,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.4266666666666667,
231
+ "grad_norm": 3.046319007873535,
232
+ "learning_rate": 1.1466666666666668e-05,
233
+ "loss": 1.3476,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.44,
238
+ "grad_norm": 2.374499559402466,
239
+ "learning_rate": 1.1200000000000001e-05,
240
+ "loss": 1.1561,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.4533333333333333,
245
+ "grad_norm": 3.037949323654175,
246
+ "learning_rate": 1.0933333333333334e-05,
247
+ "loss": 1.3312,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.4666666666666667,
252
+ "grad_norm": 3.216047525405884,
253
+ "learning_rate": 1.0666666666666667e-05,
254
+ "loss": 1.2017,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.48,
259
+ "grad_norm": 3.007854461669922,
260
+ "learning_rate": 1.04e-05,
261
+ "loss": 1.2893,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.49333333333333335,
266
+ "grad_norm": 3.648378849029541,
267
+ "learning_rate": 1.0133333333333335e-05,
268
+ "loss": 1.3693,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.5066666666666667,
273
+ "grad_norm": 3.0441296100616455,
274
+ "learning_rate": 9.866666666666668e-06,
275
+ "loss": 1.2724,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.52,
280
+ "grad_norm": 3.027775526046753,
281
+ "learning_rate": 9.600000000000001e-06,
282
+ "loss": 1.2175,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.5333333333333333,
287
+ "grad_norm": 3.1908812522888184,
288
+ "learning_rate": 9.333333333333334e-06,
289
+ "loss": 1.3371,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.5466666666666666,
294
+ "grad_norm": 3.432631731033325,
295
+ "learning_rate": 9.066666666666667e-06,
296
+ "loss": 1.2195,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.56,
301
+ "grad_norm": 3.2034451961517334,
302
+ "learning_rate": 8.8e-06,
303
+ "loss": 1.2541,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.5733333333333334,
308
+ "grad_norm": 3.1531622409820557,
309
+ "learning_rate": 8.533333333333335e-06,
310
+ "loss": 1.3046,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.5866666666666667,
315
+ "grad_norm": 2.9354567527770996,
316
+ "learning_rate": 8.266666666666667e-06,
317
+ "loss": 1.2941,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.6,
322
+ "grad_norm": 3.434643507003784,
323
+ "learning_rate": 8.000000000000001e-06,
324
+ "loss": 1.3223,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.6133333333333333,
329
+ "grad_norm": 2.433544158935547,
330
+ "learning_rate": 7.733333333333334e-06,
331
+ "loss": 1.2449,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.6266666666666667,
336
+ "grad_norm": 2.9112284183502197,
337
+ "learning_rate": 7.4666666666666675e-06,
338
+ "loss": 1.2879,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.64,
343
+ "grad_norm": 3.082655668258667,
344
+ "learning_rate": 7.2000000000000005e-06,
345
+ "loss": 1.2657,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.6533333333333333,
350
+ "grad_norm": 3.0584707260131836,
351
+ "learning_rate": 6.9333333333333344e-06,
352
+ "loss": 1.2012,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.6666666666666666,
357
+ "grad_norm": 2.8814918994903564,
358
+ "learning_rate": 6.666666666666667e-06,
359
+ "loss": 1.35,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.68,
364
+ "grad_norm": 3.5102522373199463,
365
+ "learning_rate": 6.4000000000000006e-06,
366
+ "loss": 1.2537,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.6933333333333334,
371
+ "grad_norm": 3.4719033241271973,
372
+ "learning_rate": 6.133333333333334e-06,
373
+ "loss": 1.407,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.7066666666666667,
378
+ "grad_norm": 3.265688896179199,
379
+ "learning_rate": 5.8666666666666675e-06,
380
+ "loss": 1.2373,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.72,
385
+ "grad_norm": 3.8343162536621094,
386
+ "learning_rate": 5.600000000000001e-06,
387
+ "loss": 1.3966,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.7333333333333333,
392
+ "grad_norm": 3.5571675300598145,
393
+ "learning_rate": 5.333333333333334e-06,
394
+ "loss": 1.263,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.7466666666666667,
399
+ "grad_norm": 3.040654182434082,
400
+ "learning_rate": 5.0666666666666676e-06,
401
+ "loss": 1.2993,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.76,
406
+ "grad_norm": 3.5149991512298584,
407
+ "learning_rate": 4.800000000000001e-06,
408
+ "loss": 1.3882,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.7733333333333333,
413
+ "grad_norm": 3.1839466094970703,
414
+ "learning_rate": 4.533333333333334e-06,
415
+ "loss": 1.4662,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.7866666666666666,
420
+ "grad_norm": 3.4513893127441406,
421
+ "learning_rate": 4.266666666666668e-06,
422
+ "loss": 1.2339,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.8,
427
+ "grad_norm": 3.415241003036499,
428
+ "learning_rate": 4.000000000000001e-06,
429
+ "loss": 1.4015,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.8133333333333334,
434
+ "grad_norm": 3.349771738052368,
435
+ "learning_rate": 3.7333333333333337e-06,
436
+ "loss": 1.3701,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.8266666666666667,
441
+ "grad_norm": 3.491492748260498,
442
+ "learning_rate": 3.4666666666666672e-06,
443
+ "loss": 1.2594,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.84,
448
+ "grad_norm": 2.971444606781006,
449
+ "learning_rate": 3.2000000000000003e-06,
450
+ "loss": 1.2519,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.8533333333333334,
455
+ "grad_norm": 3.3128817081451416,
456
+ "learning_rate": 2.9333333333333338e-06,
457
+ "loss": 1.2691,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.8666666666666667,
462
+ "grad_norm": 2.9615046977996826,
463
+ "learning_rate": 2.666666666666667e-06,
464
+ "loss": 1.1949,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.88,
469
+ "grad_norm": 2.7413668632507324,
470
+ "learning_rate": 2.4000000000000003e-06,
471
+ "loss": 1.3438,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.8933333333333333,
476
+ "grad_norm": 3.5874390602111816,
477
+ "learning_rate": 2.133333333333334e-06,
478
+ "loss": 1.2388,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.9066666666666666,
483
+ "grad_norm": 3.203536033630371,
484
+ "learning_rate": 1.8666666666666669e-06,
485
+ "loss": 1.2355,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.92,
490
+ "grad_norm": 3.3690247535705566,
491
+ "learning_rate": 1.6000000000000001e-06,
492
+ "loss": 1.3799,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.9333333333333333,
497
+ "grad_norm": 3.1757168769836426,
498
+ "learning_rate": 1.3333333333333334e-06,
499
+ "loss": 1.2549,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.9466666666666667,
504
+ "grad_norm": 3.5580122470855713,
505
+ "learning_rate": 1.066666666666667e-06,
506
+ "loss": 1.3206,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.96,
511
+ "grad_norm": 2.4422216415405273,
512
+ "learning_rate": 8.000000000000001e-07,
513
+ "loss": 1.2652,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.9733333333333334,
518
+ "grad_norm": 3.2295455932617188,
519
+ "learning_rate": 5.333333333333335e-07,
520
+ "loss": 1.2546,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.9866666666666667,
525
+ "grad_norm": 3.25307297706604,
526
+ "learning_rate": 2.666666666666667e-07,
527
+ "loss": 1.3874,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.0,
532
+ "grad_norm": 3.577993392944336,
533
+ "learning_rate": 0.0,
534
+ "loss": 1.3314,
535
+ "step": 750
536
+ }
537
+ ],
538
+ "logging_steps": 10,
539
+ "max_steps": 750,
540
+ "num_input_tokens_seen": 0,
541
+ "num_train_epochs": 1,
542
+ "save_steps": 500,
543
+ "stateful_callbacks": {
544
+ "TrainerControl": {
545
+ "args": {
546
+ "should_epoch_stop": false,
547
+ "should_evaluate": false,
548
+ "should_log": false,
549
+ "should_save": true,
550
+ "should_training_stop": true
551
+ },
552
+ "attributes": {}
553
+ }
554
+ },
555
+ "total_flos": 195972562944000.0,
556
+ "train_batch_size": 2,
557
+ "trial_name": null,
558
+ "trial_params": null
559
+ }
results/checkpoint-750/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6beed08edbb35ca6e9a80f4f0c2e64fd68d6b69087b1135dded06cfeaf727a05
3
+ size 5304
results/checkpoint-750/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
results/runs/May21_21-01-39_Vardaan_Aspire/events.out.tfevents.1747841540.Vardaan_Aspire.6608.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64bd6068596e88bb765fef2aa4dab5d0fef3a8c7c12416a30ae28b36a7904481
3
+ size 5369
results/runs/May21_21-07-12_Vardaan_Aspire/events.out.tfevents.1747841837.Vardaan_Aspire.16648.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:887cccb86efad0f284642ed81eb6cfdbf395b51ac8eb267913660bec4bd9603b
3
+ size 5371
results/runs/May21_21-14-05_Vardaan_Aspire/events.out.tfevents.1747842248.Vardaan_Aspire.7928.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e40e91bc782ac59098605977ae9b579d7870f1e69f6b114f43a11a8dd868b76c
3
+ size 5371
results/runs/May21_21-20-51_Vardaan_Aspire/events.out.tfevents.1747842654.Vardaan_Aspire.7928.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6554f22d532b849aa4458c5bafcbb829456d20366ac4e2350880e775eed71bde
3
+ size 5370
results/runs/May21_21-23-46_Vardaan_Aspire/events.out.tfevents.1747842827.Vardaan_Aspire.11864.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7ecbf193794a8fbea8914cd99e240858642c50d86cd57e7719d1bcbf8ff2749
3
+ size 5370
results/runs/May21_21-27-09_Vardaan_Aspire/events.out.tfevents.1747843029.Vardaan_Aspire.14288.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c093a8b1c5548b37f1c974deb08a86054a9578d4fe069f5b8fc745a0ba27d3c8
3
+ size 5369
results/runs/May21_21-28-25_Vardaan_Aspire/events.out.tfevents.1747843114.Vardaan_Aspire.14288.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da74765969de5fe6d9ebd0c52f00fd1b8c532a0a56631701c6c13123ca3faef
3
+ size 5572
results/runs/May21_21-50-19_Vardaan_Aspire/events.out.tfevents.1747844420.Vardaan_Aspire.14288.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2579751bdbe481663b1ae2cdfc54d9e178b3cabdfb206b3095f49f00884446a9
3
+ size 21496