Ray0323 committed on
Commit
706d2f9
·
verified ·
1 Parent(s): 72ad347

Upload Model config and Train config

Browse files
Files changed (2) hide show
  1. model_config.json +209 -0
  2. train_config.json +67 -0
model_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "summary": {
3
+ "quantizer_dim": 3584,
4
+ "codebook_per_group_per_residual": 3584,
5
+ "group": 1,
6
+ "residual": 1,
7
+ "original_residual_dim": 1024,
8
+ "codebook_upsample": 3.5,
9
+ "codebook_dim": 3584
10
+ },
11
+ "base_model": "QWen2.5-7B-Pretrain",
12
+ "token_id_offset": 152064,
13
+ "spec_transform": {
14
+ "sampling_rate": 24000,
15
+ "segment_size": 72000,
16
+ "num_mels": 128,
17
+ "n_fft": 1024,
18
+ "hop_size": 256,
19
+ "win_size": 1024,
20
+ "fmin": 0,
21
+ "fmax": 12000,
22
+ "fmax_loss": null
23
+ },
24
+ "encoder": {
25
+ "input_channels": 128,
26
+ "depths": [
27
+ 3,
28
+ 3,
29
+ 9,
30
+ 3
31
+ ],
32
+ "dims": [
33
+ 256,
34
+ 512,
35
+ 768,
36
+ 1024
37
+ ],
38
+ "drop_path_rate": 0.2,
39
+ "kernel_size": 7
40
+ },
41
+ "decoder": {
42
+ "hop_length": 256,
43
+ "upsample_rates": [
44
+ 8,
45
+ 4,
46
+ 2,
47
+ 2,
48
+ 2
49
+ ],
50
+ "upsample_kernel_sizes": [
51
+ 16,
52
+ 12,
53
+ 4,
54
+ 4,
55
+ 4
56
+ ],
57
+ "resblock_kernel_sizes": [
58
+ 3,
59
+ 7,
60
+ 11
61
+ ],
62
+ "resblock_dilation_sizes": [
63
+ [
64
+ 1,
65
+ 3,
66
+ 5
67
+ ],
68
+ [
69
+ 1,
70
+ 3,
71
+ 5
72
+ ],
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ]
78
+ ],
79
+ "num_mels": 1024,
80
+ "upsample_initial_channel": 1024,
81
+ "use_template": false,
82
+ "pre_conv_kernel_size": 13,
83
+ "post_conv_kernel_size": 13
84
+ },
85
+ "quantizer": {
86
+ "quantizer_type": "grvq",
87
+ "input_dim": 1024,
88
+ "n_groups": 1,
89
+ "n_codebooks": 1,
90
+ "codebook_size": 32768,
91
+ "codebook_dim": 3584,
92
+ "levels": [
93
+ 8,
94
+ 5,
95
+ 5,
96
+ 5
97
+ ],
98
+ "downsample_factor": [
99
+ 1
100
+ ],
101
+ "ema_decay": 0.8,
102
+ "codebook_diversity_loss_weight": 1.0,
103
+ "codebook_diversity_temperature": 100.0
104
+ },
105
+ "teacher_quantizer": {
106
+ "quantizer_type": "grvq",
107
+ "input_dim": 1024,
108
+ "n_groups": 2,
109
+ "n_codebooks": 1,
110
+ "codebook_size": 32768,
111
+ "codebook_dim": 3584,
112
+ "levels": [
113
+ 8,
114
+ 5,
115
+ 5,
116
+ 5
117
+ ],
118
+ "downsample_factor": [
119
+ 2
120
+ ],
121
+ "ema_decay": 0.8,
122
+ "codebook_diversity_loss_weight": 1.0,
123
+ "codebook_diversity_temperature": 100.0
124
+ },
125
+ "descriminators": {
126
+ "MultiPeriodDiscriminator": {
127
+ "periods": [
128
+ 5,
129
+ 8,
130
+ 13,
131
+ 19,
132
+ 30
133
+ ],
134
+ "kernal_size": 5,
135
+ "stride": 3
136
+ },
137
+ "MultiScaleDiscriminator": {
138
+ "avg_poolings": {
139
+ "kernal_sizes": [
140
+ 6,
141
+ 6
142
+ ],
143
+ "stridess": [
144
+ 3,
145
+ 3
146
+ ],
147
+ "paddings": [
148
+ 3,
149
+ 3
150
+ ]
151
+ },
152
+ "DiscriminatorS": {
153
+ "kernal_sizes": [
154
+ 21,
155
+ 61,
156
+ 61,
157
+ 61,
158
+ 61,
159
+ 61,
160
+ 7
161
+ ],
162
+ "strides": [
163
+ 1,
164
+ 3,
165
+ 3,
166
+ 6,
167
+ 6,
168
+ 1,
169
+ 1
170
+ ],
171
+ "paddings": [
172
+ 10,
173
+ 30,
174
+ 30,
175
+ 30,
176
+ 30,
177
+ 30,
178
+ 3
179
+ ]
180
+ }
181
+ },
182
+ "MultiScaleSTFTDiscriminator": {
183
+ "n_ffts": [
184
+ 1024,
185
+ 2048,
186
+ 512,
187
+ 256,
188
+ 128
189
+ ],
190
+ "hop_lengths": [
191
+ 256,
192
+ 512,
193
+ 128,
194
+ 64,
195
+ 32
196
+ ],
197
+ "win_lengths": [
198
+ 1024,
199
+ 2048,
200
+ 512,
201
+ 256,
202
+ 128
203
+ ],
204
+ "filters": 32,
205
+ "in_channels": 1,
206
+ "out_channels": 1
207
+ }
208
+ }
209
+ }
train_config.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "adam": {
3
+ "learning_rate": 0.0002,
4
+ "adam_b1": 0.5,
5
+ "adam_b2": 0.9,
6
+ "lr_decay": 0.98,
7
+ "weight_decay": 0.001
8
+ },
9
+ "wandb": {
10
+ "use_wandb": true,
11
+ "wandb_project": "audio-tokenizer-0927",
12
+ "wandb_group": "audio-tokenizer",
13
+ "wandb_team": "ccnl-s-team"
14
+ },
15
+ "mels_path": null,
16
+ "training_files_path": {
17
+ "replay_training_file_pathes": [
18
+ {
19
+ "path": "/cognitive_comp/common_data/audio/output/24k",
20
+ "replay_rate": 1.0
21
+ },
22
+ {
23
+ "path": "/cognitive_comp/common_data/audio/output/24k_data/24k_1_10s/stage2",
24
+ "replay_rate": 0.6
25
+ },
26
+ {
27
+ "path": "/cognitive_comp/common_data/audio/output/24k_data/24k_1_10s/stage3",
28
+ "replay_rate": 0.6
29
+ },
30
+ {
31
+ "path": "/cognitive_comp/common_data/audio/output/lam/asr_tts",
32
+ "replay_rate": 0.01
33
+ }
34
+ ],
35
+ "current_training_file_pathes": [
36
+ "/cognitive_comp/common_data/audio/output/24k_data/24k_1_10s/stage4"
37
+ ]
38
+ },
39
+ "validation_files_path": [
40
+ "/cognitive_comp/common_data/audio/output/dev",
41
+ "/cognitive_comp/common_data/audio/output/24k_data/dev/stage2",
42
+ "/cognitive_comp/common_data/audio/output/24k_data/dev/stage3",
43
+ "/cognitive_comp/common_data/audio/output/24k_data/dev/stage4"
44
+ ],
45
+ "save_path": "/cognitive_comp/wangrui/data/lam0927",
46
+ "distill": {
47
+ "is_distill": true,
48
+ "use_fm_distill": false,
49
+ "quantizer_transfer": false,
50
+ "teacher_ckpt_path": "/cognitive_comp/wangrui/data/lam0923/saved_ckpt"
51
+ },
52
+ "training_epochs": 50,
53
+ "checkpoint_interval": 1000,
54
+ "validation_interval": 2000,
55
+ "accumulation_steps": 1,
56
+ "summary_interval": 10,
57
+ "stdout_interval": 10,
58
+ "num_ckpt_keep": 30,
59
+ "fine_tuning": false,
60
+ "num_gpus": 8,
61
+ "batch_size": 152,
62
+ "seed": 1234,
63
+ "sampling_rate": 24000,
64
+ "codebook_loss_lambda": 1.0,
65
+ "commitment_loss_lambda": 0.25,
66
+ "num_workers": 4
67
+ }