| data_root: N/A | |
| input_channels: 1 | |
| input_feat_per_channel: 80 | |
| multitask: | |
| source_unit: | |
| data: N/A | |
| decoder_type: transformer | |
| dict: N/A | |
| encoder_layer: 6 | |
| loss_weight: 8.0 | |
| target_type: text | |
| output_channels: 1 | |
| output_feat_per_channel: 1 | |
| output_feat_reduction_rate: 0 | |
| output_sample_rate: 16000 | |
| specaugment: | |
| freq_mask_F: 27 | |
| freq_mask_N: 1 | |
| time_mask_N: 1 | |
| time_mask_T: 100 | |
| time_mask_p: 1.0 | |
| time_wrap_W: 0 | |
| transforms: | |
| _eval: | |
| - utterance_cmvn | |
| _train: | |
| - utterance_cmvn | |
| - specaugment | |
| vocoder: | |
| dur_prediction: true | |
| model_path: N/A | |
| speaker: false | |
| type: code_hifigan | |
| hub: | |
| input_type: fbank80_w_utt_cmvn | |
| tts_model_id: pytorch/fairseq:ust:unit_hifigan_mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj_dur | |
| unit_vocoder: true | |
| generation_args: | |
| beam: 10 | |
| max_len_a: 1 |