| ## Mel-filterbank settings | |
| mel_window_length = 25 # Analysis window length, in milliseconds | |
| mel_window_step = 10 # Step between successive windows, in milliseconds | |
| mel_n_channels = 40 | |
| ## Audio | |
| sampling_rate = 16000 | |
| # Number of spectrogram frames in a partial utterance | |
| partials_n_frames = 160 # 1600 ms (160 frames x 10 ms mel_window_step) | |
| # Number of spectrogram frames used at inference | |
| inference_n_frames = 80 # 800 ms (80 frames x 10 ms mel_window_step) | |
| ## Voice Activity Detection (VAD) | |
| # Window size of the VAD. Must be either 10, 20 or 30 milliseconds. | |
| # This sets the granularity of the VAD. Should not need to be changed. | |
| vad_window_length = 30 # VAD window size, in milliseconds | |
| # Number of frames to average together when performing the moving average smoothing. | |
| # The larger this value, the larger the VAD variations must be to avoid being smoothed out. | |
| vad_moving_average_width = 8 | |
| # Maximum number of consecutive silent frames a segment can have. | |
| vad_max_silence_length = 6 | |
| ## Audio volume normalization (target level, in dBFS) | |
| audio_norm_target_dBFS = -30 | |