Automatic Speech Recognition
ESPnet
multilingual
audio
speech-translation
language-identification
pyf98 committed on
Commit
e2e75ad
·
verified ·
1 Parent(s): f880d37

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +43 -45
README.md CHANGED
@@ -68,28 +68,27 @@ import torch
68
  from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
69
 
70
 
71
- if __name__ == "__main__":
72
- context_len_in_secs = 4 # left and right context when doing buffered inference
73
- batch_size = 32 # depends on the GPU memory
74
- s2t = Speech2TextGreedySearch.from_pretrained(
75
- "pyf98/owsm_ctc_v3.1_1B",
76
- device='cuda' if torch.cuda.is_available() else 'cpu',
77
- generate_interctc_outputs=False,
78
- lang_sym='<eng>',
79
- task_sym='<asr>',
80
- )
81
-
82
- speech, rate = sf.read(
83
- "xxx.wav"
84
- )
85
-
86
- text = s2t.decode_long_batched_buffered(
87
- speech,
88
- batch_size=batch_size,
89
- context_len_in_secs=context_len_in_secs,
90
- frames_per_sec=12.5, # 80ms shift, model-dependent, don't change
91
- )
92
- print(text)
93
  ```
94
 
95
  ### Example for CTC forced alignment using `ctc-segmentation`
@@ -102,31 +101,30 @@ import soundfile as sf
102
  from espnet2.bin.s2t_ctc_align import CTCSegmentation
103
 
104
 
105
- if __name__ == "__main__":
106
- ## Please download model first
107
- aligner = CTCSegmentation(
108
- s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
109
- fs=16000,
110
- ngpu=1,
111
- batch_size=16, # batched parallel decoding; reduce it if your GPU memory is smaller
112
- kaldi_style_text=True,
113
- time_stamps="fixed",
114
- samples_to_frames_ratio=1280, # 80ms time shift; don't change as it depends on the pre-trained model
115
- lang_sym="<eng>",
116
- task_sym="<asr>",
117
- context_len_in_secs=2, # left and right context in buffered decoding
118
- frames_per_sec=12.5, # 80ms time shift; don't change as it depends on the pre-trained model
119
- )
120
-
121
- speech, rate = sf.read(
122
- "example.wav"
123
- )
124
- print(f"speech duration: {len(speech) / rate : .2f} seconds")
125
- text = '''
126
  utt1 hello there
127
  utt2 welcome to this repo
128
  '''
129
 
130
- segments = aligner(speech, text)
131
- print(segments)
132
  ```
 
68
  from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
69
 
70
 
71
+ context_len_in_secs = 4 # left and right context when doing buffered inference
72
+ batch_size = 32 # depends on the GPU memory
73
+ s2t = Speech2TextGreedySearch.from_pretrained(
74
+ "pyf98/owsm_ctc_v3.1_1B",
75
+ device='cuda' if torch.cuda.is_available() else 'cpu',
76
+ generate_interctc_outputs=False,
77
+ lang_sym='<eng>',
78
+ task_sym='<asr>',
79
+ )
80
+
81
+ speech, rate = sf.read(
82
+ "xxx.wav"
83
+ )
84
+
85
+ text = s2t.decode_long_batched_buffered(
86
+ speech,
87
+ batch_size=batch_size,
88
+ context_len_in_secs=context_len_in_secs,
89
+ frames_per_sec=12.5, # 80ms shift, model-dependent, don't change
90
+ )
91
+ print(text)
 
92
  ```
93
 
94
  ### Example for CTC forced alignment using `ctc-segmentation`
 
101
  from espnet2.bin.s2t_ctc_align import CTCSegmentation
102
 
103
 
104
+ ## Please download model first
105
+ aligner = CTCSegmentation(
106
+ s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
107
+ fs=16000,
108
+ ngpu=1,
109
+ batch_size=16, # batched parallel decoding; reduce it if your GPU memory is smaller
110
+ kaldi_style_text=True,
111
+ time_stamps="fixed",
112
+ samples_to_frames_ratio=1280, # 80ms time shift; don't change as it depends on the pre-trained model
113
+ lang_sym="<eng>",
114
+ task_sym="<asr>",
115
+ context_len_in_secs=2, # left and right context in buffered decoding
116
+ frames_per_sec=12.5, # 80ms time shift; don't change as it depends on the pre-trained model
117
+ )
118
+
119
+ speech, rate = sf.read(
120
+ "example.wav"
121
+ )
122
+ print(f"speech duration: {len(speech) / rate : .2f} seconds")
123
+ text = '''
 
124
  utt1 hello there
125
  utt2 welcome to this repo
126
  '''
127
 
128
+ segments = aligner(speech, text)
129
+ print(segments)
130
  ```