Upload SpeakerEncoder
Browse files- config.json +3 -3
- model.safetensors +2 -2
- modeling_ecapa_tdnn.py +6 -94
config.json
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
{
|
2 |
"C": 1024,
|
3 |
"architectures": [
|
4 |
-
"
|
5 |
],
|
6 |
"auto_map": {
|
7 |
-
"AutoConfig": "
|
8 |
-
"AutoModel": "modeling_ecapa_tdnn.
|
9 |
},
|
10 |
"model_type": "ecapa_tdnn",
|
11 |
"torch_dtype": "float32",
|
|
|
1 |
{
|
2 |
"C": 1024,
|
3 |
"architectures": [
|
4 |
+
"SpeakerEncoder"
|
5 |
],
|
6 |
"auto_map": {
|
7 |
+
"AutoConfig": "modeling_ecapa_tdnn.ECAPAConfig",
|
8 |
+
"AutoModel": "modeling_ecapa_tdnn.SpeakerEncoder"
|
9 |
},
|
10 |
"model_type": "ecapa_tdnn",
|
11 |
"torch_dtype": "float32",
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:99a87fdb4f4b9608940134f211d1d61f64107667bfad2003948da449a1902197
|
3 |
+
size 65020192
|
modeling_ecapa_tdnn.py
CHANGED
@@ -78,85 +78,11 @@ class Bottle2neck(nn.Module):
|
|
78 |
out += residual
|
79 |
return out
|
80 |
|
81 |
-
class
|
82 |
-
|
83 |
-
def __init__(self, coef: float = 0.97):
|
84 |
-
super().__init__()
|
85 |
-
self.coef = coef
|
86 |
-
self.register_buffer(
|
87 |
-
'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
|
88 |
-
)
|
89 |
-
|
90 |
-
def forward(self, input: torch.tensor) -> torch.tensor:
|
91 |
-
input = input.unsqueeze(1)
|
92 |
-
input = F.pad(input, (1, 0), 'reflect')
|
93 |
-
return F.conv1d(input, self.flipped_filter).squeeze(1)
|
94 |
-
|
95 |
-
class FbankAug(nn.Module):
|
96 |
-
|
97 |
-
def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)):
|
98 |
-
self.time_mask_width = time_mask_width
|
99 |
-
self.freq_mask_width = freq_mask_width
|
100 |
-
super().__init__()
|
101 |
-
|
102 |
-
def mask_along_axis(self, x, dim):
|
103 |
-
original_size = x.shape
|
104 |
-
batch, fea, time = x.shape
|
105 |
-
if dim == 1:
|
106 |
-
D = fea
|
107 |
-
width_range = self.freq_mask_width
|
108 |
-
else:
|
109 |
-
D = time
|
110 |
-
width_range = self.time_mask_width
|
111 |
-
|
112 |
-
mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
|
113 |
-
mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
|
114 |
-
arange = torch.arange(D, device=x.device).view(1, 1, -1)
|
115 |
-
mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
|
116 |
-
mask = mask.any(dim=1)
|
117 |
-
|
118 |
-
if dim == 1:
|
119 |
-
mask = mask.unsqueeze(2)
|
120 |
-
else:
|
121 |
-
mask = mask.unsqueeze(1)
|
122 |
-
|
123 |
-
x = x.masked_fill_(mask, 0.0)
|
124 |
-
return x.view(*original_size)
|
125 |
-
|
126 |
-
def forward(self, x):
|
127 |
-
x = self.mask_along_axis(x, dim=2)
|
128 |
-
x = self.mask_along_axis(x, dim=1)
|
129 |
-
return x
|
130 |
-
|
131 |
-
class ECAPA_TDNN(nn.Module):
|
132 |
|
133 |
def __init__(self, C):
|
134 |
|
135 |
-
super(
|
136 |
-
|
137 |
-
self.torchfbank = torch.nn.Sequential(
|
138 |
-
PreEmphasis(),
|
139 |
-
# torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
|
140 |
-
# f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
|
141 |
-
torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050),
|
142 |
-
torchaudio.transforms.MelSpectrogram(
|
143 |
-
sample_rate = 22050,
|
144 |
-
n_fft = 2048,
|
145 |
-
hop_length = 512,
|
146 |
-
win_length = 2048,
|
147 |
-
# window_fn = lambda *_: window,
|
148 |
-
center = False,
|
149 |
-
power = 2.0,
|
150 |
-
n_mels = 256,
|
151 |
-
norm = "slaney",
|
152 |
-
mel_scale = "htk",
|
153 |
-
),
|
154 |
-
torchaudio.transforms.AmplitudeToDB(
|
155 |
-
stype="power", top_db=80
|
156 |
-
)
|
157 |
-
)
|
158 |
-
|
159 |
-
self.specaug = FbankAug() # Spec augmentation
|
160 |
|
161 |
# self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
|
162 |
# self.conv1 = nn.Conv1d(256, C, kernel_size=5, stride=1, padding=2)
|
@@ -181,19 +107,7 @@ class ECAPA_TDNN(nn.Module):
|
|
181 |
self.bn6 = nn.BatchNorm1d(192)
|
182 |
|
183 |
|
184 |
-
def forward(self, x
|
185 |
-
with torch.no_grad():
|
186 |
-
x = self.torchfbank(x)
|
187 |
-
# x = self.torchfbank(x)+1e-6
|
188 |
-
# x = x.log()
|
189 |
-
x = x - torch.mean(x, dim=-1, keepdim=True) # mean normalization
|
190 |
-
if aug == True:
|
191 |
-
x = self.specaug(x)
|
192 |
-
# only take the first 232 mel bins
|
193 |
-
if x.dim() == 3:
|
194 |
-
x = x[:, :232, :]
|
195 |
-
else:
|
196 |
-
x = x[:232]
|
197 |
|
198 |
x = self.conv1(x)
|
199 |
x = self.relu(x)
|
@@ -224,9 +138,7 @@ class ECAPA_TDNN(nn.Module):
|
|
224 |
|
225 |
|
226 |
import torch
|
227 |
-
from transformers import PreTrainedModel
|
228 |
-
# from configuration_ecapa_tdnn import ECAPAConfig
|
229 |
-
from transformers import PretrainedConfig
|
230 |
|
231 |
|
232 |
class ECAPAConfig(PretrainedConfig):
|
@@ -238,11 +150,11 @@ class ECAPAConfig(PretrainedConfig):
|
|
238 |
|
239 |
|
240 |
|
241 |
-
class
|
242 |
config_class = ECAPAConfig
|
243 |
base_model_prefix = "ecapa_tdnn"
|
244 |
def __init__(self, config):
|
245 |
super().__init__(config)
|
246 |
-
self.model =
|
247 |
def forward(self, *args, **kwargs):
|
248 |
return self.model(*args, **kwargs)
|
|
|
78 |
out += residual
|
79 |
return out
|
80 |
|
81 |
+
class EcapaTdnnEncoder(nn.Module):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
def __init__(self, C):
|
84 |
|
85 |
+
super(EcapaTdnnEncoder, self).__init__()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
# self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
|
88 |
# self.conv1 = nn.Conv1d(256, C, kernel_size=5, stride=1, padding=2)
|
|
|
107 |
self.bn6 = nn.BatchNorm1d(192)
|
108 |
|
109 |
|
110 |
+
def forward(self, x):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
111 |
|
112 |
x = self.conv1(x)
|
113 |
x = self.relu(x)
|
|
|
138 |
|
139 |
|
140 |
import torch
|
141 |
+
from transformers import PreTrainedModel, PretrainedConfig
|
|
|
|
|
142 |
|
143 |
|
144 |
class ECAPAConfig(PretrainedConfig):
|
|
|
150 |
|
151 |
|
152 |
|
153 |
+
class SpeakerEncoder(PreTrainedModel):
|
154 |
config_class = ECAPAConfig
|
155 |
base_model_prefix = "ecapa_tdnn"
|
156 |
def __init__(self, config):
|
157 |
super().__init__(config)
|
158 |
+
self.model = EcapaTdnnEncoder(C=config.C)
|
159 |
def forward(self, *args, **kwargs):
|
160 |
return self.model(*args, **kwargs)
|