Upload SpeakerEncoder

Browse files

Files changed (3)

config.json +3 -3
model.safetensors +2 -2
modeling_ecapa_tdnn.py +6 -94

config.json CHANGED

@@ -1,11 +1,11 @@
 {
   "C": 1024,
   "architectures": [
-    "HFECAPATDNN"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_ecapa_tdnn.ECAPAConfig",
-    "AutoModel": "modeling_ecapa_tdnn.HFECAPATDNN"
   },
   "model_type": "ecapa_tdnn",
   "torch_dtype": "float32",

 {
   "C": 1024,
   "architectures": [
+    "SpeakerEncoder"
   ],
   "auto_map": {
+    "AutoConfig": "modeling_ecapa_tdnn.ECAPAConfig",
+    "AutoModel": "modeling_ecapa_tdnn.SpeakerEncoder"
   },
   "model_type": "ecapa_tdnn",
   "torch_dtype": "float32",

model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e8babea99d09e708dadae8623a538b406825d7b5af527584cee204b957785d0
-size 66667584

 version https://git-lfs.github.com/spec/v1
+oid sha256:99a87fdb4f4b9608940134f211d1d61f64107667bfad2003948da449a1902197
+size 65020192

modeling_ecapa_tdnn.py CHANGED

@@ -78,85 +78,11 @@ class Bottle2neck(nn.Module):
         out += residual
         return out
-class PreEmphasis(torch.nn.Module):
-    def __init__(self, coef: float = 0.97):
-        super().__init__()
-        self.coef = coef
-        self.register_buffer(
-            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
-        )
-    def forward(self, input: torch.tensor) -> torch.tensor:
-        input = input.unsqueeze(1)
-        input = F.pad(input, (1, 0), 'reflect')
-        return F.conv1d(input, self.flipped_filter).squeeze(1)
-class FbankAug(nn.Module):
-    def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)):
-        self.time_mask_width = time_mask_width
-        self.freq_mask_width = freq_mask_width
-        super().__init__()
-    def mask_along_axis(self, x, dim):
-        original_size = x.shape
-        batch, fea, time = x.shape
-        if dim == 1:
-            D = fea
-            width_range = self.freq_mask_width
-        else:
-            D = time
-            width_range = self.time_mask_width
-        mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
-        mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
-        arange = torch.arange(D, device=x.device).view(1, 1, -1)
-        mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
-        mask = mask.any(dim=1)
-        if dim == 1:
-            mask = mask.unsqueeze(2)
-        else:
-            mask = mask.unsqueeze(1)
-        x = x.masked_fill_(mask, 0.0)
-        return x.view(*original_size)
-    def forward(self, x):
-        x = self.mask_along_axis(x, dim=2)
-        x = self.mask_along_axis(x, dim=1)
-        return x
-class ECAPA_TDNN(nn.Module):
     def __init__(self, C):
-        super(ECAPA_TDNN, self).__init__()
-        self.torchfbank = torch.nn.Sequential(
-            PreEmphasis(),
-            # torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
-            #                                      f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
-            torchaudio.transforms.Resample(orig_freq=16000, new_freq=22050),
-            torchaudio.transforms.MelSpectrogram(
-                sample_rate = 22050,
-                n_fft       = 2048,
-                hop_length  = 512,
-                win_length  = 2048,
-                # window_fn   = lambda *_: window,
-                center      = False,
-                power       = 2.0,
-                n_mels      = 256,
-                norm        = "slaney",
-                mel_scale   = "htk",
-            ),
-            torchaudio.transforms.AmplitudeToDB(
-                stype="power", top_db=80
-            )
-            )
-        self.specaug = FbankAug() # Spec augmentation
         # self.conv1  = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
         # self.conv1  = nn.Conv1d(256, C, kernel_size=5, stride=1, padding=2)
@@ -181,19 +107,7 @@ class ECAPA_TDNN(nn.Module):
         self.bn6 = nn.BatchNorm1d(192)
-    def forward(self, x, aug):
-        with torch.no_grad():
-            x = self.torchfbank(x)
-            # x = self.torchfbank(x)+1e-6
-            # x = x.log()
-            x = x - torch.mean(x, dim=-1, keepdim=True) # mean normalization
-            if aug == True:
-                x = self.specaug(x)
-            # only take the first 232 mel bins
-            if x.dim() == 3:
-                x = x[:, :232, :]
-            else:
-                x = x[:232]
         x = self.conv1(x)
         x = self.relu(x)
@@ -224,9 +138,7 @@ class ECAPA_TDNN(nn.Module):
 import torch
-from transformers import PreTrainedModel
-# from configuration_ecapa_tdnn import ECAPAConfig
-from transformers import PretrainedConfig
 class ECAPAConfig(PretrainedConfig):
@@ -238,11 +150,11 @@ class ECAPAConfig(PretrainedConfig):
-class HFECAPATDNN(PreTrainedModel):
     config_class = ECAPAConfig
     base_model_prefix = "ecapa_tdnn"
     def __init__(self, config):
         super().__init__(config)
-        self.model = ECAPA_TDNN(C=config.C)
     def forward(self, *args, **kwargs):
         return self.model(*args, **kwargs)

         out += residual
         return out
+class EcapaTdnnEncoder(nn.Module):
     def __init__(self, C):
+        super(EcapaTdnnEncoder, self).__init__()
         # self.conv1  = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
         # self.conv1  = nn.Conv1d(256, C, kernel_size=5, stride=1, padding=2)
         self.bn6 = nn.BatchNorm1d(192)
+    def forward(self, x):
         x = self.conv1(x)
         x = self.relu(x)
 import torch
+from transformers import PreTrainedModel, PretrainedConfig
 class ECAPAConfig(PretrainedConfig):
+class SpeakerEncoder(PreTrainedModel):
     config_class = ECAPAConfig
     base_model_prefix = "ecapa_tdnn"
     def __init__(self, config):
         super().__init__(config)
+        self.model = EcapaTdnnEncoder(C=config.C)
     def forward(self, *args, **kwargs):
         return self.model(*args, **kwargs)