|
| 1 | +from typing import Optional |
| 2 | + |
| 3 | +from torch import Tensor |
| 4 | +from torch import nn |
| 5 | + |
| 6 | +__all__ = ["Wav2Letter"] |
| 7 | + |
| 8 | + |
class Wav2Letter(nn.Module):
    r"""Wav2Letter model architecture from the `"Wav2Letter: an End-to-End ConvNet-based Speech Recognition System"
    <https://arxiv.org/abs/1609.03193>`_ paper.

    :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`

    Args:
        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
            or ``mfcc`` (Default: ``waveform``).
        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).

    Raises:
        ValueError: If ``input_type`` is not one of ``waveform``, ``power_spectrum`` or ``mfcc``.
    """

    _VALID_INPUT_TYPES = ("waveform", "power_spectrum", "mfcc")

    def __init__(self, num_classes: int = 40,
                 input_type: str = "waveform",
                 num_features: int = 1) -> None:
        super().__init__()

        # Fail fast: the original code silently skipped assigning
        # ``self.acoustic_model`` for unknown input types, deferring the
        # failure to an opaque AttributeError inside forward().
        if input_type not in self._VALID_INPUT_TYPES:
            raise ValueError(
                "input_type must be one of {}, got {!r}".format(self._VALID_INPUT_TYPES, input_type)
            )

        # The waveform front-end emits 250 channels; spectral inputs feed the
        # acoustic stack directly with ``num_features`` channels.
        acoustic_num_features = 250 if input_type == "waveform" else num_features

        layers = [
            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
            nn.ReLU(inplace=True),
        ]
        # Seven identical 250 -> 250 convolutions (kernel 7, stride 1,
        # length-preserving padding), per the paper's architecture. Built in a
        # loop so the layer order — and hence the state_dict keys of the
        # resulting nn.Sequential — matches the hand-unrolled original exactly.
        for _ in range(7):
            layers += [
                nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
                nn.ReLU(inplace=True),
            ]
        layers += [
            nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
        ]
        acoustic_model = nn.Sequential(*layers)

        if input_type == "waveform":
            # Raw-waveform front-end: a strided convolution acting as a
            # learned filterbank, producing the 250 channels the acoustic
            # stack expects.
            waveform_model = nn.Sequential(
                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
                nn.ReLU(inplace=True),
            )
            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
        else:
            self.acoustic_model = acoustic_model

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Predictor tensor of dimension (batch_size, number_of_classes, output_length),
            where output_length is determined by the strided convolutions
            (it equals input_length only when every stride is 1).
        """
        x = self.acoustic_model(x)
        # Per-frame log-probabilities over the classes, suitable for CTC loss.
        x = nn.functional.log_softmax(x, dim=1)
        return x
0 commit comments