Skip to content

Commit 400f37f

Browse files
committed
NVIDIA#1369 Updated DeepLearningExamples so that all projects can use latest librosa.
modified files:
  CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py
  CUDA-Optimized/FastSpeech/generate.py
  CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py
  CUDA-Optimized/FastSpeech/tacotron2/layers.py
  Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb
  Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb
  PyTorch/SpeechRecognition/Jasper/requirements.txt
  PyTorch/SpeechRecognition/QuartzNet/requirements.txt
  PyTorch/SpeechRecognition/wav2vec2/requirements.txt
  PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py
  PyTorch/SpeechSynthesis/FastPitch/requirements.txt
  PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt
  PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py
  PyTorch/SpeechSynthesis/Tacotron2/trtis_cpp/src/trt/requirements.txt
1 parent 9dd9fcb commit 400f37f

File tree

14 files changed: +17 lines added, -18 lines removed

14 files changed: +17 lines added, -18 lines removed

CUDA-Optimized/FastSpeech/fastspeech/dataset/ljspeech_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def __getitem__(self, idx):
107107

108108
# Audio processing
109109
wav, _ = librosa.effects.trim(wav, frame_length=self.win_len, hop_length=self.hop_len)
110-
110+
111111
if self.mels_path:
112112
mel = np.load(os.path.join(self.mels_path, name + ".mel.npy"))
113113
else:

CUDA-Optimized/FastSpeech/generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import time
2929

3030
import fire
31-
import librosa
31+
import soundfile
3232
import torch
3333

3434
from fastspeech.data_load import PadDataLoader
@@ -158,7 +158,7 @@ def generate(hparam='infer.yaml',
158158
wav = wav[:wav_len]
159159

160160
path = os.path.join(results_path, text[:MAX_FILESIZE] + ".wav")
161-
librosa.output.write_wav(path, wav, hp.sr)
161+
soundfile.write(path, wav, hp.sr)
162162

163163
except StopIteration:
164164
tprint("Generation has been done.")

CUDA-Optimized/FastSpeech/tacotron2/audio_processing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
7979
# Compute the squared window at the desired length
8080
win_sq = get_window(window, win_length, fftbins=True)
8181
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
82-
win_sq = librosa_util.pad_center(win_sq, n_fft)
82+
win_sq = librosa_util.pad_center(win_sq, size=n_fft)
8383

8484
# Fill the envelope
8585
for i in range(n_frames):

CUDA-Optimized/FastSpeech/tacotron2/layers.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
"""https://github.com/NVIDIA/tacotron2"""
3232

3333
import torch
34-
from librosa.filters import mel as librosa_mel_fn
3534

3635

3736
class LinearNorm(torch.nn.Module):

Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_offline_demo.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,10 @@
511511
" \"\"\"\n",
512512
" samples = self._convert_samples_to_float32(samples)\n",
513513
" if target_sr is not None and target_sr != sample_rate:\n",
514-
" samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
514+
" samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
515515
" sample_rate = target_sr\n",
516516
" if trim:\n",
517-
" samples, _ = librosa.effects.trim(samples, trim_db)\n",
517+
" samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
518518
" self._samples = samples\n",
519519
" self._sample_rate = sample_rate\n",
520520
" if self._samples.ndim >= 2:\n",

Kaldi/SpeechRecognition/notebooks/Kaldi_TRTIS_inference_online_demo.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -640,10 +640,10 @@
640640
" \"\"\"\n",
641641
" samples = self._convert_samples_to_float32(samples)\n",
642642
" if target_sr is not None and target_sr != sample_rate:\n",
643-
" samples = librosa.core.resample(samples, sample_rate, target_sr)\n",
643+
" samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)\n",
644644
" sample_rate = target_sr\n",
645645
" if trim:\n",
646-
" samples, _ = librosa.effects.trim(samples, trim_db)\n",
646+
" samples, _ = librosa.effects.trim(samples, top_db=trim_db)\n",
647647
" self._samples = samples\n",
648648
" self._sample_rate = sample_rate\n",
649649
" if self._samples.ndim >= 2:\n",

PyTorch/SpeechRecognition/Jasper/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
inflect==5.3.0
22
ipdb
3-
librosa==0.9.0
3+
librosa>=0.9.0
44
pandas==1.5.2
55
pyyaml>=5.4
66
soundfile

PyTorch/SpeechRecognition/QuartzNet/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
inflect==5.3.0
22
ipdb
3-
librosa==0.9.0
3+
librosa>=0.9.0
44
pandas==1.5.2
55
pyyaml>=5.4
66
soundfile

PyTorch/SpeechRecognition/wav2vec2/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
editdistance==0.6.0
2-
librosa==0.10.1
2+
librosa>=0.10.1
33
omegaconf==2.0.6 # optional for handling certain Fairseq ckpts
44
pyarrow==6.0.1
55
soundfile==0.12.1

PyTorch/SpeechSynthesis/FastPitch/hifigan/data_function.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
6969
global mel_basis, hann_window
7070
fmax_key = f'{fmax}_{y.device}'
7171
if fmax_key not in mel_basis:
72-
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
72+
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
7373
mel_basis[fmax_key] = torch.from_numpy(mel).float().to(y.device)
7474
hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
7575

PyTorch/SpeechSynthesis/FastPitch/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
inflect
2-
librosa==0.9.0
2+
librosa>=0.9.0
33
matplotlib
44
numpy
55
pynvml==11.0.0

PyTorch/SpeechSynthesis/HiFiGAN/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
inflect
2-
librosa==0.9.0
2+
librosa>=0.9.0
33
numpy
44
pandas
55
pynvml==11.0.0

PyTorch/SpeechSynthesis/Tacotron2/notebooks/conversationalai/client/speech_ai_demo/utils/jasper/speech_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,10 +383,10 @@ def __init__(self, samples, sample_rate, target_sr=16000, trim=False,
383383
"""
384384
samples = self._convert_samples_to_float32(samples)
385385
if target_sr is not None and target_sr != sample_rate:
386-
samples = librosa.core.resample(samples, sample_rate, target_sr)
386+
samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
387387
sample_rate = target_sr
388388
if trim:
389-
samples, _ = librosa.effects.trim(samples, trim_db)
389+
samples, _ = librosa.effects.trim(samples, top_db=trim_db)
390390
self._samples = samples
391391
self._sample_rate = sample_rate
392392
if self._samples.ndim >= 2:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
torch==1.3.0
22
onnx==1.5.0
33
scipy==1.3.1
4-
librosa==0.7.0
4+
librosa

0 commit comments

Comments (0)