From f0ceffc39b51fbf82be22ae0d6ef695ea8fbfb8d Mon Sep 17 00:00:00 2001 From: alastorid Date: Thu, 26 Dec 2024 03:20:07 +0800 Subject: [PATCH 1/3] Merge remote-tracking branch 'zgldh/main' --- melo/api.py | 2 +- melo/utils.py | 11 +++++++++++ requirements.txt | 1 + 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/melo/api.py b/melo/api.py index 236ea8f17..567df0678 100644 --- a/melo/api.py +++ b/melo/api.py @@ -122,7 +122,7 @@ def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_s )[0][0, 0].data.cpu().float().numpy() del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers # - audio_list.append(audio) + audio_list.append(utils.fix_loudness(audio,self.hps.data.sampling_rate)) torch.cuda.empty_cache() audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed) diff --git a/melo/utils.py b/melo/utils.py index bafca5a6d..bbe3fdeae 100644 --- a/melo/utils.py +++ b/melo/utils.py @@ -13,11 +13,22 @@ from melo.text.cleaner import clean_text from melo import commons +import pyloudnorm as pyln + MATPLOTLIB_FLAG = False logger = logging.getLogger(__name__) +def fix_loudness(input, rate): + # 峰值归一化至 -1 dB + peak_normalized_audio = pyln.normalize.peak(input, -1.0) + + # 测量响度 + meter = pyln.Meter(rate) + loudness = meter.integrated_loudness(peak_normalized_audio) + # 响度归一化至 -12 dB LUFS + return pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0) def get_text_for_tts_infer(text, language_str, hps, device, symbol_to_id=None): norm_text, phone, tone, word2ph = clean_text(text, language_str) diff --git a/requirements.txt b/requirements.txt index a79f61599..1cc2f0086 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,3 +27,4 @@ langid==1.1.6 tqdm tensorboard==2.16.2 loguru==0.7.2 +pyloudnorm \ No newline at end of file From 76c31870b5f4a3584d3bbe3f09a1191766649fa1 Mon Sep 17 00:00:00 2001 From: alastorid Date: Thu, 26 Dec 2024 04:53:50 +0800 Subject: [PATCH 2/3] SAY GOODBYE TO CLIPPING --- melo/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/melo/utils.py b/melo/utils.py index bbe3fdeae..94ce92946 100644 --- a/melo/utils.py +++ b/melo/utils.py @@ -19,16 +19,10 @@ logger = logging.getLogger(__name__) -def fix_loudness(input, rate): - # 峰值归一化至 -1 dB - peak_normalized_audio = pyln.normalize.peak(input, -1.0) +def fix_loudness(data, rate): + peak_normalized_audio = pyln.normalize.peak(data, -1.0) - # 测量响度 - meter = pyln.Meter(rate) - loudness = meter.integrated_loudness(peak_normalized_audio) - - # 响度归一化至 -12 dB LUFS - return pyln.normalize.loudness(peak_normalized_audio, loudness, -12.0) + return peak_normalized_audio def get_text_for_tts_infer(text, language_str, hps, device, symbol_to_id=None): norm_text, phone, tone, word2ph = clean_text(text, language_str) From df784b7d2615358a30a3dbdb55566e8d386b8184 Mon Sep 17 00:00:00 2001 From: alastorid Date: Tue, 21 Jan 2025 09:09:40 -0500 Subject: [PATCH 3/3] Rework fix_loudness to normalize_loudness to resolve potential issues --- melo/api.py | 2 +- melo/utils.py | 25 +++++++++++++++++++------ requirements.txt | 1 - 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/melo/api.py b/melo/api.py index 567df0678..a3a65f574 100644 --- a/melo/api.py +++ b/melo/api.py @@ -122,7 +122,7 @@ def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_s )[0][0, 0].data.cpu().float().numpy() del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers # - audio_list.append(utils.fix_loudness(audio,self.hps.data.sampling_rate)) + audio_list.append(utils.normalize_loudness(audio)) torch.cuda.empty_cache() audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed) diff --git a/melo/utils.py b/melo/utils.py index 94ce92946..2186f6c45 100644 --- a/melo/utils.py +++ b/melo/utils.py @@ -13,16 +13,29 @@ from melo.text.cleaner import clean_text from melo import commons -import pyloudnorm as pyln - MATPLOTLIB_FLAG = False logger = logging.getLogger(__name__) -def fix_loudness(data, rate): - peak_normalized_audio = pyln.normalize.peak(data, -1.0) - - return peak_normalized_audio +def normalize_loudness(audio): + max_val = np.max(np.abs(audio)) + if max_val == 0: + return audio + scale_factor = (1 - 1e-6) / max_val + normalized_audio = audio * scale_factor + + if False: + print(f'--- normalize ---') + print(f"Before normalization:") + print(f" Min value: {np.min(audio)}") + print(f" Max value: {np.max(audio)}") + print(f" Length: {len(audio)}") + print(f"After normalization:") + print(f" Min value: {np.min(normalized_audio)}") + print(f" Max value: {np.max(normalized_audio)}") + print(f" Length: {len(normalized_audio)}") + + return normalized_audio def get_text_for_tts_infer(text, language_str, hps, device, symbol_to_id=None): norm_text, phone, tone, word2ph = clean_text(text, language_str) diff --git a/requirements.txt b/requirements.txt index 1cc2f0086..a79f61599 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,3 @@ langid==1.1.6 tqdm tensorboard==2.16.2 loguru==0.7.2 -pyloudnorm \ No newline at end of file