def normalize_loudness(audio, peak=1 - 1e-6):
    """Peak-normalize a waveform so its largest absolute sample equals *peak*.

    Despite the name, this is plain peak normalization: every sample is
    multiplied by the same factor, chosen so that ``max(|audio|)`` lands on
    ``peak``. No perceptual loudness (RMS/LUFS) measurement is involved.

    Args:
        audio: NumPy array of audio samples.
        peak: Target absolute peak. The default sits just below 1.0 so the
            scaled signal never reaches full scale exactly.

    Returns:
        A new array holding the rescaled samples. When ``audio`` is empty or
        entirely zero there is nothing to scale, and the input object itself
        is returned unchanged.
    """
    # np.max raises ValueError on a zero-size array, so bail out first.
    if np.size(audio) == 0:
        return audio

    max_val = np.max(np.abs(audio))
    # Pure silence: avoid dividing by zero and hand the input back as-is.
    if max_val == 0:
        return audio

    return audio * (peak / max_val)