
Commit

add pretrained
babysor committed Feb 18, 2023
1 parent 3ce874a commit 5c17fc8
Showing 16 changed files with 21,908 additions and 123 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -14,11 +14,13 @@
*.bcf
*.toc
*.sh
data/ckpt
!data/ckpt/vocoder/pretrained/**
data/ckpt/*/*
!data/ckpt/encoder/pretrained.pt
!data/ckpt/vocoder/pretrained/
wavs
log
!/docker-entrypoint.sh
!/datasets_download/*.sh
/datasets
/datasets
monotonic_align/build
monotonic_align/monotonic_align
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -53,7 +53,7 @@
"request": "launch",
"program": "train.py",
"console": "integratedTerminal",
"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
"args": ["--type", "vits"]
},
{
"name": "Python: PPG Convert",
Binary file added data/ckpt/encoder/pretrained.pt
Binary file not shown.
31 changes: 31 additions & 0 deletions data/ckpt/vocoder/pretrained/config_16k.json
@@ -0,0 +1,31 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,

"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,

"sampling_rate": 16000,

"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,

"num_workers": 4
}
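
The upsampling factors in this HiFi-GAN config multiply to the STFT hop size (5 * 5 * 4 * 2 = 200), so one 80-bin mel frame is expanded back into 200 waveform samples at 16 kHz. A minimal sketch, not part of the commit, that loads the config and checks that invariant, assuming it is run from the repository root:

import json
from functools import reduce

with open("data/ckpt/vocoder/pretrained/config_16k.json") as f:
    h = json.load(f)

# The generator must upsample one mel frame into exactly hop_size samples.
total_upsample = reduce(lambda a, b: a * b, h["upsample_rates"])  # 5*5*4*2 = 200
assert total_upsample == h["hop_size"]
print(h["sampling_rate"] / h["hop_size"], "mel frames per second")  # 80.0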
Binary file added data/ckpt/vocoder/pretrained/g_hifigan.pt
Binary file not shown.
Binary file added data/ckpt/vocoder/pretrained/pretrained.pt
Binary file not shown.
31 changes: 28 additions & 3 deletions models/synthesizer/preprocess.py
@@ -6,7 +6,7 @@
from tqdm import tqdm
import numpy as np
from models.encoder import inference as encoder
from models.synthesizer.preprocess_audio import preprocess_general
from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

data_info = {
@@ -41,7 +41,7 @@

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str, emotion_extract = False):
dataset: str, emotion_extract = False, encoder_model_fpath=None):
dataset_info = data_info[dataset]
# Gather the input directories
dataset_root = datasets_root.joinpath(dataset)
@@ -77,7 +77,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))

func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, emotion_extract=emotion_extract)
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, speaker_dirs)

for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
@@ -110,6 +110,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)

def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    wav_fpath, emo_fpath = fpaths
    # Skip utterances whose emotion embedding already exists on disk
    if skip_existing and emo_fpath.exists():
        return
    wav = np.load(wav_fpath)
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)

def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
wav_dir = synthesizer_root.joinpath("audio")
@@ -128,3 +135,21 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
emo_dir = synthesizer_root.joinpath("emo")
emo_dir.mkdir(exist_ok=True)

# Gather the input wav filepaths and the target output emotion filepaths
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]

# TODO: improve the multiprocessing; disk I/O is the bottleneck here.
# Extract emotion features for the utterances in separate processes
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
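
A hypothetical driver call for this helper, not part of the commit: the dataset path is a placeholder, and the hparams import is an assumed location for the synthesizer hyperparameters.

from pathlib import Path
from models.synthesizer.hparams import hparams  # assumed module path, adjust to the repo layout
from models.synthesizer.preprocess import create_emo

# Extract emotion features for an already-preprocessed SV2TTS synthesizer folder.
create_emo(Path("data/SV2TTS/synthesizer"), n_processes=4,
           skip_existing=True, hparams=hparams)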
25 changes: 10 additions & 15 deletions models/synthesizer/preprocess_audio.py
@@ -13,7 +13,11 @@
from transformers import Wav2Vec2Processor
from .models.wav2emo import EmotionExtractorModel

SAMPLE_RATE = 16000
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass

pinyin = Pinyin(PinyinConverter()).pinyin


# load model from hub
device = 'cuda' if torch.cuda.is_available() else "cpu"
@@ -40,14 +44,8 @@ def extract_emo(

return y

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass

pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, emotion_extract: bool):
skip_existing: bool, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -69,6 +67,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,

# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

# Skip utterances that are too short
@@ -109,7 +109,7 @@ def _split_on_silences(wav_fpath, words, hparams):

return wav, res

def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
@@ -124,14 +124,9 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, emotion_extract)
skip_existing, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
if not skip_emo_extract and wav is not None:
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
65 changes: 18 additions & 47 deletions models/synthesizer/vits_dataset.py
@@ -1,10 +1,11 @@
import os
import random
import numpy as np
import torch.nn.functional as F
import torch
import torch.utils.data

from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
from utils.audio_utils import load_wav_to_torch, spectrogram
from utils.util import intersperse
from models.synthesizer.utils.text import text_to_sequence

@@ -51,21 +52,10 @@ def _filter(self):
lengths = []

# for audiopath, sid, text in self.audio_metadata:
sid = 0
spk_to_sid = {}
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text in self.audio_metadata:
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
# TODO: for magic data only
speaker_name = wav_fpath.split("_")[1]
# # TODO: for ai data only
# speaker_name = wav_fpath.split("-")[1][6:9]
if speaker_name not in spk_to_sid:
sid += 1
spk_to_sid[speaker_name] = sid

audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spk_to_sid[speaker_name]])
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
print("found sid:%d", sid)
self.audio_metadata = audio_metadata_new
self.lengths = lengths

@@ -74,50 +64,31 @@ def get_audio_text_speaker_pair(self, audio_metadata):
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
text = self.get_text(text)

# TODO: add original audio data root for loading
file_name = wav_fpath.split("_00")[0].split('-')[1]
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')

# spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
sid = self.get_sid(sid)
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
return (text, spec, wav, sid, emo)

def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError("{} {} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
center=False)
# Load preprocessed wav npy instead of reading from wav file
audio = torch.FloatTensor(np.load(filename))
audio_norm = audio.unsqueeze(0)

spec_filename = filename.replace(".wav", ".spec")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram(audio_norm, self.filter_length,self.hop_length, self.win_length,
center=False)
torch.save(spec, spec_filename)
spec = torch.squeeze(spec, 0)
return spec, audio_norm

# print("Loading", filename)
# # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
# audio = audio.unsqueeze(0)
# audio_norm = audio / self.max_wav_value
# audio_norm = audio_norm.unsqueeze(0)
# # spec_filename = filename.replace(".wav", ".spec.pt")
# # if os.path.exists(spec_filename):
# # spec = torch.load(spec_filename)
# # else:
# # spec = spectrogram(audio, self.filter_length,self.hop_length, self.win_length,
# # center=False)
# # spec = torch.squeeze(spec, 0)
# # torch.save(spec, spec_filename)
# spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
# center=False)
# spec = torch.squeeze(spec, 0)
# return spec, audio

def get_text(self, text):
if self.cleaned_text:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = intersperse(text_norm, 0)
text_norm = intersperse(text_norm, 0) # insert a 0 before and after every element of the text id sequence - not applicable to Chinese
text_norm = torch.LongTensor(text_norm)
return text_norm

@@ -188,7 +159,7 @@ def __call__(self, batch):
emo[i, :] = row[4]

if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo


19 changes: 19 additions & 0 deletions monotonic_align/__init__.py
@@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
""" Cython optimized version.
neg_cent: [b, t_t, t_s]
mask: [b, t_t, t_s]
"""
device = neg_cent.device
dtype = neg_cent.dtype
neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
path = np.zeros(neg_cent.shape, dtype=np.int32)

t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
maximum_path_c(path, neg_cent, t_t_max, t_s_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)
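
An illustrative call, assuming the Cython extension under monotonic_align has been built; shapes follow the docstring above, with a batch of one, five target frames, and three source positions.

import torch
from monotonic_align import maximum_path

neg_cent = torch.randn(1, 5, 3)   # [b, t_t, t_s] alignment scores
mask = torch.ones(1, 5, 3)        # all positions valid
path = maximum_path(neg_cent, mask)
# Each target frame is assigned exactly one source position, monotonically.
print(path.shape, path.sum(dim=2).squeeze())  # torch.Size([1, 5, 3]), a row of ones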