
Commit

add pretrained
babysor committed Feb 18, 2023
1 parent 3ce874a commit 5c17fc8
Showing 16 changed files with 21,908 additions and 123 deletions.
8 changes: 5 additions & 3 deletions .gitignore
@@ -14,11 +14,13 @@
*.bcf
*.toc
*.sh
data/ckpt
!data/ckpt/vocoder/pretrained/**
data/ckpt/*/*
!data/ckpt/encoder/pretrained.pt
!data/ckpt/vocoder/pretrained/
wavs
log
!/docker-entrypoint.sh
!/datasets_download/*.sh
/datasets
/datasets
monotonic_align/build
monotonic_align/monotonic_align
2 changes: 1 addition & 1 deletion .vscode/launch.json
@@ -53,7 +53,7 @@
"request": "launch",
"program": "train.py",
"console": "integratedTerminal",
"args": ["--type", "synth", "..\\audiodata\\SV2TTS\\synthesizer"]
"args": ["--type", "vits"]
},
{
"name": "Python: PPG Convert",
Binary file added data/ckpt/encoder/pretrained.pt
Binary file not shown.
31 changes: 31 additions & 0 deletions data/ckpt/vocoder/pretrained/config_16k.json
@@ -0,0 +1,31 @@
{
"resblock": "1",
"num_gpus": 0,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,

"upsample_rates": [5,5,4,2],
"upsample_kernel_sizes": [10,10,8,4],
"upsample_initial_channel": 512,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],

"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 1024,
"hop_size": 200,
"win_size": 800,

"sampling_rate": 16000,

"fmin": 0,
"fmax": 7600,
"fmax_for_loss": null,

"num_workers": 4
}
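
The upsampling factors in this HiFi-GAN config multiply to the STFT hop size (5 * 5 * 4 * 2 = 200), so one 80-bin mel frame is expanded back into 200 waveform samples at 16 kHz. A minimal sketch, not part of the commit, that loads the config and checks that invariant, assuming it is run from the repository root:

import json
from functools import reduce

with open("data/ckpt/vocoder/pretrained/config_16k.json") as f:
    h = json.load(f)

# The generator must upsample one mel frame into exactly hop_size samples.
total_upsample = reduce(lambda a, b: a * b, h["upsample_rates"])  # 5*5*4*2 = 200
assert total_upsample == h["hop_size"]
print(h["sampling_rate"] / h["hop_size"], "mel frames per second")  # 80.0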
Binary file added data/ckpt/vocoder/pretrained/g_hifigan.pt
Binary file not shown.
Binary file added data/ckpt/vocoder/pretrained/pretrained.pt
Binary file not shown.
31 changes: 28 additions & 3 deletions models/synthesizer/preprocess.py
@@ -6,7 +6,7 @@
from tqdm import tqdm
import numpy as np
from models.encoder import inference as encoder
from models.synthesizer.preprocess_audio import preprocess_general
from models.synthesizer.preprocess_audio import preprocess_general, extract_emo
from models.synthesizer.preprocess_transcript import preprocess_transcript_aishell3, preprocess_transcript_magicdata

data_info = {
@@ -41,7 +41,7 @@

def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
skip_existing: bool, hparams, no_alignments: bool,
dataset: str, emotion_extract = False):
dataset: str, emotion_extract = False, encoder_model_fpath=None):
dataset_info = data_info[dataset]
# Gather the input directories
dataset_root = datasets_root.joinpath(dataset)
@@ -77,7 +77,7 @@ def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int,
speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs))

func = partial(dataset_info["speak_func"], out_dir=out_dir, skip_existing=skip_existing,
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, emotion_extract=emotion_extract)
hparams=hparams, dict_info=dict_info, no_alignments=no_alignments, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, speaker_dirs)

for speaker_metadata in tqdm(job, dataset, len(speaker_dirs), unit="speakers"):
@@ -110,6 +110,13 @@ def embed_utterance(fpaths, encoder_model_fpath):
embed = encoder.embed_utterance(wav)
np.save(embed_fpath, embed, allow_pickle=False)

def _emo_extract_from_utterance(fpaths, hparams, skip_existing=False):
    wav_fpath, emo_fpath = fpaths
    # Skip utterances whose emotion embedding already exists on disk
    if skip_existing and emo_fpath.exists():
        return
    wav = np.load(wav_fpath)
    emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
    np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)

def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int):
wav_dir = synthesizer_root.joinpath("audio")
@@ -128,3 +135,21 @@ def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_proce
func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Embedding", len(fpaths), unit="utterances"))

def create_emo(synthesizer_root: Path, n_processes: int, skip_existing: bool, hparams):
wav_dir = synthesizer_root.joinpath("audio")
metadata_fpath = synthesizer_root.joinpath("train.txt")
assert wav_dir.exists() and metadata_fpath.exists()
emo_dir = synthesizer_root.joinpath("emo")
emo_dir.mkdir(exist_ok=True)

# Gather the input wav filepaths and the target output emotion filepaths
with metadata_fpath.open("r", encoding="utf-8") as metadata_file:
metadata = [line.split("|") for line in metadata_file]
fpaths = [(wav_dir.joinpath(m[0]), emo_dir.joinpath(m[0].replace("audio-", "emo-"))) for m in metadata]

# TODO: improve the multiprocessing; disk I/O is the bottleneck here.
# Extract emotion features for the utterances in separate processes
func = partial(_emo_extract_from_utterance, hparams=hparams, skip_existing=skip_existing)
job = Pool(n_processes).imap(func, fpaths)
list(tqdm(job, "Emo", len(fpaths), unit="utterances"))
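
A hypothetical driver call for this helper, not part of the commit: the dataset path is a placeholder, and the hparams import is an assumed location for the synthesizer hyperparameters.

from pathlib import Path
from models.synthesizer.hparams import hparams  # assumed module path, adjust to the repo layout
from models.synthesizer.preprocess import create_emo

# Extract emotion features for an already-preprocessed SV2TTS synthesizer folder.
create_emo(Path("data/SV2TTS/synthesizer"), n_processes=4,
           skip_existing=True, hparams=hparams)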
25 changes: 10 additions & 15 deletions models/synthesizer/preprocess_audio.py
@@ -13,7 +13,11 @@
from transformers import Wav2Vec2Processor
from .models.wav2emo import EmotionExtractorModel

SAMPLE_RATE = 16000
class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass

pinyin = Pinyin(PinyinConverter()).pinyin


# load model from hub
device = 'cuda' if torch.cuda.is_available() else "cpu"
@@ -40,14 +44,8 @@ def extract_emo(

return y

class PinyinConverter(NeutralToneWith5Mixin, DefaultConverter):
pass

pinyin = Pinyin(PinyinConverter()).pinyin


def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams, emotion_extract: bool):
skip_existing: bool, hparams, encoder_model_fpath):
## FOR REFERENCE:
# For you not to lose your head if you ever wish to change things here or implement your own
# synthesizer.
@@ -69,6 +67,8 @@ def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,

# Trim silence
if hparams.trim_silence:
if not encoder.is_loaded():
encoder.load_model(encoder_model_fpath)
wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

# Skip utterances that are too short
@@ -109,7 +109,7 @@ def _split_on_silences(wav_fpath, words, hparams):

return wav, res

def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, emotion_extract: bool):
def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams, dict_info, no_alignments: bool, encoder_model_fpath: Path):
metadata = []
extensions = ["*.wav", "*.flac", "*.mp3"]
for extension in extensions:
@@ -124,14 +124,9 @@ def preprocess_general(speaker_dir, out_dir: Path, skip_existing: bool, hparams,
sub_basename = "%s_%02d" % (wav_fpath.name, 0)
wav, text = _split_on_silences(wav_fpath, words, hparams)
result = _process_utterance(wav, text, out_dir, sub_basename,
skip_existing, hparams, emotion_extract)
skip_existing, hparams, encoder_model_fpath)
if result is None:
continue
wav_fpath_name, mel_fpath_name, embed_fpath_name, wav, mel_frames, text = result
emo_fpath = out_dir.joinpath("emo", "emo-%s.npy" % sub_basename)
skip_emo_extract = not emotion_extract or (skip_existing and emo_fpath.exists())
if not skip_emo_extract and wav is not None:
emo = extract_emo(np.expand_dims(wav, 0), hparams.sample_rate, True)
np.save(emo_fpath, emo.squeeze(0), allow_pickle=False)
metadata.append([wav_fpath_name, mel_fpath_name, embed_fpath_name, len(wav), mel_frames, text])
return [m for m in metadata if m is not None]
65 changes: 18 additions & 47 deletions models/synthesizer/vits_dataset.py
@@ -1,10 +1,11 @@
import os
import random
import numpy as np
import torch.nn.functional as F
import torch
import torch.utils.data

from utils.audio_utils import spectrogram1, load_wav_to_torch, spectrogram
from utils.audio_utils import load_wav_to_torch, spectrogram
from utils.util import intersperse
from models.synthesizer.utils.text import text_to_sequence

@@ -51,21 +52,10 @@ def _filter(self):
lengths = []

# for audiopath, sid, text in self.audio_metadata:
sid = 0
spk_to_sid = {}
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text in self.audio_metadata:
for wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid in self.audio_metadata:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
# TODO: for magic data only
speaker_name = wav_fpath.split("_")[1]
# # TODO: for ai data only
# speaker_name = wav_fpath.split("-")[1][6:9]
if speaker_name not in spk_to_sid:
sid += 1
spk_to_sid[speaker_name] = sid

audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spk_to_sid[speaker_name]])
audio_metadata_new.append([wav_fpath, mel_fpath, embed_path, wav_length, mel_frames, text, spkid])
lengths.append(os.path.getsize(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}') // (2 * self.hop_length))
print("found sid:%d", sid)
self.audio_metadata = audio_metadata_new
self.lengths = lengths

@@ -74,50 +64,31 @@ def get_audio_text_speaker_pair(self, audio_metadata):
wav_fpath, text, sid = audio_metadata[0], audio_metadata[5], audio_metadata[6]
text = self.get_text(text)

# TODO: add original audio data root for loading
file_name = wav_fpath.split("_00")[0].split('-')[1]
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}..{os.sep}..{os.sep}magicdata{os.sep}train{os.sep}{"_".join(file_name.split("_")[:2])}{os.sep}{file_name}')

# spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
spec, wav = self.get_audio(f'{self.datasets_root}{os.sep}audio{os.sep}{wav_fpath}')
sid = self.get_sid(sid)
emo = torch.FloatTensor(np.load(f'{self.datasets_root}{os.sep}emo{os.sep}{wav_fpath.replace("audio", "emo")}'))
return (text, spec, wav, sid, emo)

def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError("{} {} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram(audio_norm, self.filter_length, self.hop_length, self.win_length,
center=False)
# Load preprocessed wav npy instead of reading from wav file
audio = torch.FloatTensor(np.load(filename))
audio_norm = audio.unsqueeze(0)

spec_filename = filename.replace(".wav", ".spec")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram(audio_norm, self.filter_length,self.hop_length, self.win_length,
center=False)
torch.save(spec, spec_filename)
spec = torch.squeeze(spec, 0)
return spec, audio_norm

# print("Loading", filename)
# # audio = torch.FloatTensor(np.load(filename).astype(np.float32))
# audio = audio.unsqueeze(0)
# audio_norm = audio / self.max_wav_value
# audio_norm = audio_norm.unsqueeze(0)
# # spec_filename = filename.replace(".wav", ".spec.pt")
# # if os.path.exists(spec_filename):
# # spec = torch.load(spec_filename)
# # else:
# # spec = spectrogram(audio, self.filter_length,self.hop_length, self.win_length,
# # center=False)
# # spec = torch.squeeze(spec, 0)
# # torch.save(spec, spec_filename)
# spec = spectrogram(audio, self.filter_length, self.hop_length, self.win_length,
# center=False)
# spec = torch.squeeze(spec, 0)
# return spec, audio

def get_text(self, text):
if self.cleaned_text:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = intersperse(text_norm, 0)
text_norm = intersperse(text_norm, 0) # insert a 0 before and after every element of the text id sequence - not applicable to Chinese
text_norm = torch.LongTensor(text_norm)
return text_norm

@@ -188,7 +159,7 @@ def __call__(self, batch):
emo[i, :] = row[4]

if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing, emo
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, emo


19 changes: 19 additions & 0 deletions monotonic_align/__init__.py
@@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
""" Cython optimized version.
neg_cent: [b, t_t, t_s]
mask: [b, t_t, t_s]
"""
device = neg_cent.device
dtype = neg_cent.dtype
neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
path = np.zeros(neg_cent.shape, dtype=np.int32)

t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
maximum_path_c(path, neg_cent, t_t_max, t_s_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)
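
An illustrative call, assuming the Cython extension under monotonic_align has been built; shapes follow the docstring above, with a batch of one, five target frames, and three source positions.

import torch
from monotonic_align import maximum_path

neg_cent = torch.randn(1, 5, 3)   # [b, t_t, t_s] alignment scores
mask = torch.ones(1, 5, 3)        # all positions valid
path = maximum_path(neg_cent, mask)
# Each target frame is assigned exactly one source position, monotonically.
print(path.shape, path.sum(dim=2).squeeze())  # torch.Size([1, 5, 3]), a row of ones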