using visualdl; fix read_manifest

PaddlePaddle · Nov 30, 2021 · 7554b61 · 7554b61
1 parent 05a6f77
commit 7554b61
Show file tree

Hide file tree

Showing 11 changed files with 80 additions and 36 deletions.
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
@@ -128,8 +128,9 @@ def train_batch(self, batch_index, batch_data, msg):
  if dist.get_rank() == 0 and self.visualizer:
  losses_np_v = losses_np.copy()
  losses_np_v.update({"lr": self.lr_scheduler()})
- self.visualizer.add_scalars("step", losses_np_v,
- self.iteration - 1)
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1)
+
 
  @paddle.no_grad()
  def valid(self):
@@ -237,9 +238,8 @@ def do_train(self):
  logger.info(
  'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
  if self.visualizer:
- self.visualizer.add_scalars(
- 'epoch', {'cv_loss': cv_loss,
- 'lr': self.lr_scheduler()}, self.epoch)
+ self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
  self.save(tag=self.epoch, infos={'val_loss': cv_loss})
  self.new_epoch()

diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -131,8 +131,8 @@ def train_batch(self, batch_index, batch_data, msg):
  if dist.get_rank() == 0 and self.visualizer:
  losses_np_v = losses_np.copy()
  losses_np_v.update({"lr": self.lr_scheduler()})
- self.visualizer.add_scalars("step", losses_np_v,
-  self.iteration - 1)
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
 
  @paddle.no_grad()
  def valid(self):
@@ -222,9 +222,9 @@ def do_train(self):
  logger.info(
  'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
  if self.visualizer:
- self.visualizer.add_scalars(
-  'epoch', {'cv_loss': cv_loss,
-  'lr': self.lr_scheduler()}, self.epoch)
+ self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
  self.save(tag=self.epoch, infos={'val_loss': cv_loss})
  self.new_epoch()
 

diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
@@ -138,8 +138,8 @@ def train_batch(self, batch_index, batch_data, msg):
  if dist.get_rank() == 0 and self.visualizer:
  losses_np_v = losses_np.copy()
  losses_np_v.update({"lr": self.lr_scheduler()})
- self.visualizer.add_scalars("step", losses_np_v,
-  self.iteration - 1)
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)
 
  @paddle.no_grad()
  def valid(self):
@@ -235,9 +235,9 @@ def do_train(self):
  logger.info(
  'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
  if self.visualizer:
- self.visualizer.add_scalars(
-  'epoch', {'cv_loss': cv_loss,
-  'lr': self.lr_scheduler()}, self.epoch)
+ self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
  self.save(tag=self.epoch, infos={'val_loss': cv_loss})
  self.new_epoch()
 

diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py
@@ -16,19 +16,36 @@
 
 import numpy as np
 import paddle
+import jsonlines
 from paddle.io import DataLoader
 from paddle.io import Dataset
 
 from paddlespeech.s2t.frontend.audio import AudioSegment
 from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.utils.log import Log
 
 __all__ = ["FeatureNormalizer"]
 
 logger = Log(__name__).getlog()
 
-
+def read_manifest(manifest_path):
+ """Read every record of a JSON-lines manifest file into a list.
+ 
+ Args:
+ manifest_path (str): Path handed to ``jsonlines.open`` (presumably a str path -- TODO confirm).
+ Raises:
+ IOError: If the manifest cannot be read (exact type for malformed lines depends on jsonlines -- TODO confirm).
+ 
+ Returns:
+ List[dict]: The records yielded by the reader, in file order, unfiltered.
+ """
+
+ manifest = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest.append(json_data)
+ return manifest
+
 # https://github.com/PaddlePaddle/Paddle/pull/31481
 class CollateFunc(object):
  def __init__(self, feature_func):
@@ -61,7 +78,11 @@ def __call__(self, batch):
 class AudioDataset(Dataset):
  def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
  self._rng = rng if rng else np.random.RandomState(random_seed)
- manifest = read_manifest(manifest_path)
+ manifest = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest.append(json_data)
+
  if num_samples == -1:
  sampled_manifest = manifest
  else:

diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py
@@ -65,7 +65,26 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
  return char_list
 
 
-def read_manifest(
+def read_manifest(manifest_path):
+ """Read every record of a JSON-lines manifest file into a list.
+
+ Args:
+ manifest_path (str): Path handed to ``jsonlines.open`` (presumably a str path -- TODO confirm).
+
+ Raises:
+ IOError: If the manifest cannot be read (exact type for malformed lines depends on jsonlines -- TODO confirm).
+
+ Returns:
+ List[dict]: The records yielded by the reader, in file order, unfiltered.
+ """
+ manifest = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest.append(json_data)
+ return manifest
+
+
+def read_manifest_filter(
  manifest_path,
  max_input_len=float('inf'),
  min_input_len=0.0,
@@ -98,7 +117,6 @@ def read_manifest(
  Returns:
  List[dict]: Manifest parsing results.
  """
-
  manifest = []
  with jsonlines.open(manifest_path, 'r') as reader:
  for json_data in reader:

diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
@@ -95,7 +95,7 @@ def __init__(self,
  super().__init__()
 
  # read manifest
- self._manifest = read_manifest(
+ self._manifest = read_manifest_filter(
  manifest_path=manifest_path,
  max_input_len=max_input_len,
  min_input_len=min_input_len,

diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
@@ -19,7 +19,7 @@
 
 import paddle
 from paddle import distributed as dist
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 from paddlespeech.s2t.training.reporter import ObsScope
 from paddlespeech.s2t.training.reporter import report
@@ -309,9 +309,8 @@ def do_train(self):
  logger.info(
  'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
  if self.visualizer:
- self.visualizer.add_scalars(
- 'epoch', {'cv_loss': cv_loss,
- 'lr': self.lr_scheduler()}, self.epoch)
+ self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
  # after epoch
  self.save(tag=self.epoch, infos={'val_loss': cv_loss})
@@ -427,7 +426,7 @@ def setup_visualizer(self):
  unexpected behaviors.
  """
  # visualizer
- visualizer = SummaryWriter(logdir=str(self.visual_dir))
+ visualizer = LogWriter(logdir=str(self.visual_dir))
  self.visualizer = visualizer
 
  @mp_tools.rank_zero_only

diff --git a/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py b/paddlespeech/text/speechtask/punctuation_restoration/training/trainer.py
@@ -34,7 +34,7 @@
 from speechtask.punctuation_restoration.utils import layer_tools
 from speechtask.punctuation_restoration.utils import mp_tools
 from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint
-from tensorboardX import SummaryWriter
+from visualdl import LogWriter
 
 __all__ = ["Trainer", "Tester"]
 
@@ -252,10 +252,8 @@ def train(self):
  self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
  format(self.epoch, total_loss, F1_score))
  if self.visualizer:
- self.visualizer.add_scalars("epoch", {
- "total_loss": total_loss,
- "lr": self.lr_scheduler()
- }, self.epoch)
+ self.visualizer.add_scalar(tag='eval/cv_loss', value=total_loss, step=self.epoch)
+ self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
 
  self.save(
  tag=self.epoch, infos={"val_loss": total_loss,
@@ -341,7 +339,7 @@ def setup_visualizer(self):
  unexpected behaviors.
  """
  # visualizer
- visualizer = SummaryWriter(logdir=str(self.output_dir))
+ visualizer = LogWriter(logdir=str(self.output_dir))
  self.visualizer = visualizer
 
  @mp_tools.rank_zero_only

diff --git a/requirements.txt b/requirements.txt
@@ -40,7 +40,6 @@ snakeviz
 soundfile~=0.10
 sox
 soxbindings
-tensorboardX
 textgrid
 timer
 tqdm

diff --git a/utils/build_vocab.py b/utils/build_vocab.py
@@ -19,11 +19,11 @@
 import functools
 import os
 import tempfile
+import jsonlines
 from collections import Counter
 
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.frontend.utility import BLANK
-from paddlespeech.s2t.frontend.utility import read_manifest
 from paddlespeech.s2t.frontend.utility import SOS
 from paddlespeech.s2t.frontend.utility import SPACE
 from paddlespeech.s2t.frontend.utility import UNK
@@ -59,13 +59,21 @@
 
 
 def count_manifest(counter, text_feature, manifest_path):
- manifest_jsons = read_manifest(manifest_path)
+ manifest_jsons = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest_jsons.append(json_data)
+
  for line_json in manifest_jsons:
  line = text_feature.tokenize(line_json['text'], replace_space=False)
  counter.update(line)
 
 def dump_text_manifest(fileobj, manifest_path, key='text'):
- manifest_jsons = read_manifest(manifest_path)
+ manifest_jsons = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest_jsons.append(json_data)
+
  for line_json in manifest_jsons:
  fileobj.write(line_json[key] + "\n")
 

diff --git a/utils/utility.py b/utils/utility.py
@@ -42,6 +42,7 @@ def read_manifest(manifest_path):
  for json_line in open(manifest_path, 'r'):
  try:
  json_data = json.loads(json_line)
+ manifest.append(json_data)
  except Exception as e:
  raise IOError("Error reading manifest: %s" % str(e))
  return manifest