Skip to content

Commit

Permalink
Use VisualDL instead of tensorboardX; fix read_manifest
Browse files Browse the repository at this point in the history
  • Loading branch information
zh794390558 committed Nov 30, 2021
1 parent 05a6f77 commit 7554b61
Show file tree
Hide file tree
Showing 11 changed files with 80 additions and 36 deletions.
10 changes: 5 additions & 5 deletions paddlespeech/s2t/exps/u2/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,9 @@ def train_batch(self, batch_index, batch_data, msg):
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v,
self.iteration - 1)
for key, val in losses_np_v.items():
self.visualizer.add_scalar(tag='train/'+key, value=val, step=self.iteration-1)


@paddle.no_grad()
def valid(self):
Expand Down Expand Up @@ -237,9 +238,8 @@ def do_train(self):
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalars(
'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch)
self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)

self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch()
Expand Down
10 changes: 5 additions & 5 deletions paddlespeech/s2t/exps/u2_kaldi/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ def train_batch(self, batch_index, batch_data, msg):
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v,
self.iteration - 1)
for key, val in losses_np_v.items():
self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)

@paddle.no_grad()
def valid(self):
Expand Down Expand Up @@ -222,9 +222,9 @@ def do_train(self):
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalars(
'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch)
self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)

self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch()

Expand Down
10 changes: 5 additions & 5 deletions paddlespeech/s2t/exps/u2_st/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ def train_batch(self, batch_index, batch_data, msg):
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
losses_np_v.update({"lr": self.lr_scheduler()})
self.visualizer.add_scalars("step", losses_np_v,
self.iteration - 1)
for key, val in losses_np_v.items():
self.visualizer.add_scalar(tag="train/"+key, value=val, step=self.iteration - 1)

@paddle.no_grad()
def valid(self):
Expand Down Expand Up @@ -235,9 +235,9 @@ def do_train(self):
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalars(
'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch)
self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)

self.save(tag=self.epoch, infos={'val_loss': cv_loss})
self.new_epoch()

Expand Down
27 changes: 24 additions & 3 deletions paddlespeech/s2t/frontend/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,36 @@

import numpy as np
import paddle
import jsonlines
from paddle.io import DataLoader
from paddle.io import Dataset

from paddlespeech.s2t.frontend.audio import AudioSegment
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.utils.log import Log

__all__ = ["FeatureNormalizer"]

logger = Log(__name__).getlog()


def read_manifest(manifest_path):
    """Read every record of a manifest file into a list.

    Args:
        manifest_path: Path of the manifest file; it is opened for reading
            with ``jsonlines.open``.

    Returns:
        list: The records yielded by the jsonlines reader, in file order.

    Note:
        Nothing is caught here, so any error raised while opening or
        decoding the file propagates to the caller unchanged.
    """
    with jsonlines.open(manifest_path, 'r') as reader:
        # Drain the reader before the context manager closes the file.
        return list(reader)

# https://github.com/PaddlePaddle/Paddle/pull/31481
class CollateFunc(object):
def __init__(self, feature_func):
Expand Down Expand Up @@ -61,7 +78,11 @@ def __call__(self, batch):
class AudioDataset(Dataset):
def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0):
self._rng = rng if rng else np.random.RandomState(random_seed)
manifest = read_manifest(manifest_path)
manifest = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest.append(json_data)

if num_samples == -1:
sampled_manifest = manifest
else:
Expand Down
22 changes: 20 additions & 2 deletions paddlespeech/s2t/frontend/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,26 @@ def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
return char_list


def read_manifest(
def read_manifest(manifest_path):
    """Load all entries of a manifest file.

    Args:
        manifest_path: Location of the manifest; opened in read mode through
            ``jsonlines.open``.

    Returns:
        list: Every entry produced by the jsonlines reader, preserving the
        order in which the reader yields them.

    Note:
        Errors raised while opening or decoding the file are not handled
        here and reach the caller as-is.
    """
    entries = []
    with jsonlines.open(manifest_path, 'r') as reader:
        entries.extend(reader)
    return entries


def read_manifest_filter(
manifest_path,
max_input_len=float('inf'),
min_input_len=0.0,
Expand Down Expand Up @@ -98,7 +117,6 @@ def read_manifest(
Returns:
List[dict]: Manifest parsing results.
"""

manifest = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
Expand Down
2 changes: 1 addition & 1 deletion paddlespeech/s2t/io/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(self,
super().__init__()

# read manifest
self._manifest = read_manifest(
self._manifest = read_manifest_filter(
manifest_path=manifest_path,
max_input_len=max_input_len,
min_input_len=min_input_len,
Expand Down
9 changes: 4 additions & 5 deletions paddlespeech/s2t/training/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import paddle
from paddle import distributed as dist
from tensorboardX import SummaryWriter
from visualdl import LogWriter

from paddlespeech.s2t.training.reporter import ObsScope
from paddlespeech.s2t.training.reporter import report
Expand Down Expand Up @@ -309,9 +309,8 @@ def do_train(self):
logger.info(
'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
if self.visualizer:
self.visualizer.add_scalars(
'epoch', {'cv_loss': cv_loss,
'lr': self.lr_scheduler()}, self.epoch)
self.visualizer.add_scalar(tag='eval/cv_loss', value=cv_loss, step=self.epoch)
self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)

# after epoch
self.save(tag=self.epoch, infos={'val_loss': cv_loss})
Expand Down Expand Up @@ -427,7 +426,7 @@ def setup_visualizer(self):
unexpected behaviors.
"""
# visualizer
visualizer = SummaryWriter(logdir=str(self.visual_dir))
visualizer = LogWriter(logdir=str(self.visual_dir))
self.visualizer = visualizer

@mp_tools.rank_zero_only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from speechtask.punctuation_restoration.utils import layer_tools
from speechtask.punctuation_restoration.utils import mp_tools
from speechtask.punctuation_restoration.utils.checkpoint import Checkpoint
from tensorboardX import SummaryWriter
from visualdl import LogWriter

__all__ = ["Trainer", "Tester"]

Expand Down Expand Up @@ -252,10 +252,8 @@ def train(self):
self.logger.info("Epoch {} Val info val_loss {}, F1_score {}".
format(self.epoch, total_loss, F1_score))
if self.visualizer:
self.visualizer.add_scalars("epoch", {
"total_loss": total_loss,
"lr": self.lr_scheduler()
}, self.epoch)
# Plot the same validation loss that is logged and saved for this epoch.
self.visualizer.add_scalar(tag='eval/cv_loss', value=total_loss, step=self.epoch)
self.visualizer.add_scalar(tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)

self.save(
tag=self.epoch, infos={"val_loss": total_loss,
Expand Down Expand Up @@ -341,7 +339,7 @@ def setup_visualizer(self):
unexpected behaviors.
"""
# visualizer
visualizer = SummaryWriter(logdir=str(self.output_dir))
visualizer = LogWriter(logdir=str(self.output_dir))
self.visualizer = visualizer

@mp_tools.rank_zero_only
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ snakeviz
soundfile~=0.10
sox
soxbindings
tensorboardX
textgrid
timer
tqdm
Expand Down
14 changes: 11 additions & 3 deletions utils/build_vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
import functools
import os
import tempfile
import jsonlines
from collections import Counter

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
Expand Down Expand Up @@ -59,13 +59,21 @@


def count_manifest(counter, text_feature, manifest_path):
    """Add the tokens of every manifest entry's ``text`` field to ``counter``.

    Args:
        counter: Object updated in place via ``counter.update(tokens)``.
        text_feature: Tokenizer exposing
            ``tokenize(text, replace_space=False)``.
        manifest_path: Path of the manifest file, read with ``jsonlines``.
    """
    # Read the manifest exactly once. It is fully materialised before any
    # counting so a decoding error cannot leave ``counter`` half-updated.
    with jsonlines.open(manifest_path, 'r') as reader:
        manifest_jsons = list(reader)

    for line_json in manifest_jsons:
        tokens = text_feature.tokenize(line_json['text'], replace_space=False)
        counter.update(tokens)

def dump_text_manifest(fileobj, manifest_path, key='text'):
    """Write one field of every manifest entry to ``fileobj``, one per line.

    Args:
        fileobj: Writable text object; receives ``entry[key] + "\\n"`` for
            each manifest entry.
        manifest_path: Path of the manifest file, read with ``jsonlines``.
        key: Name of the entry field to dump. Defaults to ``'text'``.
    """
    # Read the manifest exactly once. It is fully materialised before
    # writing so a decoding error cannot leave a partially written file.
    with jsonlines.open(manifest_path, 'r') as reader:
        manifest_jsons = list(reader)

    for line_json in manifest_jsons:
        fileobj.write(line_json[key] + "\n")

Expand Down
1 change: 1 addition & 0 deletions utils/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def read_manifest(manifest_path):
for json_line in open(manifest_path, 'r'):
try:
json_data = json.loads(json_line)
manifest.append(json_data)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
return manifest
Expand Down

0 comments on commit 7554b61

Please sign in to comment.