From 535bb2cedf69d00649fa5eb38b6f78e3c28a9f7f Mon Sep 17 00:00:00 2001 From: "Z. Stanley Guan" Date: Wed, 10 Oct 2018 23:17:50 -0400 Subject: [PATCH] [scripts] Enhancements & minor bugfix to segmentation postprocessing (#2776) --- .../segmentation/detect_speech_activity.sh | 11 +- .../segmentation/internal/sad_to_segments.py | 126 ++++++++++++++---- .../post_process_sad_to_segments.sh | 3 + 3 files changed, 115 insertions(+), 25 deletions(-) diff --git a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh index f71a14aebf1..831283bb5ec 100755 --- a/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh +++ b/egs/wsj/s5/steps/segmentation/detect_speech_activity.sh @@ -56,7 +56,15 @@ acwt=0.3 # e.g. --speech-in-sil-weight=0.0 --garbage-in-sil-weight=0.0 --sil-in-speech-weight=0.0 --garbage-in-speech-weight=0.3 transform_probs_opts="" +# Postprocessing options segment_padding=0.2 # Duration (in seconds) of padding added to segments +min_segment_dur=0 # Minimum duration (in seconds) required for a segment to be included + # This is before any padding. Segments shorter than this duration will be removed. + # This is an alternative to --min-speech-duration above. +merge_consecutive_max_dur=0 # Merge consecutive segments as long as the merged segment is no longer than this many + # seconds. The segments are only merged if their boundaries are touching. + # This is after padding by --segment-padding seconds. + # 0 means do not merge. Use 'inf' to not limit the duration. echo $* @@ -225,7 +233,8 @@ fi if [ $stage -le 7 ]; then steps/segmentation/post_process_sad_to_segments.sh \ - --segment-padding $segment_padding \ + --segment-padding $segment_padding --min-segment-dur $min_segment_dur \ + --merge-consecutive-max-dur $merge_consecutive_max_dur \ --cmd "$cmd" --frame-shift $(perl -e "print $frame_subsampling_factor * $frame_shift") \ ${test_data_dir} ${seg_dir} ${seg_dir} fi diff --git a/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py b/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py index 9b1c0f12b9a..cf19f9bbfb3 100755 --- a/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py +++ b/egs/wsj/s5/steps/segmentation/internal/sad_to_segments.py @@ -1,6 +1,7 @@ #!/usr/bin/env python # Copyright 2017 Vimal Manohar +# 2018 Capital One (Author: Zhiyuan Guan) # Apache 2.0 """ @@ -29,6 +30,7 @@ global_verbose = 0 + def get_args(): parser = argparse.ArgumentParser( description=""" @@ -44,18 +46,31 @@ def get_args(): parser.add_argument("--utt2dur", type=str, help="File containing durations of utterances.") + parser.add_argument("--frame-shift", type=float, default=0.01, help="Frame shift to convert frame indexes to time") parser.add_argument("--segment-padding", type=float, default=0.2, help="Additional padding on speech segments. But we " - "ensure that the padding does not go beyond the " - "adjacent segment.") + "ensure that the padding does not go beyond the " + "adjacent segment.") + parser.add_argument("--min-segment-dur", type=float, default=0, + help="Minimum duration (in seconds) required for a segment " + "to be included. This is before any padding. Segments " + "shorter than this duration will be removed.") + + parser.add_argument("--merge-consecutive-max-dur", type=float, default=0, + help="Merge consecutive segments as long as the merged " + "segment is no longer than this many seconds. The segments " + "are only merged if their boundaries are touching. " + "This is after padding by --segment-padding seconds." + "0 means do not merge. Use 'inf' to not limit the duration.") parser.add_argument("in_sad", type=str, help="Input file containing alignments in " - "text archive format") + "text archive format") + parser.add_argument("out_segments", type=str, help="Output kaldi segments file") @@ -80,28 +95,45 @@ def to_str(segment): class SegmenterStats(object): """Stores stats about the post-process stages""" + def __init__(self): - self.num_segments = 0 + self.num_segments_initial = 0 + self.num_short_segments_filtered = 0 + self.num_merges = 0 + self.num_segments_final = 0 self.initial_duration = 0.0 self.padding_duration = 0.0 + self.filter_short_duration = 0.0 self.final_duration = 0.0 def add(self, other): """Adds stats from another object""" - self.num_segments += other.num_segments + self.num_segments_initial += other.num_segments_initial + self.num_short_segments_filtered += other.num_short_segments_filtered + self.num_merges += other.num_merges + self.num_segments_final += other.num_segments_final self.initial_duration += other.initial_duration - self.padding_duration = other.padding_duration - self.final_duration = other.final_duration + self.filter_short_duration += other.filter_short_duration + self.padding_duration += other.padding_duration + self.final_duration += other.final_duration def __str__(self): - return ("num-segments={num_segments}, " + return ("num-segments-initial={num_segments_initial}, " + "num-short-segments-filtered={num_short_segments_filtered}, " + "num-merges={num_merges}, " + "num-segments-final={num_segments_final}, " "initial-duration={initial_duration}, " + "filter-short-duration={filter_short_duration}, " "padding-duration={padding_duration}, " "final-duration={final_duration}".format( - num_segments=self.num_segments, - initial_duration=self.initial_duration, - padding_duration=self.padding_duration, - final_duration=self.final_duration)) + num_segments_initial=self.num_segments_initial, + num_short_segments_filtered=self.num_short_segments_filtered, + num_merges=self.num_merges, + num_segments_final=self.num_segments_final, + initial_duration=self.initial_duration, + filter_short_duration=self.filter_short_duration, + padding_duration=self.padding_duration, + final_duration=self.final_duration)) def process_label(text_label): @@ -114,13 +146,14 @@ def process_label(text_label): prev_label = int(text_label) if prev_label not in [1, 2]: raise ValueError("Expecting label to 1 (non-speech) or 2 (speech); " - "got {0}".format(prev_label)) + "got {}".format(prev_label)) return prev_label class Segmentation(object): """Stores segmentation for an utterances""" + def __init__(self): self.segments = None self.stats = SegmenterStats() @@ -143,8 +176,8 @@ def initialize_segments(self, alignment, frame_shift=0.01): float(i) * frame_shift, prev_label]) prev_label = process_label(text_label) - prev_length = 0 self.stats.initial_duration += (prev_length * frame_shift) + prev_length = 0 elif prev_label is None: prev_label = process_label(text_label) @@ -156,7 +189,27 @@ def initialize_segments(self, alignment, frame_shift=0.01): float(len(alignment)) * frame_shift, prev_label]) self.stats.initial_duration += (prev_length * frame_shift) - self.stats.num_segments = len(self.segments) + self.stats.num_segments_initial = len(self.segments) + self.stats.num_segments_final = len(self.segments) + self.stats.final_duration = self.stats.initial_duration + + def filter_short_segments(self, min_dur): + """Filters out segments with durations shorter than 'min_dur'.""" + if min_dur <= 0: + return + + segments_kept = [] + for segment in self.segments: + assert segment[2] == 2, segment + dur = segment[1] - segment[0] + if dur < min_dur: + self.stats.filter_short_duration += dur + self.stats.num_short_segments_filtered += 1 + else: + segments_kept.append(segment) + self.segments = segments_kept + self.stats.num_segments_final = len(self.segments) + self.stats.final_duration -= self.stats.filter_short_duration def pad_speech_segments(self, segment_padding, max_duration=float("inf")): """Pads segments by duration 'segment_padding' on either sides, but @@ -166,19 +219,19 @@ def pad_speech_segments(self, segment_padding, max_duration=float("inf")): max_duration = float("inf") for i, segment in enumerate(self.segments): assert segment[2] == 2, segment - segment[0] -= segment_padding # try adding padding on the left side + segment[0] -= segment_padding # try adding padding on the left side self.stats.padding_duration += segment_padding if segment[0] < 0.0: # Padding takes the segment start to before the beginning of the utterance. # Reduce padding. self.stats.padding_duration += segment[0] segment[0] = 0.0 - if i >= 1 and self.segments[i-1][1] > segment[0]: + if i >= 1 and self.segments[i - 1][1] > segment[0]: # Padding takes the segment start to before the end the previous segment. # Reduce padding. self.stats.padding_duration -= ( - self.segments[i-1][1] - segment[0]) - segment[0] = self.segments[i-1][1] + self.segments[i - 1][1] - segment[0]) + segment[0] = self.segments[i - 1][1] segment[1] += segment_padding self.stats.padding_duration += segment_padding @@ -188,12 +241,35 @@ def pad_speech_segments(self, segment_padding, max_duration=float("inf")): self.stats.padding_duration -= (segment[1] - max_duration) segment[1] = max_duration if (i + 1 < len(self.segments) - and segment[1] > self.segments[i+1][0]): + and segment[1] > self.segments[i + 1][0]): # Padding takes the segment end beyond the start of the next segment. # Reduce padding. self.stats.padding_duration -= ( - segment[1] - self.segments[i+1][0]) - segment[1] = self.segments[i+1][0] + segment[1] - self.segments[i + 1][0]) + segment[1] = self.segments[i + 1][0] + self.stats.final_duration += self.stats.padding_duration + + def merge_consecutive_segments(self, max_dur): + """Merge consecutive segments (happens after padding), provided that + the merged segment is no longer than 'max_dur'.""" + if max_dur <= 0 or not self.segments: + return + + merged_segments = [self.segments[0]] + for segment in self.segments[1:]: + assert segment[2] == 2, segment + if segment[0] == merged_segments[-1][1] and \ + segment[1] - merged_segments[-1][1] <= max_dur: + # The segment starts at the same time the last segment ends, + # and the merged segment is shorter than 'max_dur'. + # Extend the previous segment. + merged_segments[-1][1] = segment[1] + self.stats.num_merges += 1 + else: + merged_segments.append(segment) + + self.segments = merged_segments + self.stats.num_segments_final = len(self.segments) def write(self, key, file_handle): """Write segments to file""" @@ -203,9 +279,9 @@ def write(self, key, file_handle): for segment in self.segments: seg_id = "{key}-{st:07d}-{end:07d}".format( key=key, st=int(segment[0] * 100), end=int(segment[1] * 100)) - print ("{seg_id} {key} {st:.2f} {end:.2f}".format( + print("{seg_id} {key} {st:.2f} {end:.2f}".format( seg_id=seg_id, key=key, st=segment[0], end=segment[1]), - file=file_handle) + file=file_handle) def run(args): @@ -235,9 +311,11 @@ def run(args): segmentation = Segmentation() segmentation.initialize_segments( parts[1:], args.frame_shift) + segmentation.filter_short_segments(args.min_segment_dur) segmentation.pad_speech_segments(args.segment_padding, None if args.utt2dur is None else utt2dur[utt_id]) + segmentation.merge_consecutive_segments(args.merge_consecutive_max_dur) segmentation.write(utt_id, out_segments_fh) global_stats.add(segmentation.stats) logger.info(global_stats) diff --git a/egs/wsj/s5/steps/segmentation/post_process_sad_to_segments.sh b/egs/wsj/s5/steps/segmentation/post_process_sad_to_segments.sh index ca9cea2518b..b168c307b57 100755 --- a/egs/wsj/s5/steps/segmentation/post_process_sad_to_segments.sh +++ b/egs/wsj/s5/steps/segmentation/post_process_sad_to_segments.sh @@ -18,6 +18,8 @@ nj=18 # The values below are in seconds frame_shift=0.01 segment_padding=0.2 +min_segment_dur=0 +merge_consecutive_max_dur=0 . utils/parse_options.sh @@ -53,6 +55,7 @@ if [ $stage -le 0 ]; then copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \ steps/segmentation/internal/sad_to_segments.py \ --frame-shift=$frame_shift --segment-padding=$segment_padding \ + --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \ --utt2dur=$data_dir/utt2dur - $dir/segments.JOB fi