
Commit

feature/use-spacy-in-transcription (#59)
* Use spaCy for sentence segmentation in GoogleCloudSRModel

* Remove print statement

* Remove commented out code

* Remove unnecessary deps and fix typo

* Add comma back
isaacna committed Jun 2, 2021
1 parent 9d0332c commit 5f09efc
Showing 3 changed files with 68 additions and 49 deletions.
107 changes: 61 additions & 46 deletions cdp_backend/sr_models/google_cloud_sr_model.py
@@ -8,6 +8,7 @@
 from typing import Any, List, Optional, Union
 
 from google.cloud import speech_v1p1beta1 as speech
+from spacy.lang.en import English
 
 from ..pipeline.transcript_model import Sentence, Transcript, Word
 from .sr_model import SRModel
@@ -107,72 +108,86 @@ def transcribe(
 
         # Create timestamped sentences
         timestamped_sentences: List[Sentence] = []
-        current_sentence = None
-        sentence_index = 0
-        word_index = 0
+        transcript_sentence_index = 0
+
+        # Create sentence boundary pipeline
+        nlp = English()
+        nlp.add_pipe("sentencizer")
+
         for result in response.results:
             # Some portions of audio may not have text
             if len(result.alternatives) > 0:
                 # Check length of transcript result
                 word_list = result.alternatives[0].words
                 if len(word_list) > 0:
-                    for word in word_list:
-                        # create Word
+                    # Split transcript into sentences
+                    doc = nlp(result.alternatives[0].transcript)
+
+                    # Convert generator to list
+                    sentences = [str(sent) for sent in doc.sents]
+
+                    # Index holder for word results of response
+                    w_marker = 0
+
+                    for s_ind in range(0, len(sentences)):
+                        # Sentence text
+                        s_text = sentences[s_ind]
+
+                        num_words = len(s_text.split())
+
+                        # Initialize sentence model
+                        timestamped_sentence = Sentence(
+                            index=transcript_sentence_index,
+                            confidence=result.alternatives[0].confidence,
+                            # Start and end time are placeholder values
+                            start_time=0.0,
+                            end_time=0.0,
+                            words=[],
+                            text=s_text,
+                        )
+
+                        for w_ind in range(w_marker, w_marker + num_words):
+                            # Extract word from response
+                            word = result.alternatives[0].words[w_ind]
+
                             start_time = (
                                 word.start_time.seconds + word.start_time.nanos * 1e-9
                             )
                             end_time = word.end_time.seconds + word.end_time.nanos * 1e-9
 
+                            # Add start_time to Sentence if first word
+                            if w_ind - w_marker == 0:
+                                timestamped_sentence.start_time = start_time
+
+                            # Add end_time to Sentence if last word
+                            if (w_ind - w_marker) == (num_words - 1):
+                                timestamped_sentence.end_time = end_time
+
                             # Clean everything but non-delimiting characters make lowercase
                             regex = re.compile(r"[^a-zA-Z0-9'\-]")
                             cleaned_word = regex.sub("", word.word).lower()
 
                             # Create Word model
                             timestamped_word = Word(
-                                index=word_index,
+                                index=w_ind - w_marker,
                                 start_time=start_time,
                                 end_time=end_time,
                                 text=cleaned_word,
                                 # TODO: Add annotations
                                 annotations=None,
                             )
 
-                        if current_sentence is None:
-                            current_sentence = Sentence(
-                                index=sentence_index,
-                                confidence=result.alternatives[0].confidence,
-                                start_time=word.start_time,
-                                end_time=word.end_time,
-                                # TODO: Add speaker and annotations?
-                                words=[timestamped_word],
-                                text=word.word,
-                            )
-                            word_index += 1
-
-                        # End current sentence and reset
-                        # TODO: Account for non-sentence ending periods, such as
-                        # prefixes like "Mr." or "Dr."
-                        elif bool(re.match(r"\.|\?", word.word[-1])):
-                            # Finish sentence and append
-                            current_sentence.end_time = word.end_time
-                            current_sentence.text += " {}".format(word.word)
-                            current_sentence.words.append(timestamped_word)
-
-                            timestamped_sentences.append(current_sentence)
-
-                            # Adjust indices
-                            current_sentence = None
-                            sentence_index += 1
-                            word_index = 0
-
-                        # Update current sentence
-                        else:
-                            current_sentence.text += " {}".format(word.word)
-                            current_sentence.words.append(timestamped_word)
-                            word_index += 1
-
-                # Update confidence stats
-                confidence_sum += result.alternatives[0].confidence
-                segments += 1
+                            timestamped_sentence.words.append(timestamped_word)
+
+                        # Increment word marker
+                        w_marker += num_words
+
+                        # Add Sentence to sentence list
+                        timestamped_sentences.append(timestamped_sentence)
+
+                        # Increment transcript sentence index
+                        transcript_sentence_index += 1
+
+                # Update confidence stats
+                confidence_sum += result.alternatives[0].confidence
+                segments += 1
 
         # Compute mean confidence
         if segments > 0:
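The core assumption in the new loop is worth spelling out: spaCy's sentencizer places sentence boundaries at whitespace-delimited token positions, so counting len(s_text.split()) per sentence walks the recognizer's word list exactly once. A minimal sketch of that invariant, not part of the commit (transcript and words below are hypothetical stand-ins for result.alternatives[0]):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe("sentencizer")

    # Hypothetical stand-ins for result.alternatives[0].transcript / .words
    transcript = "Hello everyone, and thank you for coming. Will the clerk begin by taking roll."
    words = transcript.split()

    doc = nlp(transcript)
    w_marker = 0
    for sent in doc.sents:
        num_words = len(str(sent).split())
        # This sentence's words occupy words[w_marker : w_marker + num_words]
        print(repr(str(sent)), "->", words[w_marker : w_marker + num_words])
        w_marker += num_words

    # Holds as long as sentence boundaries land on whitespace, which is the case
    # for space-separated STT transcripts; a mid-token split would break alignment.
    assert w_marker == len(words)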
9 changes: 6 additions & 3 deletions cdp_backend/tests/sr_models/test_google_cloud_sr_model.py
@@ -40,8 +40,9 @@ def __init__(self, word: str, start_time: float, end_time: float):
 
 
 class FakeRecognizeAlternative:
-    def __init__(self, words: List[FakeRecognizeWord]):
+    def __init__(self, transcript: str, words: List[FakeRecognizeWord]):
         self.words = words
+        self.transcript = transcript
         self.confidence = random.random()
 
 
@@ -55,6 +56,7 @@ class FakeRecognizeResults:
     FakeRecognizeResult(
         [
             FakeRecognizeAlternative(
+                "Hello everyone, and thank you for coming.",
                 [
                     FakeRecognizeWord("Hello", 0.0, 0.6),
                     FakeRecognizeWord("everyone,", 0.7, 1.1),
@@ -63,13 +65,14 @@ class FakeRecognizeResults:
                     FakeRecognizeWord("you", 1.8, 1.9),
                     FakeRecognizeWord("for", 2.0, 2.1),
                     FakeRecognizeWord("coming.", 2.2, 2.4),
-                ]
+                ],
             )
         ]
     ),
     FakeRecognizeResult(
         [
             FakeRecognizeAlternative(
+                "Will the clerk begin by taking roll.",
                 [
                     FakeRecognizeWord("Will", 3.0, 3.1),
                     FakeRecognizeWord("the", 3.2, 3.3),
@@ -78,7 +81,7 @@ class FakeRecognizeResults:
                     FakeRecognizeWord("by", 3.8, 3.9),
                     FakeRecognizeWord("taking", 4.0, 4.1),
                     FakeRecognizeWord("roll.", 4.2, 4.3),
-                ]
+                ],
             )
         ]
     ),
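A note on why the fakes changed shape: the model now reads both alternatives[0].transcript (fed to spaCy) and alternatives[0].words (for timestamps), so FakeRecognizeAlternative must carry both fields, kept in sync word-for-word. An illustrative check reusing the classes defined in this test file (assumed usage, not part of the commit):

    alt = FakeRecognizeAlternative(
        "Will the clerk begin by taking roll.",
        [
            FakeRecognizeWord("Will", 3.0, 3.1),
            FakeRecognizeWord("the", 3.2, 3.3),
        ],
    )
    # The nth whitespace token of the transcript should match alt.words[n].word;
    # otherwise the w_marker bookkeeping in transcribe() would drift.
    assert alt.transcript.split()[1] == alt.words[1].word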
1 change: 1 addition & 0 deletions setup.py
@@ -58,6 +58,7 @@
     "pulumi~=3.0.0",
     "pulumi-google-native~=0.1.0",
     "pulumi-gcp~=5.0.0",
+    "spacy~=3.0.6",
    "truecase>=0.0.9",
     "webvtt-py>=0.4.5",
 ]
