
Commit

feature/use-spacy-in-transcription (#59)
* Use spaCy for sentence segmentation in GoogleCloudSRModel

* Remove print statement

* Remove commented out code

* Remove unnecessary deps and fix typo

* Add comma back
isaacna committed Jun 2, 2021
1 parent 9d0332c commit 5f09efc
Showing 3 changed files with 68 additions and 49 deletions.
107 changes: 61 additions & 46 deletions cdp_backend/sr_models/google_cloud_sr_model.py
@@ -8,6 +8,7 @@
 from typing import Any, List, Optional, Union
 
 from google.cloud import speech_v1p1beta1 as speech
+from spacy.lang.en import English
 
 from ..pipeline.transcript_model import Sentence, Transcript, Word
 from .sr_model import SRModel
@@ -107,72 +108,86 @@ def transcribe(
 
         # Create timestamped sentences
         timestamped_sentences: List[Sentence] = []
-        current_sentence = None
-        sentence_index = 0
-        word_index = 0
+        transcript_sentence_index = 0
+
+        # Create sentence boundary pipeline
+        nlp = English()
+        nlp.add_pipe("sentencizer")
+
         for result in response.results:
             # Some portions of audio may not have text
             if len(result.alternatives) > 0:
                 # Check length of transcript result
                 word_list = result.alternatives[0].words
                 if len(word_list) > 0:
-                    for word in word_list:
-                        # create Word
+                    # Split transcript into sentences
+                    doc = nlp(result.alternatives[0].transcript)
+
+                    # Convert generator to list
+                    sentences = [str(sent) for sent in doc.sents]
+
+                    # Index holder for word results of response
+                    w_marker = 0
+
+                    for s_ind in range(0, len(sentences)):
+                        # Sentence text
+                        s_text = sentences[s_ind]
+
+                        num_words = len(s_text.split())
+
+                        # Initialize sentence model
+                        timestamped_sentence = Sentence(
+                            index=transcript_sentence_index,
+                            confidence=result.alternatives[0].confidence,
+                            # Start and end time are placeholder values
+                            start_time=0.0,
+                            end_time=0.0,
+                            words=[],
+                            text=s_text,
+                        )
+
+                        for w_ind in range(w_marker, w_marker + num_words):
+                            # Extract word from response
+                            word = result.alternatives[0].words[w_ind]
+
                             start_time = (
                                 word.start_time.seconds + word.start_time.nanos * 1e-9
                             )
                             end_time = word.end_time.seconds + word.end_time.nanos * 1e-9
 
+                            # Add start_time to Sentence if first word
+                            if w_ind - w_marker == 0:
+                                timestamped_sentence.start_time = start_time
+
+                            # Add end_time to Sentence if last word
+                            if (w_ind - w_marker) == (num_words - 1):
+                                timestamped_sentence.end_time = end_time
+
                             # Clean everything but non-delimiting characters make lowercase
                             regex = re.compile(r"[^a-zA-Z0-9'\-]")
                             cleaned_word = regex.sub("", word.word).lower()
 
                             # Create Word model
                             timestamped_word = Word(
-                                index=word_index,
+                                index=w_ind - w_marker,
                                 start_time=start_time,
                                 end_time=end_time,
                                 text=cleaned_word,
                                 # TODO: Add annotations
                                 annotations=None,
                             )
 
-                        if current_sentence is None:
-                            current_sentence = Sentence(
-                                index=sentence_index,
-                                confidence=result.alternatives[0].confidence,
-                                start_time=word.start_time,
-                                end_time=word.end_time,
-                                # TODO: Add speaker and annotations?
-                                words=[timestamped_word],
-                                text=word.word,
-                            )
-                            word_index += 1
-
-                        # End current sentence and reset
-                        # TODO: Account for non-sentence ending periods, such as
-                        # prefixes like "Mr." or "Dr."
-                        elif bool(re.match(r"\.|\?", word.word[-1])):
-                            # Finish sentence and append
-                            current_sentence.end_time = word.end_time
-                            current_sentence.text += " {}".format(word.word)
-                            current_sentence.words.append(timestamped_word)
-
-                            timestamped_sentences.append(current_sentence)
-
-                            # Adjust indices
-                            current_sentence = None
-                            sentence_index += 1
-                            word_index = 0
-
-                        # Update current sentence
-                        else:
-                            current_sentence.text += " {}".format(word.word)
-                            current_sentence.words.append(timestamped_word)
-                            word_index += 1
-
-                # Update confidence stats
-                confidence_sum += result.alternatives[0].confidence
-                segments += 1
+                            timestamped_sentence.words.append(timestamped_word)
+
+                        # Increment word marker
+                        w_marker += num_words
+
+                        # Add Sentence to sentence list
+                        timestamped_sentences.append(timestamped_sentence)
+
+                        # Increment transcript sentence index
+                        transcript_sentence_index += 1
+
+                # Update confidence stats
+                confidence_sum += result.alternatives[0].confidence
+                segments += 1
 
         # Compute mean confidence
         if segments > 0:
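The core assumption in the new loop is worth spelling out: spaCy's sentencizer places sentence boundaries at whitespace-delimited token positions, so counting len(s_text.split()) per sentence walks the recognizer's word list exactly once. A minimal sketch of that invariant, not part of the commit (transcript and words below are hypothetical stand-ins for result.alternatives[0]):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe("sentencizer")

    # Hypothetical stand-ins for result.alternatives[0].transcript / .words
    transcript = "Hello everyone, and thank you for coming. Will the clerk begin by taking roll."
    words = transcript.split()

    doc = nlp(transcript)
    w_marker = 0
    for sent in doc.sents:
        num_words = len(str(sent).split())
        # This sentence's words occupy words[w_marker : w_marker + num_words]
        print(repr(str(sent)), "->", words[w_marker : w_marker + num_words])
        w_marker += num_words

    # Holds as long as sentence boundaries land on whitespace, which is the case
    # for space-separated STT transcripts; a mid-token split would break alignment.
    assert w_marker == len(words)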
9 changes: 6 additions & 3 deletions cdp_backend/tests/sr_models/test_google_cloud_sr_model.py
@@ -40,8 +40,9 @@ def __init__(self, word: str, start_time: float, end_time: float):
 
 
 class FakeRecognizeAlternative:
-    def __init__(self, words: List[FakeRecognizeWord]):
+    def __init__(self, transcript: str, words: List[FakeRecognizeWord]):
         self.words = words
+        self.transcript = transcript
         self.confidence = random.random()
 
 
@@ -55,6 +56,7 @@ class FakeRecognizeResults:
     FakeRecognizeResult(
         [
             FakeRecognizeAlternative(
+                "Hello everyone, and thank you for coming.",
                 [
                     FakeRecognizeWord("Hello", 0.0, 0.6),
                     FakeRecognizeWord("everyone,", 0.7, 1.1),
@@ -63,13 +65,14 @@ class FakeRecognizeResults:
                     FakeRecognizeWord("you", 1.8, 1.9),
                     FakeRecognizeWord("for", 2.0, 2.1),
                     FakeRecognizeWord("coming.", 2.2, 2.4),
-                ]
+                ],
             )
         ]
     ),
     FakeRecognizeResult(
         [
             FakeRecognizeAlternative(
+                "Will the clerk begin by taking roll.",
                 [
                     FakeRecognizeWord("Will", 3.0, 3.1),
                     FakeRecognizeWord("the", 3.2, 3.3),
@@ -78,7 +81,7 @@ class FakeRecognizeResults:
                     FakeRecognizeWord("by", 3.8, 3.9),
                     FakeRecognizeWord("taking", 4.0, 4.1),
                     FakeRecognizeWord("roll.", 4.2, 4.3),
-                ]
+                ],
             )
         ]
     ),
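A note on why the fakes changed shape: the model now reads both alternatives[0].transcript (fed to spaCy) and alternatives[0].words (for timestamps), so FakeRecognizeAlternative must carry both fields, kept in sync word-for-word. An illustrative check reusing the classes defined in this test file (assumed usage, not part of the commit):

    alt = FakeRecognizeAlternative(
        "Will the clerk begin by taking roll.",
        [
            FakeRecognizeWord("Will", 3.0, 3.1),
            FakeRecognizeWord("the", 3.2, 3.3),
        ],
    )
    # The nth whitespace token of the transcript should match alt.words[n].word;
    # otherwise the w_marker bookkeeping in transcribe() would drift.
    assert alt.transcript.split()[1] == alt.words[1].word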
1 change: 1 addition & 0 deletions setup.py
@@ -58,6 +58,7 @@
     "pulumi~=3.0.0",
     "pulumi-google-native~=0.1.0",
     "pulumi-gcp~=5.0.0",
+    "spacy~=3.0.6",
    "truecase>=0.0.9",
     "webvtt-py>=0.4.5",
 ]
