Print '?' in place of any character that can't be encoded using the system default encoding

openai · jongwook · Jan 18, 2023 · Jan 18, 2023 · Jan 18, 2023 · f46a1943ea52b4cde3f00140bc6c86cd60b1bddc
commit f46a1943ea52b4cde3f00140bc6c86cd60b1bddc
diff --git a/whisper/transcribe.py b/whisper/transcribe.py
@@ -1,5 +1,6 @@
 import argparse
 import os
+import sys
 import warnings
 from typing import List, Optional, Tuple, Union, TYPE_CHECKING
 
@@ -165,7 +166,10 @@ def add_segment(
  }
  )
  if verbose:
- print(f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}")
+ line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}\n"
+ # compared to just `print(line)`, this replaces any character not representable using
+ # the system default encoding with a '?', avoiding UnicodeEncodeError.
+ sys.stdout.buffer.write(line.encode(sys.getdefaultencoding(), errors="replace"))
 
  # show the progress bar when verbose is False (otherwise the transcribed text will be printed)
  num_frames = mel.shape[-1]