forked from mrspiggot/prompts_and_completions
-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcriber.py
68 lines (57 loc) · 2.47 KB
/
transcriber.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import re
import pandas as pd
import json
import whisper
import youtube_dl
class LucidateTextSplitter:
def __init__(self, text, n):
self.text = text
self.n = n
def split_into_sentences_with_prompts(self):
print(self.text)
print(type(self.text))
if self.text == "":
raise ValueError("Input text cannot be empty.")
if self.n <= 0:
raise ValueError("n must be a positive integer.")
sentences = re.split("(?<=[.!?]) +", self.text['text'])
if len(sentences) < self.n:
raise ValueError("Input text must have at least n sentences.")
prompts = sentences[::self.n]
completions = []
for i in range(len(prompts) - 1):
completion = " ".join(sentences[self.n * i + 1:self.n * (i + 1)])
completions.append(completion)
completions.append(" ".join(sentences[self.n * (len(prompts) - 1) + 1:]))
data = {'prompt': prompts, 'completion': completions}
df = pd.DataFrame(data)
return df
def save_as_excel(self, filename):
df = self.split_into_sentences_with_prompts()
df.to_excel(filename, index=False)
def save_as_csv(self, filename):
df = self.split_into_sentences_with_prompts()
df.to_csv(filename, index=False)
def save_as_json(self, filename):
df = self.split_into_sentences_with_prompts()
data = []
for i in range(len(df)):
row = {'prompt': df.iloc[i]['prompt'], 'completion': df.iloc[i]['completion']}
data.append(row)
with open(filename, 'w') as f:
json.dump(data, f)
class LucidateTranscriber:
def __init__(self, model_path):
self.model = whisper.load_model(model_path)
def save_to_mp3(self, url):
options = {'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3', 'preferredquality': '192'}]}
with youtube_dl.YoutubeDL(options) as downloader:
downloader.download([url])
return downloader.prepare_filename(downloader.extract_info(url, download=False)).replace(".m4a", ".mp3")
def transcribe_youtube_video(self, url, fp16=False, n=5, op_name='transcribe'):
filename = self.save_to_mp3(url)
text = self.model.transcribe(filename, fp16=fp16)
splitter = LucidateTextSplitter(text, n)
splitter.save_as_excel(f'{op_name}.xlsx')
return splitter.split_into_sentences_with_prompts()