# @Time : 2023/6/6
# @Author : Minglun Han
# @File : gen_feats_from_service.py
# Bonjour! Wish you good luck, everyday! :)
import os
import sys
import json
import torch
import base64
import requests
import argparse
import urllib
import numpy as np
import torchaudio
import torchaudio.transforms as T


def get_ASRFeature(waveform):
    # Query the remote feature-extraction service: the raw waveform is sent as a
    # base64-encoded float32 buffer, and the service returns the ASR/CIF features
    # as a base64-encoded float32 buffer with hidden size 512.
    print("waveform_size: ", waveform.size())
    waveform_bytes = waveform.numpy().tobytes()
    audio_base64 = base64.b64encode(waveform_bytes).decode('utf-8')
    url = 'https://172.18.30.121:8888/genfeats'
    asr_feature = requests.post(url, json={'waveform': audio_base64}).json()
    asr_feature = base64.b64decode(asr_feature)
    asr_feature = np.frombuffer(asr_feature, dtype=np.float32)
    asr_feature = torch.tensor(asr_feature, dtype=torch.float32).view(-1, 512)
    asr_feature = asr_feature.unsqueeze(0)  # add batch dim: 1 x T x 512
    print(asr_feature.size())
    return asr_feature
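
# ---------------------------------------------------------------------------
# For reference only: a minimal sketch (an assumption, not part of this repo)
# of what the /genfeats service above is expected to do -- decode the base64
# float32 waveform, run the acoustic/CIF encoder, and send the features back
# as a base64-encoded float32 buffer with hidden size 512. Kept as
# commented-out Flask code so this client script stays runnable; the real
# server, its model object, and its framework are unknown.
#
#   from flask import Flask, request, jsonify
#
#   app = Flask(__name__)
#
#   @app.route("/genfeats", methods=["POST"])
#   def genfeats():
#       wav_bytes = base64.b64decode(request.json["waveform"])
#       waveform = torch.from_numpy(
#           np.frombuffer(wav_bytes, dtype=np.float32).copy()
#       ).unsqueeze(0)
#       feats = model.extract_features(waveform)  # (T, 512); `model` is hypothetical
#       feats_b64 = base64.b64encode(
#           feats.numpy().astype(np.float32).tobytes()
#       ).decode("utf-8")
#       return jsonify(feats_b64)
# ---------------------------------------------------------------------------
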
wav_dir = "/raid/cfl/data/all_audio_cif_feats/js_paired_data/wav/js_real_test_data"
text_path = "/raid/cfl/data/all_audio_cif_feats/js_paired_data/wav/js_real_test_data/text"
output_dir = "/raid/cfl/data/all_audio_cif_feats/js_paired_data/"
output_feats_dir = "/raid/cfl/data/all_audio_cif_feats/js_paired_data/dumped_cif_feats/"
output_json_fn = "js_real_test_data.json"
# load text (each line: "<utt_id>\t<transcript>"; any extension is stripped
# from the utterance id)
id2text = dict()
with open(text_path, "r") as f_text:
    for line in f_text:
        utt_id, text = line.strip().split("\t")
        utt_id = utt_id.split(".")[0]
        id2text[utt_id] = text.strip()
n = 0
json_save_list = []
fn_list = os.listdir(wav_dir)
for fn in fn_list:
    if ".wav" not in fn:
        continue
    utt_id = fn.strip().split(".")[0]
    wav_path = os.path.join(wav_dir, fn)

    # load audio and resample to 16 kHz if needed
    waveform, sr = torchaudio.load(wav_path)
    if sr != 16000:
        resampler = T.Resample(sr, 16000, dtype=waveform.dtype)
        waveform = resampler(waveform)

    # query the feature service and drop the batch dim
    cif_feats = get_ASRFeature(waveform=waveform)[0]  # T x C

    # strip punctuation from the reference text
    text = id2text[utt_id]
    text = text.replace(".", "")
    text = text.replace("。", "")
    text = text.replace("?", "")
    text = text.replace(",", "")
    text = text.replace(",", "")
    text = text.replace(":", "")

    # save feats
    utt_id = "js_" + utt_id
    cur_save_dir = os.path.join(output_feats_dir, utt_id + ".npy")
    length = cif_feats.size(0)
    np.save(cur_save_dir, cif_feats[:length])

    json_save_list.append(
        {
            "utterance_id": utt_id,
            "speech_feats_path": cur_save_dir,
            "length": length,
            "wav_length": waveform.size()[-1],
            "target_text": text,
        }
    )

    n += 1
    # if n == 10:
    #     break
    if n % 1000 == 0:
        print("have processed %d samples" % n)

json_str = json.dumps(
    json_save_list, indent=4, ensure_ascii=False
)
output_fn = os.path.join(output_dir, output_json_fn)
with open(output_fn, "w") as f_out:
    f_out.write(json_str)
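
# ---------------------------------------------------------------------------
# Illustrative only (not part of the original script): how the dumped manifest
# and feature files could be read back later, e.g. inside a dataset loader.
# Variable names here are hypothetical.
#
#   with open(os.path.join(output_dir, output_json_fn), "r") as f_in:
#       manifest = json.load(f_in)
#   for entry in manifest:
#       feats = np.load(entry["speech_feats_path"])  # shape (length, 512)
#       target_text = entry["target_text"]
# ---------------------------------------------------------------------------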