-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_kakasi.py
37 lines (26 loc) · 1.1 KB
/
run_kakasi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import polars as pl
from tqdm import tqdm
import pykakasi
# Seed shared by polars' global RNG and the explicit sample() call in main().
RANDOM_SEED = 42
# Number of rows drawn from the bibliographic dataset for this evaluation run.
N_TEST_SAMPLES = 1000
# Seed polars' global random state up front so any unseeded ops are reproducible.
pl.set_random_seed(RANDOM_SEED)
def main():
    """Sample rows from the NDL bibliographic Parquet file, convert each
    text to hiragana with pykakasi, and write text/reading pairs out as
    Parquet for downstream comparison.
    """
    # Scan lazily, then materialize and draw a fixed-size reproducible sample.
    lazy_frame = pl.scan_parquet("data/ndlbib.parquet")
    sampled = lazy_frame.collect().sample(
        n=N_TEST_SAMPLES, with_replacement=False, seed=RANDOM_SEED
    )

    # Convert every sampled text to its hiragana reading via Kakasi.
    converter = pykakasi.kakasi()
    readings = [
        ''.join(piece['hira'] for piece in converter.convert(entry))
        for entry in tqdm(sampled["text"])
    ]

    # Attach the converted readings, then keep only the comparison columns
    # (source text, reference reading, and Kakasi's output).
    sampled = sampled.with_columns(pl.Series("reading_output", readings))
    sampled = sampled.select(["text", "reading", "reading_output"])

    # Persist the evaluation sample for later analysis.
    sampled.write_parquet("results/ndlbib_kakasi.parquet")
# Script entry point: run the sampling/conversion pipeline only when executed
# directly, not when imported.
if __name__ == "__main__":
    main()