-
Notifications
You must be signed in to change notification settings - Fork 0
/
kindlehighlight.py
53 lines (37 loc) · 1.43 KB
/
kindlehighlight.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
import re
from bs4 import BeautifulSoup
import os.path
import argparse
def main(filepath=None):
    """Export the highlights found in a saved HTML page to a CSV file.

    The HTML is searched for ``<span>`` elements whose ``id`` is
    ``highlight``, ``annotationNoteHeader`` or ``note``.  Their texts become
    the ``Highlight``, ``Page`` and ``Note`` columns of a CSV file that is
    written into the directory containing this script and named after the
    text of the first ``<h3>`` element.  ``fin`` is printed when done.

    Args:
        filepath: Path to the HTML source.  It is joined onto the directory
            containing this script, so a relative path is resolved against
            that directory (not the current working directory).  When
            omitted, the module-level ``args.filepath`` set by the
            command-line entry point is used.

    Raises:
        ValueError: If the numbers of highlights, page headers and notes
            differ, because the rows cannot then be paired up.
    """
    if filepath is None:
        # Backward compatible with the script entry point, which stores the
        # parsed command line in the module-level ``args``.
        filepath = args.filepath

    base_dir = os.path.dirname(os.path.abspath(__file__))
    html_path = os.path.join(base_dir, filepath)
    with open(html_path, encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file.read(), features="html.parser")

    # Name the CSV after the first <h3>; fall back to the input file's base
    # name instead of failing with an IndexError when there is no heading.
    heading = soup.find('h3')
    if heading is not None:
        title = heading.get_text()
    else:
        title = os.path.splitext(os.path.basename(filepath))[0]
    # Path separators are dropped as well so that a heading can never move
    # the output file out of base_dir or point at a missing sub-directory.
    csv_name = re.sub(r'[!@#$:/\\]', '', title) + '.csv'

    highlights = [
        span.get_text() for span in soup.find_all('span', {'id': 'highlight'})
    ]
    notes = [
        span.get_text() for span in soup.find_all('span', {'id': 'note'})
    ]

    # First number in each header, with optional thousands separators.  The
    # trailing ``\d+`` keeps numbers that are not comma-grouped intact
    # (e.g. "1234" is no longer truncated to "123").
    number_pattern = re.compile(r'(?:\d{1,3},)*\d+')
    pages = []
    for span in soup.find_all('span', {'id': 'annotationNoteHeader'}):
        match = number_pattern.search(span.get_text())
        # A header without any digits yields an empty cell, not a crash.
        pages.append(match.group(0).replace(',', '') if match else '')

    # pandas rejects columns of unequal length; fail early with the counts so
    # the mismatch is easy to diagnose.
    if not len(highlights) == len(pages) == len(notes):
        raise ValueError(
            'cannot build table: found {} highlights, {} page headers and '
            '{} notes'.format(len(highlights), len(pages), len(notes))
        )

    df = pd.DataFrame({'Highlight': highlights, 'Page': pages, 'Note': notes})
    csv_path = os.path.join(base_dir, csv_name)
    df.to_csv(csv_path, index=False)
    print('fin')
# Script entry point: take the HTML path from the command line, then convert.
if __name__ == '__main__':
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument(
        'filepath',
        action='store',
        help='path to HTML source code in txt',
    )
    # ``args`` must remain a module-level name: main() reads args.filepath.
    args = cli_parser.parse_args()
    main()