0.12.3

vardecab · Sep 29, 2020 · bdb2de5 · bdb2de5
1 parent 9d605e7
commit bdb2de5
Show file tree

Hide file tree

Showing 3 changed files with 184 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+_old
+output
+TODO
+.DS_Store
diff --git a/README.md b/README.md
@@ -0,0 +1,52 @@
+# kindle-words
+
+> Do something useful with your Kindle notes :) This script extracts individual words from `My Clippings` file hidden on your Kindle e-reader, translates them using Google Translate and exports the pair "original word" → "translation" into a `.txt` file from which you can learn these words or import them into an application such as [Quizlet](https://quizlet.com/).
+
+<!-- ## Screenshots -->
+
+<!-- ## How to use -->
+
+## Roadmap
+
+- Use DeepL rather than Google Translate. 
+- Dictionary definitions.
+- <del>Improve regex formula to better deal with words that have special characters.</del>
+- <del>Extract single words from source file.</del>
+- <del>Output list line by line.</del> 
+- <del>Use API to translate words.</del>
+- <del>Skip the same words on subsequent imports.</del>
+
+## Release History
+
+- 0.12.3: Fixes to `regex` formula so it also takes words with `,` & `—`.
+- 0.12.2: Print which folder was created for exported files.
+- 0.12.1: Renamed variables & export files' names to improve readability. 
+- 0.12: Be able to select source & target languages.
+- 0.11: Added input timeout.
+- 0.10: Take input file directly from Kindle once drive letter is given.
+- 0.9: Export files to specific folders based on today's date & ID. 
+- 0.8: Add script runtime info.
+- 0.7: Fixes to `regex` formula so it also takes words with `.`, `-` & `"`.
+- 0.6: Print translations directly to `kindle-words_export-{DATE}.txt`. 
+- 0.5: No more duplicate words.
+- 0.4: Fixed `'charmap' codec can't encode character (...)` problem that occurred with PL characters.
+- 0.3: Translation with [googletrans](https://pypi.org/project/googletrans) lib.
+- 0.2: Output list line by line + export to a `.txt` file.
+- 0.1: Initial release. Extract single words from source file using `regex`.
+
+## Versioning
+
+Using [SemVer](https://semver.org/).
+
+## License
+
+GNU General Public License v3.0, see [LICENSE.md](https://github.com/vardecab/umbrella/blob/master/LICENSE).
+
+## Acknowledgements
+
+- https://stackabuse.com/text-translation-with-google-translate-api-in-python/
+- https://pypi.org/project/googletrans
+- https://regex101.com/
+- https://stackoverflow.com/questions/56995919/change-python-3-7-default-encoding-from-cp1252-to-cp65001-aka-utf-8
+- https://www.geeksforgeeks.org/print-lists-in-python-4-different-ways/
+- https://stackabuse.com/writing-to-a-file-with-pythons-print-function/
diff --git a/script.py b/script.py
@@ -0,0 +1,128 @@
+# kindle-words
+# 0.12.3
+
+### import libs 
+import re # regex; extract words
+from googletrans import Translator # Google Translate API; https://pypi.org/project/googletrans
+# from progress.bar import Bar # progress bar lib; https://pypi.org/project/progress/
+# from tqdm import tqdm # progress bar lib v2
+import os # create new folders
+
+### run identifier used in the names of the export folder & files
+from datetime import datetime # have current date in exported files' names
+today_date = datetime.now().strftime('%y%m%d-%f') # yymmdd-microseconds; https://www.w3schools.com/python/python_datetime.asp
+
+### script runtime: remember when the script started; the elapsed time is printed at the very end
+import time
+start_time = time.time()
+
+### fix for "'charmap' codec can't encode character (...)" problem; https://stackoverflow.com/questions/56995919/change-python-3-7-default-encoding-from-cp1252-to-cp65001-aka-utf-8
+# detach() hands over each stream's underlying binary buffer, which is then re-wrapped as UTF-8 text, so characters outside the console's default code page (e.g. Polish letters) can be printed
+import sys
+import io
+sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding = 'utf-8')
+sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding = 'utf-8')
+
+### path to file; lookup table: drive letter (upper- or lowercase) → location of "My Clippings.txt" on that drive; https://www.journaldev.com/15642/python-switch-case
+path = {
+ letter : drive + r':\documents\My Clippings.txt'
+ for drive in 'CDEF'
+ for letter in (drive, drive.lower())
+}
+
+### input timeout
+from inputimeout import inputimeout, TimeoutOccurred # input timeout; https://pypi.org/project/inputimeout/
+
+# select source language; an empty answer (just pressing Enter) selects the advertised default, same as a timeout
+try:
+ select_source_language = inputimeout(prompt="Enter the source language (default: en): ", timeout=3).strip() or 'en'
+except TimeoutOccurred:
+ print ("Time ran out, selecting default source language (en)...")
+ select_source_language = 'en'
+
+# select target language; same fallback rules as for the source language
+try:
+ select_target_language = inputimeout(prompt="Enter the target language (default: pl): ", timeout=3).strip() or 'pl'
+except TimeoutOccurred:
+ print ("Time ran out, selecting default target language (pl)...")
+ select_target_language = 'pl'
+
+# select Kindle drive letter; on timeout use drive D, exactly as the message announces
+try:
+ kindle_drive_letter = inputimeout(prompt="Enter the drive letter that is assigned to your Kindle (C/D/E/F): ", timeout=3).strip()
+except TimeoutOccurred:
+ print ("Time ran out, selecting default drive (D)...")
+ kindle_drive_letter = 'D'
+
+# read the clippings file in one place; letters missing from the lookup table (including an empty answer) fall back to drive D
+with io.open(path.get(kindle_drive_letter, r'D:\documents\My Clippings.txt'), "r", encoding="utf-8") as source_file:
+ read_source_file = source_file.read()
+
+### open source file; testing purposes
+# with io.open(r"_old/test.txt", "r", encoding="utf-8") as source_file: 
+# with io.open(r"200729.txt", "r", encoding="utf-8") as source_file: 
+ # read_source_file = source_file.read()
+
+### regex formula # TODO: remove unnecessary formulas once published to repo
+
+# earlier attempts, kept for reference:
+# regex = re.compile(r"^'?\w[\w']*(?:-\w+)*'?$", re.MULTILINE) # 6xx w/ dupes @ 200729.txt
+# regex = re.compile(r"^['\"\“]?(\w+)[\,\"\'\. ]?$", re.MULTILINE) # 886 w/ dupes @ 200729.txt 
+# regex = re.compile(r"^['\"\“]?(\w+)[\,\"\'\. ]?$|^'?\w[\w']*(?:-\w+)*'?$|^(?=\S*[’'-])([a-zA-Z’'-]+)|(?=.*\w)^(\w|')+$", re.MULTILINE) # experiment
+# regex_find_single_words = re.compile(r"^[\w'’“\-\.]+$", re.MULTILINE) # experiment
+
+# formula in use: a line qualifies when it consists only of word characters and ' ’ “ - . , —
+regex_find_single_words = re.compile(r"^[\w'’“\-\.\,\—]+$", re.MULTILINE)
+
+### collect every qualifying line of the source file
+single_words_with_special_characters = regex_find_single_words.findall(read_source_file)
+# print ("Single words: ", single_words) # 🐛 debug (with duplicates)
+print ("With dupes: ", len(single_words_with_special_characters)) # 🐛 debug (list size before deduplication)
+
+### drop repeated entries; dict keys are unique and keep first-seen order
+single_words_with_special_characters = [*dict.fromkeys(single_words_with_special_characters)]
+print ("Without dupes: ", len(single_words_with_special_characters)) # 🐛 debug (list size after deduplication)
+
+## export the single words line by line into this run's output folder
+# print (*single_words_with_special_characters, sep = "\n") # 🐛 debug
+output_lines = '\n'.join(map(str, single_words_with_special_characters)) # https://www.geeksforgeeks.org/print-lists-in-python-4-different-ways/
+if not os.path.isdir("output/" + today_date): # check the same relative path that gets created below
+ os.makedirs("output/" + today_date) # makedirs also creates the parent "output" folder when it is missing (e.g. on a first run)
+ print ("Folder created: " + today_date)
+with open(r"output/" + today_date + "/output-imperfect-" + today_date + ".txt", "w", encoding="utf-8") as output: 
+ output.write(output_lines.lower())
+
+### take single words with special characters from the file and remove unnecessary chars (eg ".-)
+with io.open(r"output/" + today_date + "/output-imperfect-" + today_date + ".txt", "r", encoding="utf-8") as source_file: 
+ read_source_file = source_file.read()
+
+# stripping punctuation from the lowercased export can turn distinct entries into the same word (e.g. "Word," and "word."),
+# so deduplicate again here; dict.fromkeys keeps the first-seen order
+single_words = list(dict.fromkeys(re.findall(r"\b\w*[-'’]\w*\b|\w+",read_source_file)))
+output_lines = '\n'.join(map(str, single_words))
+with open(r"output/" + today_date + "/output-perfect-" + today_date + ".txt", "w", encoding="utf-8") as output: 
+ output.write(output_lines.lower())
+
+### translation
+translator = Translator() # separate lowercase name so the imported Translator class is not overwritten by its own instance
+
+# write one "original -> translation" line per word into this run's export file
+with open(r"output/" + today_date + "/kindle-words_export-" + today_date + ".txt", "w", encoding="utf-8") as export_translations: # NOTE: "a" → append, "w" → write
+ translations = translator.translate(single_words, src=select_source_language, dest=select_target_language) # NOTE: / FIXME: black box - the whole list is translated inside this single call, which means a progress bar won't work
+ # for translation in tqdm(translations):
+ for translation in translations:
+  print (translation.origin, ' -> ', translation.text, file=export_translations) # file= writes straight to the export, so the global sys.stdout is left untouched
+
+# summary on the console: elapsed time since start_time and number of translated words
+print("Script runtime: %.2f seconds" % (time.time() - start_time), "with", len(single_words), "translations.")