Commit

1.5
vardecab committed Mar 18, 2021
1 parent 79fe91b commit c5708d8
Showing 3 changed files with 23 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,6 +10,8 @@ _old
 
 # test
 test
+output-*
+data-*
 
 # data
 output/*
4 changes: 3 additions & 1 deletion README.md
@@ -41,6 +41,7 @@ Save.
 pip install inputimeout
 pip install deep_translator
 pip install pync
+pip install langdetect
 ```
 3. Navigate to the folder you cloned/downloaded & run the script:
 ```sh
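A quick way to sanity-check the newly added `langdetect` dependency after installing (a hypothetical snippet, not part of the repo; note that langdetect's detector is non-deterministic across runs unless `DetectorFactory.seed` is pinned):

```python
# Hypothetical post-install check for the new langdetect dependency.
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0  # langdetect is otherwise non-deterministic between runs
print(detect("This is an English sentence."))  # 'en'
```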
@@ -66,12 +67,13 @@ python script.py
 
 ## Release History
 
+- 1.2: Added language detection to skip translation of words already in the desired language.
 - 1.1.4: Added notifications for macOS & Windows.
 - 1.1.3: Added support for macOS.
 - 1.1.2: Added `try/except` to fix a `FileNotFoundError` error.
 - 1.1.1: Fixed `io.open` bug; added some `try/except` to catch more errors; re-enabled `timeout_time`; added `last_word` export so it's easy to see which words are new and which are old. Published in [Releases](https://github.com/vardecab/kindle-words/releases).
 - 1.1: Quite a big rewrite: it now works properly with the `My Clippings.txt` file from Kindle - all bugs are fixed. The initial run takes ~10 minutes to complete (depending on the size of your file), but afterwards it's usually < 1 minute because data from the previous run is stored locally for comparison - only new words are translated, to save time and improve speed.
-- 1.0.0: Using new API - [deep-translator](https://github.com/nidhaloff/deep-translator).
+- 1.0.0: Using new backend - [deep-translator](https://github.com/nidhaloff/deep-translator).
 - 0.12.5: Bug in the API discovered.
 - 0.12.4: Cleaned up the code for better readability.
 - 0.12.3: Fixes to `regex` formula so it also takes words with `,` & ``.
25 changes: 18 additions & 7 deletions script.py
@@ -76,7 +76,7 @@
 
 elif platform == "darwin": # macOS
     try:
-        kindle_name = inputimeout(prompt="Enter your Kindle's name: ", timeout=timeout_time).lower()
+        kindle_name = inputimeout(prompt="Enter your Kindle's name (default: Kindle): ", timeout=timeout_time).lower()
         with io.open(f'/Volumes/{kindle_name}/documents/My Clippings.txt', "r", encoding="utf-8") as source_file:
             read_source_file = source_file.readlines() # read the file to [list]
     except TimeoutOccurred:
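The reworded prompt advertises a default Kindle name; the `except` branch that applies it sits outside this hunk, so the fallback below is an assumption. A minimal sketch of the timeout-with-default pattern using `inputimeout`:

```python
# Sketch only: the "kindle" fallback value is assumed, not shown in this diff.
from inputimeout import inputimeout, TimeoutOccurred

timeout_time = 10  # seconds; stand-in for the script's timeout_time
try:
    kindle_name = inputimeout(prompt="Enter your Kindle's name (default: Kindle): ",
                              timeout=timeout_time).lower()
except TimeoutOccurred:
    kindle_name = "kindle"  # assumed default applied after the timeout
print(f"/Volumes/{kindle_name}/documents/My Clippings.txt")
```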
@@ -114,6 +114,7 @@
 read_source_file = [word.replace(',','') for word in read_source_file] # remove character
 read_source_file = [word.replace('.','') for word in read_source_file] # remove character
 read_source_file = [word.replace(';','') for word in read_source_file] # remove character
+read_source_file = [word.replace(':','') for word in read_source_file] # remove character
 read_source_file = [word.replace('“','') for word in read_source_file] # remove character
 read_source_file = [word.replace('”','') for word in read_source_file] # remove character
 read_source_file = [word.replace('’','') for word in read_source_file] # remove character
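This hunk adds a seventh single-character `replace()` pass; an equivalent one-pass idiom (a sketch, not the repo's code) strips all of these characters at once with `str.translate`:

```python
# One-pass alternative to the chained replace() comprehensions above.
table = str.maketrans('', '', ',.;:“”’')  # characters to delete
read_source_file = ['“dom,”', 'house;', 'kot:']  # dummy data
read_source_file = [word.translate(table) for word in read_source_file]
print(read_source_file)  # ['dom', 'house', 'kot']
```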
@@ -136,6 +137,16 @@
 single_words = list(dict.fromkeys(single_words)) # remove duplicates; https://www.w3schools.com/python/python_howto_remove_duplicates.asp
 print("There are", len(single_words), 'unique words in My Clippings file.')
 
+### skip words already in target_language
+from langdetect import detect # language detection
+
+words = []
+print(f'Removing words already in {select_target_language.upper()}...')
+for word in range(len(single_words)):
+    if detect(single_words[word]) != select_target_language: # if a word is already in the target_language then skip it
+        words.append(single_words[word])
+print(f'Without words already in {select_target_language.upper()}, there are {len(words)} words.')
+
 ### open saved list
 import pickle
 try:
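One caveat with the new filtering block: `langdetect.detect()` raises `LangDetectException` on strings it cannot classify (empty strings, bare numbers), and detection on isolated single words is statistically shaky. A guarded sketch of the same idea, with `target_language` standing in for the script's `select_target_language`:

```python
# Sketch of the language filter with an exception guard; not the repo's code.
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def keep_foreign_words(single_words, target_language='en'):
    """Drop words that already appear to be in target_language."""
    kept = []
    for word in single_words:
        try:
            if detect(word) != target_language:
                kept.append(word)
        except LangDetectException:  # e.g. "123" or "" has no detectable features
            kept.append(word)  # keep it and let the translator decide
    return kept

print(keep_foreign_words(['szkoła', 'house', 'kot'], 'en'))
```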
@@ -147,25 +158,25 @@
 
 ### comparison
 try:
-    difference = set(single_words) - set(saved_list) # what's new in single_words[]
+    difference = set(words) - set(saved_list) # what's new in words[]
     if len(saved_list) == 0:
-        difference = set(single_words)
+        difference = set(words)
 except:
-    difference = set(single_words)
+    difference = set(words)
 to_translate = list(difference) # convert set to list
 print("There are", len(to_translate), "new words to translate.")
 
 if len(to_translate) > 0:
     output_lines = '\n'.join(map(str, to_translate))
-    with open(r"output/output-original_words.txt", "a", encoding="utf-8") as output:
+    with open(r"output/output-original_words.txt", "w", encoding="utf-8") as output:
         output.write(output_lines.lower())
 
     ### translation
     # split list into smaller lists to get around the 5000-character limit of the deep-translator package
     chunks = [to_translate[x:x+250] for x in range(0, len(to_translate), 250)] # split into sublists of 250 words each
     print('List of words was split into:', len(chunks), 'chunk(s) for translation.') # debug; how many sublists are in this master list
 
-    from deep_translator import GoogleTranslator, batch_detection
+    from deep_translator import GoogleTranslator
     print('Translating...')
 
     ### export a pair: original → translated
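The comparison logic above now diffs `words` against the pickled `saved_list`, and the chunking line keeps each request within the translator's size limit. Both ideas are small enough to demo in isolation (dummy data, not the repo's):

```python
# Set difference finds only the words not seen on a previous run...
saved_list = ['dom', 'kot']                # pickled from the previous run
words = ['dom', 'kot', 'szkoła', 'okno']   # current run
to_translate = list(set(words) - set(saved_list))
print(len(to_translate), 'new words')      # 2 new words

# ...and chunking keeps each translation request under deep-translator's
# ~5000-character Google limit (250 words per request is the script's heuristic).
chunks = [to_translate[x:x + 250] for x in range(0, len(to_translate), 250)]
print(len(chunks), 'chunk(s)')             # 1 chunk(s)
```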
@@ -188,7 +199,7 @@
 
 ### export list for future comparison
 with open('data/saved_location', 'wb') as file_export:
-    pickle.dump(single_words, file_export)
+    pickle.dump(words, file_export)
 
 ### runtime
 end_time = time.time() # run time end
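Finally, the exported pickle now stores the filtered `words` list rather than `single_words`, so the next run's comparison works against the post-langdetect set. The round trip, sketched with example paths:

```python
# Persistence round trip the comparison step depends on (example paths).
import os
import pickle

os.makedirs('data', exist_ok=True)
words = ['szkoła', 'okno']

with open('data/saved_location', 'wb') as file_export:
    pickle.dump(words, file_export)        # end of this run

with open('data/saved_location', 'rb') as file_import:
    saved_list = pickle.load(file_import)  # start of the next run
print(saved_list == words)                 # True
```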
