Commit

1.5
vardecab committed Mar 18, 2021
1 parent 79fe91b commit c5708d8
Showing 3 changed files with 23 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -10,6 +10,8 @@ _old
 
 # test
 test
+output-*
+data-*
 
 # data
 output/*
4 changes: 3 additions & 1 deletion README.md
@@ -41,6 +41,7 @@ Save.
 pip install inputimeout
 pip install deep_translator
 pip install pync
+pip install langdetect
 ```
 3. Navigate to the folder you cloned/downloaded & run the script:
 ```sh
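A quick way to sanity-check the newly added `langdetect` dependency after installing (a hypothetical snippet, not part of the repo; note that langdetect's detector is non-deterministic across runs unless `DetectorFactory.seed` is pinned):

```python
# Hypothetical post-install check for the new langdetect dependency.
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0  # langdetect is otherwise non-deterministic between runs
print(detect("This is an English sentence."))  # 'en'
```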
@@ -66,12 +67,13 @@ python script.py
 
 ## Release History
 
+- 1.2: Added language detection to skip translation of words already in the desired language.
 - 1.1.4: Added notifications for macOS & Windows.
 - 1.1.3: Added support for macOS.
 - 1.1.2: Added `try/except` to fix a `FileNotFoundError` error.
 - 1.1.1: Fixed `io.open` bug; added some `try/except` to catch more errors; re-enabled `timeout_time`; added `last_word` export so it's easy to see which words are new and which are old. Published in [Releases](https://github.com/vardecab/kindle-words/releases).
 - 1.1: Quite a big rewrite: it now works properly with the `My Clippings.txt` file from Kindle - all bugs are fixed. The initial run takes ~10 minutes to complete (depending on the size of your file), but afterwards it's usually < 1 minute because data from the previous run is stored locally for comparison - only new words are translated, to save time and improve speed.
-- 1.0.0: Using new API - [deep-translator](https://github.com/nidhaloff/deep-translator).
+- 1.0.0: Using new backend - [deep-translator](https://github.com/nidhaloff/deep-translator).
 - 0.12.5: Bug in the API discovered.
 - 0.12.4: Cleaned up the code for better readability.
 - 0.12.3: Fixes to `regex` formula so it also takes words with `,` & ``.
25 changes: 18 additions & 7 deletions script.py
@@ -76,7 +76,7 @@
 
 elif platform == "darwin": # macOS
     try:
-        kindle_name = inputimeout(prompt="Enter your Kindle's name: ", timeout=timeout_time).lower()
+        kindle_name = inputimeout(prompt="Enter your Kindle's name (default: Kindle): ", timeout=timeout_time).lower()
         with io.open(f'/Volumes/{kindle_name}/documents/My Clippings.txt', "r", encoding="utf-8") as source_file:
             read_source_file = source_file.readlines() # read the file to [list]
     except TimeoutOccurred:
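The reworded prompt advertises a default Kindle name; the `except` branch that applies it sits outside this hunk, so the fallback below is an assumption. A minimal sketch of the timeout-with-default pattern using `inputimeout`:

```python
# Sketch only: the "kindle" fallback value is assumed, not shown in this diff.
from inputimeout import inputimeout, TimeoutOccurred

timeout_time = 10  # seconds; stand-in for the script's timeout_time
try:
    kindle_name = inputimeout(prompt="Enter your Kindle's name (default: Kindle): ",
                              timeout=timeout_time).lower()
except TimeoutOccurred:
    kindle_name = "kindle"  # assumed default applied after the timeout
print(f"/Volumes/{kindle_name}/documents/My Clippings.txt")
```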
@@ -114,6 +114,7 @@
 read_source_file = [word.replace(',','') for word in read_source_file] # remove character
 read_source_file = [word.replace('.','') for word in read_source_file] # remove character
 read_source_file = [word.replace(';','') for word in read_source_file] # remove character
+read_source_file = [word.replace(':','') for word in read_source_file] # remove character
 read_source_file = [word.replace('“','') for word in read_source_file] # remove character
 read_source_file = [word.replace('”','') for word in read_source_file] # remove character
 read_source_file = [word.replace('’','') for word in read_source_file] # remove character
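This hunk adds a seventh single-character `replace()` pass; an equivalent one-pass idiom (a sketch, not the repo's code) strips all of these characters at once with `str.translate`:

```python
# One-pass alternative to the chained replace() comprehensions above.
table = str.maketrans('', '', ',.;:“”’')  # characters to delete
read_source_file = ['“dom,”', 'house;', 'kot:']  # dummy data
read_source_file = [word.translate(table) for word in read_source_file]
print(read_source_file)  # ['dom', 'house', 'kot']
```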
@@ -136,6 +137,16 @@
 single_words = list(dict.fromkeys(single_words)) # remove duplicates; https://www.w3schools.com/python/python_howto_remove_duplicates.asp
 print("There are", len(single_words), 'unique words in My Clippings file.')
 
+### skip words already in target_language
+from langdetect import detect # language detection
+
+words = []
+print(f'Removing words already in {select_target_language.upper()}...')
+for word in range(len(single_words)):
+    if detect(single_words[word]) != select_target_language: # if a word is already in the target_language then skip it
+        words.append(single_words[word])
+print(f'Without words already in {select_target_language.upper()}, there are {len(words)} words.')
+
 ### open saved list
 import pickle
 try:
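One caveat with the new filtering block: `langdetect.detect()` raises `LangDetectException` on strings it cannot classify (empty strings, bare numbers), and detection on isolated single words is statistically shaky. A guarded sketch of the same idea, with `target_language` standing in for the script's `select_target_language`:

```python
# Sketch of the language filter with an exception guard; not the repo's code.
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def keep_foreign_words(single_words, target_language='en'):
    """Drop words that already appear to be in target_language."""
    kept = []
    for word in single_words:
        try:
            if detect(word) != target_language:
                kept.append(word)
        except LangDetectException:  # e.g. "123" or "" has no detectable features
            kept.append(word)  # keep it and let the translator decide
    return kept

print(keep_foreign_words(['szkoła', 'house', 'kot'], 'en'))
```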
@@ -147,25 +158,25 @@
 
 ### comparison
 try:
-    difference = set(single_words) - set(saved_list) # what's new in single_words[]
+    difference = set(words) - set(saved_list) # what's new in words[]
     if len(saved_list) == 0:
-        difference = set(single_words)
+        difference = set(words)
 except:
-    difference = set(single_words)
+    difference = set(words)
 to_translate = list(difference) # convert set to list
 print("There are", len(to_translate), "new words to translate.")
 
 if len(to_translate) > 0:
     output_lines = '\n'.join(map(str, to_translate))
-    with open(r"output/output-original_words.txt", "a", encoding="utf-8") as output:
+    with open(r"output/output-original_words.txt", "w", encoding="utf-8") as output:
         output.write(output_lines.lower())
 
     ### translation
     # split list into smaller lists to get around the 5000-character limit of the deep-translator package
     chunks = [to_translate[x:x+250] for x in range(0, len(to_translate), 250)] # split into sublists of 250 words each
     print('List of words was split into:', len(chunks), 'chunk(s) for translation.') # debug; how many sublists are in this master list
 
-    from deep_translator import GoogleTranslator, batch_detection
+    from deep_translator import GoogleTranslator
     print('Translating...')
 
     ### export a pair: original → translated
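The comparison logic above now diffs `words` against the pickled `saved_list`, and the chunking line keeps each request within the translator's size limit. Both ideas are small enough to demo in isolation (dummy data, not the repo's):

```python
# Set difference finds only the words not seen on a previous run...
saved_list = ['dom', 'kot']                # pickled from the previous run
words = ['dom', 'kot', 'szkoła', 'okno']   # current run
to_translate = list(set(words) - set(saved_list))
print(len(to_translate), 'new words')      # 2 new words

# ...and chunking keeps each translation request under deep-translator's
# ~5000-character Google limit (250 words per request is the script's heuristic).
chunks = [to_translate[x:x + 250] for x in range(0, len(to_translate), 250)]
print(len(chunks), 'chunk(s)')             # 1 chunk(s)
```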
@@ -188,7 +199,7 @@
 
 ### export list for future comparison
 with open('data/saved_location', 'wb') as file_export:
-    pickle.dump(single_words, file_export)
+    pickle.dump(words, file_export)
 
 ### runtime
 end_time = time.time() # run time end
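Finally, the exported pickle now stores the filtered `words` list rather than `single_words`, so the next run's comparison works against the post-langdetect set. The round trip, sketched with example paths:

```python
# Persistence round trip the comparison step depends on (example paths).
import os
import pickle

os.makedirs('data', exist_ok=True)
words = ['szkoła', 'okno']

with open('data/saved_location', 'wb') as file_export:
    pickle.dump(words, file_export)        # end of this run

with open('data/saved_location', 'rb') as file_import:
    saved_list = pickle.load(file_import)  # start of the next run
print(saved_list == words)                 # True
```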
