update seedrecover wordlist support & add new unit tests

* support seeds with just the first four letters of each word (for gurnec#114)
* add new BIP39 languages Italian and Korean
* remove Electrum2 French language (was planned but never finished)
* restrict Electrum2 seeds to those languages supported by Electrum2
* update wordlists/README.md to reflect all of the above
* add new seedrecover unit tests for each type of seed typo
wanyvic · Oct 19, 2017 · aa66ca3 · aa66ca3
1 parent 56a0dbf
commit aa66ca3
Show file tree

Hide file tree

Showing 7 changed files with 4,229 additions and 1,760 deletions.
diff --git a/btcrecover/btcrseed.py b/btcrecover/btcrseed.py
@@ -28,7 +28,7 @@
 # (all optional futures for 2.7 except unicode_literals)
 from __future__ import print_function, absolute_import, division
 
-__version__ = "0.7.2"
+__version__ = "0.7.3"
 
 from . import btcrpass
 from .addressset import AddressSet
@@ -770,18 +770,30 @@ def _performance_xpub():
 
 @register_selectable_wallet_class("Standard BIP39/BIP44 (Mycelium, TREZOR, Ledger, Bither, Blockchain.info, Jaxx)")
 class WalletBIP39(WalletBIP32):
+ FIRSTFOUR_TAG = "-firstfour"
 
  # Load the wordlists for all languages (actual one to use is selected in config_mnemonic() )
  _language_words = {}
  @classmethod
- def _load_wordlists(cls, name = "bip39"):
- for filename in glob.iglob(os.path.join(wordlists_dir, name + "-??*.txt")):
- wordlist_lang = os.path.basename(filename)[len(name)+1:-4] # e.g. "en", or "zh-hant"
- if wordlist_lang in cls._language_words:
- continue # skips loading bip39-fr if electrum2-fr is already loaded
- wordlist = load_wordlist(name, wordlist_lang)
- assert len(wordlist) == 2048 or cls is not WalletBIP39, "BIP39 wordlist has 2048 words"
- cls._language_words[wordlist_lang] = wordlist
+ def _load_wordlists(cls):
+ assert not cls._language_words, "_load_wordlists() should only be called once from the first init()"
+ cls._do_load_wordlists("bip39")
+ for wordlist_lang in cls._language_words.keys(): # takes a copy of the keys so the dict can be safely changed
+ wordlist = cls._language_words[wordlist_lang]
+ assert len(wordlist) == 2048, "BIP39 wordlist has 2048 words"
+ # Special case for the four languages whose words may be truncated to the first four letters
+ if wordlist_lang in ("en", "es", "fr", "it"):
+ cls._language_words[wordlist_lang + cls.FIRSTFOUR_TAG] = [ w[:4] for w in wordlist ]
+ #
+ @classmethod
+ def _do_load_wordlists(cls, name, wordlist_langs = None):
+ if not wordlist_langs:
+ wordlist_langs = []
+ for filename in glob.iglob(os.path.join(wordlists_dir, name + "-??*.txt")):
+ wordlist_langs.append(os.path.basename(filename)[len(name)+1:-4]) # e.g. "en", or "zh-hant"
+ for lang in wordlist_langs:
+ assert lang not in cls._language_words, "wordlist not already loaded"
+ cls._language_words[lang] = load_wordlist(name, lang)
 
  @property
  def word_ids(self): return self._words
@@ -831,6 +843,33 @@ def config_mnemonic(self, mnemonic_guess = None, lang = None, passphrase = u"",
  raise ValueError("this version of Python doesn't support passphrases with Unicode code points > "+str(sys.maxunicode))
  self._derivation_salt = "mnemonic" + self._unicode_to_bytes(passphrase)
 
+ # Special case for wallets which tell users to record only the first four letters of each word;
+ # convert all short words into long ones (intentionally done *after* the finding of close words).
+ # Specifically, update self._words and the globals mnemonic_ids_guess and close_mnemonic_ids.
+ if self._lang.endswith(self.FIRSTFOUR_TAG):
+ long_lang_words = self._language_words[self._lang[:-len(self.FIRSTFOUR_TAG)]]
+ assert isinstance(long_lang_words[0], unicode), "long words haven't yet been converted into bytes"
+ assert isinstance(self._words[0], bytes), "short words have already been converted into bytes"
+ assert len(long_lang_words) == len(self._words), "long and short word lists have the same length"
+ long_lang_words = [ self._unicode_to_bytes(l) for l in long_lang_words ]
+ short_to_long = { s:l for s,l in zip(self._words, long_lang_words) }
+ self._words = long_lang_words
+ #
+ global mnemonic_ids_guess # the to-be-replaced short-words guess
+ long_ids_guess = () # the new long-words guess
+ for short_id in mnemonic_ids_guess:
+ long_ids_guess += None if short_id is None else short_to_long[short_id],
+ mnemonic_ids_guess = long_ids_guess
+ #
+ global close_mnemonic_ids
+ if close_mnemonic_ids:
+ assert isinstance(close_mnemonic_ids.iterkeys() .next(), bytes), "close word keys have already been converted into bytes"
+ assert isinstance(close_mnemonic_ids.itervalues().next()[0][0], bytes), "close word values have already been converted into bytes"
+ for key in close_mnemonic_ids.keys(): # takes a copy of the keys so the dict can be safely changed
+ vals = close_mnemonic_ids.pop(key)
+ # vals is a tuple containing length-1 tuples which in turn each contain one word in bytes-format
+ close_mnemonic_ids[short_to_long[key]] = tuple( (short_to_long[v[0]],) for v in vals )
+
  # Calculate each word's index in binary (needed by _verify_checksum())
  self._word_to_binary = { word : "{:011b}".format(i) for i,word in enumerate(self._words) }
 
@@ -1071,8 +1110,9 @@ class WalletElectrum2(WalletBIP39):
  # Load the wordlists for all languages (actual one to use is selected in config_mnemonic() )
  @classmethod
  def _load_wordlists(cls):
- super(WalletElectrum2, cls)._load_wordlists("electrum2") # the Electrum2-specific word lists
- super(WalletElectrum2, cls)._load_wordlists() # the default bip39 word lists
+ assert not cls._language_words, "_load_wordlists() should only be called once from the first init()"
+ cls._do_load_wordlists("electrum2")
+ cls._do_load_wordlists("bip39", ("en", "es", "ja", "zh-hans")) # only the four bip39 ones used by Electrum2
  assert all(len(w) >= 1411 for w in cls._language_words.values()), \
  "Electrum2 wordlists are at least 1411 words long" # because we assume a max mnemonic length of 13
 

diff --git a/btcrecover/sha512-bc-kernel.cl b/btcrecover/sha512-bc-kernel.cl
@@ -198,7 +198,7 @@ void kernel_sha512_bc(__global uint64_t* hashes_buffer,
  for (int i = 0; i < 8; i++)
  w[i] = SWAP64(hashes_buffer[i]);
 
- // Assumes original input length was 64 bytes
+ // Assumes original input length was 64 bytes; add padding to it
  w[8] = 0x8000000000000000UL; // The appended "1" bit
  #pragma unroll
  for (int i = 9; i < 15; i++)
@@ -258,7 +258,7 @@ void kernel_sha512_bc(__global uint64_t* hashes_buffer,
  w[6] = g + H6;
  w[7] = h + H7;
 
- // Assumes original input length was 64 bytes
+ // SHA512 output length is always 64 bytes; add padding to it
  w[8] = 0x8000000000000000UL; // The appended "1" bit
  #pragma unroll
  for (int i = 9; i < 15; i++)

diff --git a/btcrecover/test/test_seeds.py b/btcrecover/test/test_seeds.py
@@ -177,6 +177,12 @@ def test_bip44(self):
  "xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx",
  "certain come keen collect slab gauge photo inside mechanic deny leader drop")
 
+ def test_bip44_firstfour(self):
+ # an xpub at path m/44'/0'/0', as Mycelium for Android would export
+ self.mpk_tester(btcrseed.WalletBIP39,
+ "xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx",
+ "cert come keen coll slab gaug phot insi mech deny lead drop")
+
  def test_bip44_ja(self):
  # an xpub at path m/44'/0'/0'
  self.mpk_tester(btcrseed.WalletBIP39,
@@ -423,12 +429,74 @@ def test_bip44(self):
  "certain come keen collect slab gauge photo inside mechanic deny leader drop")
 
 
-# All seed tests are quick
-# TODO: remove slow TestAddressSet.test_false_positives from QuickTests
-class QuickTests(unittest.TestSuite) :
+class TestSeedTypos(unittest.TestCase):
+ XPUB = "xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx"
+
+ def seed_tester(self, the_mpk, correct_mnemonic, mnemonic_guess, typos = None, big_typos = 0):
+ correct_mnemonic = correct_mnemonic.split()
+ assert mnemonic_guess.split() != correct_mnemonic
+ assert typos or big_typos
+ btcrseed.loaded_wallet = btcrseed.WalletBIP39.create_from_params(mpk=the_mpk)
+ btcrseed.loaded_wallet.config_mnemonic(mnemonic_guess)
+ self.assertEqual(
+ btcrseed.run_btcrecover(typos or big_typos, big_typos, extra_args="--threads 1".split()),
+ tuple(correct_mnemonic))
+
+ def test_delete(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "certain come come keen collect slab gauge photo inside mechanic deny leader drop", # guess
+ typos=1)
+
+ def test_replacewrong(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "certain X keen collect slab gauge photo inside mechanic deny leader drop", # guess
+ big_typos=1)
+
+ def test_insert(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ " come keen collect slab gauge photo inside mechanic deny leader drop", # guess
+ big_typos=1)
+
+ def test_swap(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "certain keen come collect slab gauge photo inside mechanic deny leader drop", # guess
+ typos=1)
+
+ def test_replace(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "disagree come keen collect slab gauge photo inside mechanic deny leader drop", # guess
+ big_typos=1)
+
+ def test_replaceclose(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "certain become keen collect slab gauge photo inside mechanic deny leader drop", # guess
+ typos=1)
+
+ def test_replaceclose_firstfour(self):
+ self.seed_tester(self.XPUB,
+ "certain come keen collect slab gauge photo inside mechanic deny leader drop", # correct
+ "cere come keen coll slab gaug phot insi mech deny lead drop", # guess
+ # "cere" is close to "cert" in the en-firstfour language, even though "cereal" is not close to "certain"
+ typos=1)
+
+
+# All seed tests except TestAddressSet.test_false_positives are quick
+class QuickTests(unittest.TestSuite):
  def __init__(self):
  super(QuickTests, self).__init__()
- self.addTests(unittest.defaultTestLoader.loadTestsFromModule(sys.modules[__name__]))
+ for suite in unittest.defaultTestLoader.loadTestsFromModule(sys.modules[__name__]):
+ if isinstance(suite._tests[0], TestAddressSet):
+ for test_num in xrange(len(suite._tests)):
+ if suite._tests[test_num]._testMethodName == "test_false_positives":
+ del suite._tests[test_num]
+ break
+ self.addTests(suite)
 
 
 if __name__ == b'__main__':

diff --git a/btcrecover/wordlists/README.md b/btcrecover/wordlists/README.md
@@ -12,12 +12,16 @@ The wordlist files themselves were copied verbatim from the sources above, inclu
 
 *seedrecover.py* attempts to guess the correct language of the mnemonic it is trying to recover, however it may not always guess correctly (in particular when it comes to Chinese). You can instruct *seedrecover.py* to use a specific language via the `--language LANG-CODE` option.
 
-The available `LANG-CODE`s are taken from the filenames in the same directory as this file; they follow the first `-` in their filenames. Specifically, in alphabetical order they are:
+The available `LANG-CODE`s (based on ISO 639-1) are taken from the filenames in the same directory as this file; they follow the first `-` in their filenames. Specifically, in alphabetical order they are:
 
- * Chinese (simplified) (BIP-39 only) - `zh-hans`
- * Chinese (traditional) (BIP-39 only) - `zh-hant`
+ * Chinese, simplified - `zh-hans`
+ * Chinese, traditional (BIP-39 only) - `zh-hant`
  * English - `en`
- * French (Electrum 2.x only) - `fr`
+ * French (BIP-39 only) - `fr`
+ * Italian (BIP-39 only) - `it`
  * Japanese - `ja`
+ * Korean (BIP-39 only) - `ko`
  * Portuguese (Electrum 2.x only) - `pt` 
  * Spanish - `es`
+
+There are also four "firstfour" language codes based on the ones above: `en-firstfour`, `es-firstfour`, `fr-firstfour`, and `it-firstfour`. If your wallet software had you record only the first four letters of each mnemonic word, specify one of these language codes.