Skip to content

Commit

Permalink
update seedrecover wordlist support & add new unit tests
Browse files Browse the repository at this point in the history
 * support seeds w/just the first four letters of each word (for gurnec#114)
 * add new BIP39 languages Italian and Korean
 * remove Electrum2 French language (was planned but never finished)
 * restrict Electrum2 seeds to those languages supported by Electrum2
 * update wordlists/README.md to reflect all of the above
 * add new seedrecover unit tests for each type of seed typo
  • Loading branch information
gurnec committed Oct 19, 2017
1 parent 56a0dbf commit aa66ca3
Show file tree
Hide file tree
Showing 7 changed files with 4,229 additions and 1,760 deletions.
62 changes: 51 additions & 11 deletions btcrecover/btcrseed.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
# (all optional futures for 2.7 except unicode_literals)
from __future__ import print_function, absolute_import, division

__version__ = "0.7.2"
__version__ = "0.7.3"

from . import btcrpass
from .addressset import AddressSet
Expand Down Expand Up @@ -770,18 +770,30 @@ def _performance_xpub():

@register_selectable_wallet_class("Standard BIP39/BIP44 (Mycelium, TREZOR, Ledger, Bither, Blockchain.info, Jaxx)")
class WalletBIP39(WalletBIP32):
FIRSTFOUR_TAG = "-firstfour"

# Load the wordlists for all languages (actual one to use is selected in config_mnemonic() )
_language_words = {}
@classmethod
def _load_wordlists(cls, name = "bip39"):
for filename in glob.iglob(os.path.join(wordlists_dir, name + "-??*.txt")):
wordlist_lang = os.path.basename(filename)[len(name)+1:-4] # e.g. "en", or "zh-hant"
if wordlist_lang in cls._language_words:
continue # skips loading bip39-fr if electrum2-fr is already loaded
wordlist = load_wordlist(name, wordlist_lang)
assert len(wordlist) == 2048 or cls is not WalletBIP39, "BIP39 wordlist has 2048 words"
cls._language_words[wordlist_lang] = wordlist
def _load_wordlists(cls):
assert not cls._language_words, "_load_wordlists() should only be called once from the first init()"
cls._do_load_wordlists("bip39")
for wordlist_lang in cls._language_words.keys(): # takes a copy of the keys so the dict can be safely changed
wordlist = cls._language_words[wordlist_lang]
assert len(wordlist) == 2048, "BIP39 wordlist has 2048 words"
# Special case for the four languages whose words may be truncated to the first four letters
if wordlist_lang in ("en", "es", "fr", "it"):
cls._language_words[wordlist_lang + cls.FIRSTFOUR_TAG] = [ w[:4] for w in wordlist ]
#
@classmethod
def _do_load_wordlists(cls, name, wordlist_langs = None):
    """Load the wordlists called `name` for the given languages into cls._language_words.

    name           -- wordlist family; it is the filename prefix, e.g. "bip39"
    wordlist_langs -- iterable of language codes to load; when empty or None,
                      the codes are discovered from the "<name>-<lang>.txt"
                      files found in wordlists_dir

    Each language may be loaded only once; a repeat trips the assertion below.
    """
    if not wordlist_langs:
        # A language code is whatever sits between "<name>-" and ".txt", e.g. "en", or "zh-hant"
        prefix_len = len(name) + 1
        pattern    = os.path.join(wordlists_dir, name + "-??*.txt")
        wordlist_langs = [ os.path.basename(path)[prefix_len:-4] for path in glob.iglob(pattern) ]
    for lang in wordlist_langs:
        assert lang not in cls._language_words, "wordlist not already loaded"
        cls._language_words[lang] = load_wordlist(name, lang)

@property
def word_ids(self):
    """Return the object stored in self._words (no copy is made)."""
    return self._words
Expand Down Expand Up @@ -831,6 +843,33 @@ def config_mnemonic(self, mnemonic_guess = None, lang = None, passphrase = u"",
raise ValueError("this version of Python doesn't support passphrases with Unicode code points > "+str(sys.maxunicode))
self._derivation_salt = "mnemonic" + self._unicode_to_bytes(passphrase)

# Special case for wallets which tell users to record only the first four letters of each word;
# convert all short words into long ones (intentionally done *after* the finding of close words).
# Specifically, update self._words and the globals mnemonic_ids_guess and close_mnemonic_ids.
if self._lang.endswith(self.FIRSTFOUR_TAG):
long_lang_words = self._language_words[self._lang[:-len(self.FIRSTFOUR_TAG)]]
assert isinstance(long_lang_words[0], unicode), "long words haven't yet been converted into bytes"
assert isinstance(self._words[0], bytes), "short words have already been converted into bytes"
assert len(long_lang_words) == len(self._words), "long and short word lists have the same length"
long_lang_words = [ self._unicode_to_bytes(l) for l in long_lang_words ]
short_to_long = { s:l for s,l in zip(self._words, long_lang_words) }
self._words = long_lang_words
#
global mnemonic_ids_guess # the to-be-replaced short-words guess
long_ids_guess = () # the new long-words guess
for short_id in mnemonic_ids_guess:
long_ids_guess += None if short_id is None else short_to_long[short_id],
mnemonic_ids_guess = long_ids_guess
#
global close_mnemonic_ids
if close_mnemonic_ids:
assert isinstance(close_mnemonic_ids.iterkeys() .next(), bytes), "close word keys have already been converted into bytes"
assert isinstance(close_mnemonic_ids.itervalues().next()[0][0], bytes), "close word values have already been converted into bytes"
for key in close_mnemonic_ids.keys(): # takes a copy of the keys so the dict can be safely changed
vals = close_mnemonic_ids.pop(key)
# vals is a tuple containing length-1 tuples which in turn each contain one word in bytes-format
close_mnemonic_ids[short_to_long[key]] = tuple( (short_to_long[v[0]],) for v in vals )

# Calculate each word's index in binary (needed by _verify_checksum())
self._word_to_binary = { word : "{:011b}".format(i) for i,word in enumerate(self._words) }

Expand Down Expand Up @@ -1071,8 +1110,9 @@ class WalletElectrum2(WalletBIP39):
# Load the wordlists for all languages (actual one to use is selected in config_mnemonic() )
@classmethod
def _load_wordlists(cls):
super(WalletElectrum2, cls)._load_wordlists("electrum2") # the Electrum2-specific word lists
super(WalletElectrum2, cls)._load_wordlists() # the default bip39 word lists
assert not cls._language_words, "_load_wordlists() should only be called once from the first init()"
cls._do_load_wordlists("electrum2")
cls._do_load_wordlists("bip39", ("en", "es", "ja", "zh-hans")) # only the four bip39 ones used by Electrum2
assert all(len(w) >= 1411 for w in cls._language_words.values()), \
"Electrum2 wordlists are at least 1411 words long" # because we assume a max mnemonic length of 13

Expand Down
4 changes: 2 additions & 2 deletions btcrecover/sha512-bc-kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ void kernel_sha512_bc(__global uint64_t* hashes_buffer,
for (int i = 0; i < 8; i++)
w[i] = SWAP64(hashes_buffer[i]);

// Assumes original input length was 64 bytes
// Assumes original input length was 64 bytes; add padding to it
w[8] = 0x8000000000000000UL; // The appended "1" bit
#pragma unroll
for (int i = 9; i < 15; i++)
Expand Down Expand Up @@ -258,7 +258,7 @@ void kernel_sha512_bc(__global uint64_t* hashes_buffer,
w[6] = g + H6;
w[7] = h + H7;

// Assumes original input length was 64 bytes
// SHA512 output length is always 64 bytes; add padding to it
w[8] = 0x8000000000000000UL; // The appended "1" bit
#pragma unroll
for (int i = 9; i < 15; i++)
Expand Down
76 changes: 72 additions & 4 deletions btcrecover/test/test_seeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ def test_bip44(self):
"xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx",
"certain come keen collect slab gauge photo inside mechanic deny leader drop")

def test_bip44_firstfour(self):
    # Same wallet as test_bip44, but every mnemonic word is cut down to its
    # first four letters.
    # The xpub is at path m/44'/0'/0', as Mycelium for Android would export
    mycelium_xpub      = "xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx"
    truncated_mnemonic = "cert come keen coll slab gaug phot insi mech deny lead drop"
    self.mpk_tester(btcrseed.WalletBIP39, mycelium_xpub, truncated_mnemonic)

def test_bip44_ja(self):
# an xpub at path m/44'/0'/0'
self.mpk_tester(btcrseed.WalletBIP39,
Expand Down Expand Up @@ -423,12 +429,74 @@ def test_bip44(self):
"certain come keen collect slab gauge photo inside mechanic deny leader drop")


# All seed tests are quick
# TODO: remove slow TestAddressSet.test_false_positives from QuickTests
class QuickTests(unittest.TestSuite) :
class TestSeedTypos(unittest.TestCase):
    """One recovery test per kind of seed typo, all against the same BIP39 wallet."""

    XPUB = "xpub6BgCDhMefYxRS1gbVbxyokYzQji65v1eGJXGEiGdoobvFBShcNeJt97zoJBkNtbASLyTPYXJHRvkb3ahxaVVGEtC1AD4LyuBXULZcfCjBZx"

    # The mnemonic which every test below expects to be recovered
    _CORRECT = "certain come keen collect slab gauge photo inside mechanic deny leader drop"

    def seed_tester(self, the_mpk, correct_mnemonic, mnemonic_guess, typos = None, big_typos = 0):
        """Run a recovery starting from mnemonic_guess and check it finds correct_mnemonic.

        the_mpk          -- master public key handed to WalletBIP39.create_from_params()
        correct_mnemonic -- space-separated words the recovery is expected to return
        mnemonic_guess   -- the (wrong) starting guess handed to config_mnemonic()
        typos, big_typos -- passed on to run_btcrecover(); at least one must be
                            truthy, and big_typos stands in for typos when typos
                            is falsy
        """
        expected_words = correct_mnemonic.split()
        # Sanity checks on the test itself: the guess must actually be wrong,
        # and some typo budget must have been supplied
        assert mnemonic_guess.split() != expected_words
        assert typos or big_typos

        wallet = btcrseed.WalletBIP39.create_from_params(mpk=the_mpk)
        btcrseed.loaded_wallet = wallet
        wallet.config_mnemonic(mnemonic_guess)

        recovered = btcrseed.run_btcrecover(typos or big_typos, big_typos, extra_args="--threads 1".split())
        self.assertEqual(recovered, tuple(expected_words))

    def test_delete(self):
        # the guess has one extra word ("come" is doubled)
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="certain come come keen collect slab gauge photo inside mechanic deny leader drop",
            typos=1)

    def test_replacewrong(self):
        # the second word of the guess is "X"
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="certain X keen collect slab gauge photo inside mechanic deny leader drop",
            big_typos=1)

    def test_insert(self):
        # the guess is missing its first word
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess=" come keen collect slab gauge photo inside mechanic deny leader drop",
            big_typos=1)

    def test_swap(self):
        # the second and third words of the guess are transposed
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="certain keen come collect slab gauge photo inside mechanic deny leader drop",
            typos=1)

    def test_replace(self):
        # the first word of the guess is "disagree" instead of "certain"
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="disagree come keen collect slab gauge photo inside mechanic deny leader drop",
            big_typos=1)

    def test_replaceclose(self):
        # the second word of the guess is "become" instead of "come"
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="certain become keen collect slab gauge photo inside mechanic deny leader drop",
            typos=1)

    def test_replaceclose_firstfour(self):
        # "cere" is close to "cert" in the en-firstfour language, even though "cereal" is not close to "certain"
        self.seed_tester(self.XPUB, self._CORRECT,
            mnemonic_guess="cere come keen coll slab gaug phot insi mech deny lead drop",
            typos=1)


# All seed tests except TestAddressSet.test_false_positives are quick
class QuickTests(unittest.TestSuite):
    """Suite of every test in this module except TestAddressSet.test_false_positives."""

    def __init__(self):
        super(QuickTests, self).__init__()
        module_suites = unittest.defaultTestLoader.loadTestsFromModule(sys.modules[__name__])
        for suite in module_suites:
            # Filter by building a new list rather than deleting from the
            # suite's private _tests list by index; this also copes with a
            # suite that contains no tests at all.
            self.addTests([ test for test in suite if not self._is_slow(test) ])

    @staticmethod
    def _is_slow(test):
        # True only for the one test which is deliberately left out of the quick suite
        return isinstance(test, TestAddressSet) and test._testMethodName == "test_false_positives"


if __name__ == b'__main__':
Expand Down
12 changes: 8 additions & 4 deletions btcrecover/wordlists/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ The wordlist files themselves were copied verbatim from the sources above, inclu

*seedrecover.py* attempts to guess the correct language of the mnemonic it is trying to recover; however, it may not always guess correctly (in particular when it comes to Chinese). You can instruct *seedrecover.py* to use a specific language via the `--language LANG-CODE` option.

The available `LANG-CODE`s are taken from the filenames in the same directory as this file; they follow the first `-` in their filenames. Specifically, in alphabetical order they are:
The available `LANG-CODE`s (based on ISO 639-1) are taken from the filenames in the same directory as this file; they follow the first `-` in their filenames. Specifically, in alphabetical order they are:

* Chinese (simplified) (BIP-39 only) - `zh-hans`
* Chinese (traditional) (BIP-39 only) - `zh-hant`
* Chinese, simplified - `zh-hans`
* Chinese, traditional (BIP-39 only) - `zh-hant`
* English - `en`
* French (Electrum 2.x only) - `fr`
* French (BIP-39 only) - `fr`
* Italian (BIP-39 only) - `it`
* Japanese - `ja`
* Korean (BIP-39 only) - `ko`
* Portuguese (Electrum 2.x only) - `pt`
* Spanish - `es`

There are also four "firstfour" language codes based on the ones above: `en-firstfour`, `es-firstfour`, `fr-firstfour`, and `it-firstfour`. Use one of these language codes when recovering a mnemonic from wallet software that records just the first four letters of each word.
Loading

0 comments on commit aa66ca3

Please sign in to comment.