Skip to content

Commit

Permalink
code documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
internaut committed Jan 10, 2016
1 parent edf2555 commit c26983a
Showing 1 changed file with 78 additions and 23 deletions.
101 changes: 78 additions & 23 deletions lbg.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#
# Python 3!
#
"""
Linde-Buzo-Gray / Generalized Lloyd algorithm implementation in Python *3*.
Heuristic process that can be used to generate cluster points from a large number of multidimensional vectors.
"""

import dist
from functools import reduce
Expand All @@ -11,6 +12,16 @@


def generate_codebook(data, size_codebook, epsilon=0.00001):
"""
Generate codebook of size <size_codebook> with convergence value <epsilon>. Will return a tuple with the
generated codebook, a vector with absolute weights and a vector with relative weights (the weight denotes how many
vectors of <data> are in the proximity of the codevector).
:param data: input data with N k-dimensional vectors
:param size_codebook: codebook size. Because the codevectors are split on each iteration, this should be a
power-of-2-value
:param epsilon: convergence value
:return tuple of: codebook of size <size_codebook>, absolute weights, relative weights
"""
global _size_data, _dim

_size_data = len(data)
Expand All @@ -27,21 +38,31 @@ def generate_codebook(data, size_codebook, epsilon=0.00001):
c0 = avg_vec_of_vecs(data, _dim, _size_data)
codebook.append(c0)

while len(codebook) < size_codebook:
avg_dist = avg_distortion_c0(c0, data)
# calculate the average distortion
avg_dist = avg_distortion_c0(c0, data)

codebook, codebook_abs_weights, codebook_rel_weights = split_codebook(data, codebook,
codebook_abs_weights,
codebook_rel_weights,
epsilon, avg_dist)
# split codevectors until we have enough
while len(codebook) < size_codebook:
codebook, codebook_abs_weights, codebook_rel_weights, avg_dist = split_codebook(data, codebook,
epsilon, avg_dist)

return codebook, codebook_abs_weights, codebook_rel_weights


def split_codebook(data, codebook, abs_weights, rel_weights, epsilon, initial_avg_dist):
def split_codebook(data, codebook, epsilon, initial_avg_dist):
"""
Split the codebook so that each codevector in the codebook is split into two.
:param data: input data
:param codebook: input codebook. its codevectors will be split into two.
:param epsilon: convergence value
:param initial_avg_dist: initial average distortion
:return Tuple with new codebook, codebook absolute weights, codebook relative weights and average distortion
"""

# split codevectors
new_codevectors = []
for c in codebook:
# the new codevectors c1 and c2 will be moved by epsilon and -epsilon so as to be apart from each other
c1 = new_codevector(c, epsilon)
c2 = new_codevector(c, -epsilon)
new_codevectors.extend((c1, c2))
Expand All @@ -53,18 +74,20 @@ def split_codebook(data, codebook, abs_weights, rel_weights, epsilon, initial_av

# print('> splitting to size', len_codebook)

# try to reach a convergence by minimizing the average distortion. this is done by moving the codevectors step by
# step to the center of the points in their proximity
avg_dist = 0
err = epsilon + 1
num_iter = 0
while err > epsilon:
# find closest codevectors for each vector in data
closest_c_list = [None] * _size_data
vecs_near_c = defaultdict(list)
vec_idxs_near_c = defaultdict(list)
for i, vec in enumerate(data):
# find closest codevectors for each vector in data (find the proximity of each codevector)
closest_c_list = [None] * _size_data # list that contains the nearest codevector for each input data vector
vecs_near_c = defaultdict(list) # list with codevector index -> input data vector mapping
vec_idxs_near_c = defaultdict(list) # list with codevector index -> input data index mapping
for i, vec in enumerate(data): # for each input vector
min_dist = None
closest_c_index = None
for i_c, c in enumerate(codebook):
for i_c, c in enumerate(codebook): # for each codevector
d = dist.euclid_squared(vec, c)
if min_dist is None or d < min_dist: # found new closest codevector
min_dist = d
Expand All @@ -73,31 +96,42 @@ def split_codebook(data, codebook, abs_weights, rel_weights, epsilon, initial_av
vecs_near_c[closest_c_index].append(vec)
vec_idxs_near_c[closest_c_index].append(i)

# update codebook
for i_c in range(len_codebook):
vecs = vecs_near_c.get(i_c) or []
# update codebook: recalculate each codevector so that it sits in the center of the points in their proximity
for i_c in range(len_codebook): # for each codevector index
vecs = vecs_near_c.get(i_c) or [] # get its proximity input vectors
num_vecs_near_c = len(vecs)
if num_vecs_near_c > 0:
new_c = avg_vec_of_vecs(vecs, _dim)
codebook[i_c] = new_c
for i in vec_idxs_near_c[i_c]:
new_c = avg_vec_of_vecs(vecs, _dim) # calculate the new center
codebook[i_c] = new_c # update in codebook
for i in vec_idxs_near_c[i_c]: # update in input vector index -> codevector mapping list
closest_c_list[i] = new_c

# update the weights
abs_weights[i_c] = num_vecs_near_c
rel_weights[i_c] = num_vecs_near_c / _size_data

# recalculate average distortion value
prev_avg_dist = avg_dist if avg_dist > 0 else initial_avg_dist
avg_dist = avg_distortion_c_list(closest_c_list, data)

# recalculate the new error value
err = (prev_avg_dist - avg_dist) / prev_avg_dist
# print(closest_c_list)
# print('> iteration', num_iter, 'avg_dist', avg_dist, 'prev_avg_dist', prev_avg_dist, 'err', err)

num_iter += 1

return codebook, abs_weights, rel_weights
return codebook, abs_weights, rel_weights, avg_dist


def avg_vec_of_vecs(vecs, dim=None, size=None):
"""
Calculate average vector (center vector) for input vectors <vecs>.
:param vecs: input vectors
:param dim: dimension of <vecs> if it was already calculated
:param size: size of <vecs> if it was already calculated
:return average vector (center vector) for input vectors <vecs>
"""
size = size or len(vecs)
dim = dim or len(vecs[0])
avg_vec = [0.0] * dim
Expand All @@ -109,10 +143,23 @@ def avg_vec_of_vecs(vecs, dim=None, size=None):


def new_codevector(c, e):
    """
    Derive a new codevector from <c> by scaling each of its components by (1 + <e>).
    :param c: base codevector
    :param e: move factor; positive and negative values push the result in opposite directions
    :return new codevector as a list
    """
    moved = []
    for component in c:
        # every component is multiplied by the same (1 + e) factor
        moved.append(component * (1.0 + e))
    return moved


def avg_distortion_c0(c0, data, size=None):
"""
Average distortion of <c0> in relation to <data> (i.e. how well does <c0> describe <data>?).
:param c0: comparison vector
:param data: sample data
:param size: size of <data> if it was already calculated
:return average distortion
"""
size = size or _size_data
return reduce(lambda s, d: s + d / size,
(dist.euclid_squared(c0, vec)
Expand All @@ -121,6 +168,14 @@ def avg_distortion_c0(c0, data, size=None):


def avg_distortion_c_list(c_list, data, size=None):
"""
Average distortion between input samples <data> and a list <c_list> that contains a codevector for each point in
<data>.
:param c_list: list that contains a codevector for each point in <data>
:param data: input samples
:param size: Size of <data> if it was already calculated
:return: average distortion
"""
size = size or _size_data
return reduce(lambda s, d: s + d / size,
(dist.euclid_squared(c_i, data[i])
Expand Down

0 comments on commit c26983a

Please sign in to comment.