Skip to content

Commit

Permalink
problem 3: intrinsic dimension
Browse files Browse the repository at this point in the history
  • Loading branch information
orbxball committed May 11, 2017
1 parent 5e5f0e4 commit 1697d1f
Show file tree
Hide file tree
Showing 3 changed files with 203 additions and 0 deletions.
154 changes: 154 additions & 0 deletions hw4/dim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import sys, os
import argparse
import numpy as np
import gendata
from sklearn.neighbors import NearestNeighbors


def ensure_dir(file_path):
directory = os.path.dirname(file_path)
if len(directory) == 0: return
if not os.path.exists(directory):
os.makedirs(directory)


def get_param():
N = np.random.randint(1, 11) * 10000
# the hidden dimension is randomly chosen from [60, 79] uniformly
layer_dims = [np.random.randint(60, 80), 100]
return N, layer_dims


def sampling(N, X):
permu = np.random.permutation(N)[:sample_size]
return X[permu,:]


def NN(fitting_data, query_data):
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree', n_jobs=-1)
return nbrs.fit(fitting_data).kneighbors(query_data)


def get_index(target, nparray):
array = list(nparray)
idx = 0
for i in range(len(array)-1):
if array[i] < target and target <= array[i+1]:
idx = (target - array[i]) / (array[i+1] - array[i]) + i
break
if target >= array[len(array)-1]:
idx = len(array)-1
return idx


def train(sample_size, round_size, table_name):
for dim in range(1, 61):
average_distance = 0.0
print('Dim: {}'.format(dim))
for r in range(round_size):
N, layer_dims = get_param()
X = gendata.gen_data(dim, layer_dims, N)

sample_X = sampling(N, X)
distances, indices = NN(X, sample_X)

avg_d = np.mean(distances[:,1])
print('Round {}: Start NN with N={} hidden_dim={}, distance => {}'.format(r, N, layer_dims[0], avg_d))
average_distance += avg_d
average_distance /= round_size
print('average distance => {}'.format(average_distance))
table.append(average_distance)

print('Saving table...')
np.save(table_name, np.array(table))
print('Saved')
return table


def validate(model, sample_size):
error = 0.0
test_size = 200
validate_round_size = 10
for i in range(test_size):
total_avg = 0.0
dim = np.random.randint(1, 61)
N, layer_dims = get_param()
X = gendata.gen_data(dim, layer_dims, N)

for j in range(validate_round_size):
sample_X = sampling(N, X)
distances, indices = NN(X, sample_X)

avg_d = np.mean(distances[:,1])
total_avg += avg_d
predicted = get_index(total_avg/validate_round_size, model) + 1
print('Validate {}: true => {} predicted => {}'.format(i, dim, predicted))
error += np.absolute(np.log(predicted) - np.log(dim))
error /= test_size
print('Validation: {}'.format(error))


def test(model, test_data):
ans = []
test_round_size = 10
total_avg = 0.0
for i in test_data.keys():
print('Predicting {}...'.format(i))
for j in range(test_round_size):
sample_test_data = sampling(test_data[i].shape[0], test_data[i])
distances, indices = NN(test_data[i], sample_test_data)

avg_d = np.mean(distances[:,1])
total_avg += avg_d
print('Round: {}, avg_d: {}'.format(j, avg_d))
total_avg /= test_round_size
predicted = get_index(total_avg, model) + 1

ans.append(np.log(predicted))
return ans


def main():
try:
print('Loading model...')
model = np.load(model_name)
print('Loading Done!')
except:
print('Start training...')
table = train(sample_size, round_size, table_name)
print('Training Done!')
model = np.array(table)

# validate(model, sample_size)
# sys.exit(-1)

ans = test(model, np.load(data_path))

ensure_dir(output_path)
result = []
for index, value in enumerate(ans):
result.append("{0},{1}".format(index, value))
with open(output_path, "w+") as f:
f.write("SetId,LogDim\n")
f.write("\n".join(result))



if __name__ == '__main__':
parser = argparse.ArgumentParser(description='problem 3: Estimation of Intrinsic Diemnsion')
parser.add_argument('--data', metavar='<#datapath>', type=str, required=True)
parser.add_argument('--out', metavar='<#output>', type=str, required=True)
args = parser.parse_args()

data_path = args.data
output_path = args.out

# table record the average distance of dimension
table = []
model_dir = './model'
table_name = os.path.join(model_dir, 'model2.npy')
sample_size = 50
round_size = 50
model_name = os.path.join(model_dir, 'model2.npy')

main()
3 changes: 3 additions & 0 deletions hw4/dim.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

python3 dim.py --data $1 --out $2
46 changes: 46 additions & 0 deletions hw4/gendata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import numpy as np


def elu(arr):
return np.where(arr > 0, arr, np.exp(arr) - 1)


def make_layer(in_size, out_size):
w = np.random.normal(scale=0.5, size=(in_size, out_size))
b = np.random.normal(scale=0.5, size=out_size)
return (w, b)


def forward(inpd, layers):
out = inpd
for layer in layers:
w, b = layer
out = elu(out @ w + b)

return out


def gen_data(dim, layer_dims, N):
layers = []
data = np.random.normal(size=(N, dim))

nd = dim
for d in layer_dims:
layers.append(make_layer(nd, d))
nd = d

w, b = make_layer(nd, nd)
gen_data = forward(data, layers)
gen_data = gen_data @ w + b
return gen_data


if __name__ == '__main__':
# if we want to generate data with intrinsic dimension of 10
dim = 10
N = 10000
# the hidden dimension is randomly chosen from [60, 79] uniformly
layer_dims = [np.random.randint(60, 80), 100]
data = gen_data(dim, layer_dims, N)
# (data, dim) is a (question, answer) pair
print(data)

0 comments on commit 1697d1f

Please sign in to comment.