added evaluation from bitbucket repo
ragib06 committed Jan 30, 2020
1 parent 30668f4 commit d1f6154
Showing 59 changed files with 20,783 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
@@ -0,0 +1,11 @@
*.DS_Store
*.zip
*.JPG
*.jpg
*.csv
*.o
*.txt
*.pyc
*.ipynb_checkpoints
*.vscode
*.png
10 changes: 10 additions & 0 deletions evaluation/build.sh
@@ -0,0 +1,10 @@
#!/bin/bash
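set -e  # abort if any build step fails

# Build the svm_proprank binaries, install the pyrankagg library, then the Python deps.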

cd svm_proprank
make

cd ../lib/pyrankagg
python setup.py install

cd ../../
pip install -r requirements.txt
4 changes: 4 additions & 0 deletions evaluation/clean.sh
@@ -0,0 +1,4 @@
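#!/bin/bash
# Remove all generated result, temp, and data directories.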
rm -rf nsvm_results
rm -rf psvm_results
rm -rf temp
rm -rf data
146 changes: 146 additions & 0 deletions evaluation/combine.py
@@ -0,0 +1,146 @@
import os
import pdb
import argparse

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from pyrankagg.rankagg import FullListRankAggregator

from combine_utils import *


# Evaluation metrics: each maps (binary relevance labels, 1-based ranks) to a score.
metric_map = {
'arrr' : lambda rel, rank: (rank[rel.astype(bool)]-1).mean(),
'mrr1' : lambda rel, rank: 1/(rank[rel.astype(bool)].mean()),
'mrr2' : lambda rel, rank: 1/(rank[rel.astype(bool)].mean() ** 2),
'ndcg_3' : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=3),
'ndcg_5' : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=5),
'ndcg_7' : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=7),
'ndcg_10' : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=10)
}



def rank(df, algos):
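    """Add a per-query rank column for each algorithm, ranking its scores in descending order (rank 1 = best)."""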
for name, group in df.groupby(['qid']):
qid = group['qid'].min()
for algo in algos:
df.loc[df['qid'] == qid, rank_map[algo]] = group[score_map[algo]].rank(ascending=False).astype(int)
return df


def evaluate(df, algos, metrics):
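    """Rank each query's rows under every algorithm and compute each metric per query; returns one row per qid."""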
results = create_custom_df('rank', algos, metrics)

for name, group in df.groupby(['qid']):
qid = group['qid'].min()

for algo in algos:
df.loc[df['qid'] == qid, rank_map[algo]] = group[score_map[algo]].rank(ascending=False).astype(int)

df_qid = df[df['qid'] == qid]
results['qid'].append(qid)

for m in metrics:
for algo in algos:
rel = np.asarray(df_qid['C'])
rank = np.asarray(df_qid[rank_map[algo]])
res = metric_map[m](rel, rank)
results['%s_%s' % (algo, m)].append(res)

result_df = pd.DataFrame.from_dict(results)

return result_df


def combine_weight(eval_data_train, eval_data_test, X_cols, Y_col):
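    """Fuse the rank columns in X_cols with a balanced logistic regression fit on the
    training queries; returns predicted relevance probabilities for the test rows."""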
X_train = eval_data_train[X_cols]
    Y_train = eval_data_train[Y_col]
clf = LogisticRegression(solver='lbfgs', class_weight='balanced').fit(X_train, Y_train)

X_test = eval_data_test[X_cols]
return clf.predict_proba(X_test)[:, 1]


def rank_aggregate(eval_data_train, eval_data_test, X_cols, Y_col):
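    """Fuse the rank columns in X_cols per query via pyrankagg's FullListRankAggregator;
    returns negated aggregate ranks so that, like a score, higher means better."""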
df = eval_data_test.copy()
df['rank_agg'] = np.zeros(eval_data_test.shape[0])
FLRA = FullListRankAggregator()
for name, group in df.groupby(['qid']):
qid = group['qid'].min()

rank_dicts = []
for algo_rank in X_cols:
rank_dicts.append(df.loc[df['qid'] == qid, algo_rank].to_dict())

com_ranks = FLRA.aggregate_ranks(rank_dicts, areScores=False)
com_ranks = pd.DataFrame.from_dict(com_ranks, orient='index')[0]
df.loc[df['qid'] == qid, 'rank_agg'] = com_ranks.astype(int)

return -df['rank_agg']


def main():
parser = argparse.ArgumentParser()
parser.add_argument('-hr', default='data/result/svm_eta2_first', help='heckman results csv.')
    parser.add_argument('-pr', default='psvm_results/svm_eta2_first', help='predictions from prop-svm.')
    parser.add_argument('-nr', default='nsvm_results/svm_eta2_first', help='predictions from naive-svm.')
    parser.add_argument('-n', type=int, default=29, help='max threshold to combine.')
parser.add_argument('-eta', type=str, default='2', help='eta.')
parser.add_argument('-out', default='results/10_pass_combined_eta2_first.csv', help='combined results csv.')
args = parser.parse_args()

metrics = ['arrr', 'mrr1', 'ndcg_10']
algos = ['naive-svm', 'prop-svm', 'heckman']
algos_to_combine = ['prop-svm', 'heckman']
combine_methods = ['combinedw', 'combined-agg']
combine_map = {'combinedw' : combine_weight, 'combined-agg': rank_aggregate}

results = create_custom_df('result', algos + combine_methods, metrics)

for i in range(args.n+1):
print('combining see %d' % i)
try:
# Load training scores and combine
file_heckman_train_result = os.path.join(args.hr, 'train_scores_see%d_%s.csv' % (i, args.eta))
file_psvm_train_result = os.path.join(args.pr, 'prediction_train_see%d.txt' % (i))
eval_data_train = combine(file_heckman_train_result, file_psvm_train_result, 's_score')
eval_data_train = rank(eval_data_train, algos_to_combine)

# Load test scores and combine
file_heckman_test_result = os.path.join(args.hr, 'test_scores_see%d_%s.csv' % (i, args.eta))
file_psvm_test_result = os.path.join(args.pr, 'prediction_test_see%d.txt' % (i))
file_nsvm_test_result = os.path.join(args.nr, 'naive_prediction_test_see%d.txt' % (i))
eval_data_test = combine(file_heckman_test_result, file_psvm_test_result, 's_score', file_nsvm_test_result, 'n_score')
eval_data_test = rank(eval_data_test, algos)
        except Exception:
            # score files for this see threshold may be missing; skip it
            continue

# Predict on test scores
X_cols = [rank_map[algo] for algo in algos_to_combine]

for comb_name in combine_methods:
comb_method = combine_map[comb_name]
eval_data_test[score_map[comb_name]] = comb_method(eval_data_train, eval_data_test, X_cols, 'C')

# generate ranking based on final score
all_algos = algos + combine_methods
result_df = evaluate(eval_data_test, all_algos, metrics)

results['see'].append(i)
for m in metrics:
for algo in (all_algos):
algo_m = '%s_%s' % (algo, m)
results[algo_m].append(result_df[algo_m].mean())

df = pd.DataFrame.from_dict(results)

for m in metrics:
dump_result_for_metric(df, all_algos, m, pretty_map, args.out)



if __name__ == "__main__":
main()
87 changes: 87 additions & 0 deletions evaluation/combine_utils.py
@@ -0,0 +1,87 @@
import os
import pdb

import numpy as np
import pandas as pd

from letor_metrics import *


# Per-algorithm DataFrame column names for ranks and scores, plus pretty display names.
rank_map = {
'heckman': 'h_rank',
'naive-svm' : 'n_rank',
'prop-svm' : 's_rank',
'combinedw' : 'c_rank',
'combined-agg' : 'agg_rank',
}
score_map = {
'heckman': 'h_score',
'prop-svm' : 's_score',
'naive-svm' : 'n_score',
'combinedw' : 'cw_score',
'combined-agg' : 'ca_score',
}
pretty_map = {
'heckman': 'Heckman',
'prop-svm' : 'PropSVM',
'naive-svm' : 'NaiveSVM',
'combinedw' : 'CombinedW',
'combined-agg' : 'RankAgg'
}


def load_svm_result(svm_pred_path, eval_data, score_name):
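    """Read one score per line from svm_pred_path and attach them to eval_data as column
    score_name, consuming the file query by query in groupby order."""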
with open(svm_pred_path, 'r') as svm_file:
for name, group in eval_data.groupby(['qid']):
num_lines = group.shape[0]

qid = group['qid'].min()
scores = []
for i in range(num_lines):
                try:
                    scores.append(float(svm_file.readline().strip()))
                except ValueError:
                    print("ERROR: heckman/svm line mismatch!!")

eval_data.loc[eval_data['qid'] == qid, score_name] = scores


def combine(heckman_file_path, psvm_file_path, psvm_score='s_score', nsvm_file_path='', nsvm_score='n_score'):
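    """Load the Heckman scores CSV and attach min-max normalized prop-SVM scores
    (and, when a path is given, naive-SVM scores) as extra columns."""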
eval_data = pd.read_csv(heckman_file_path)
eval_data['qid'] = eval_data['qid'].astype(int)

load_svm_result(psvm_file_path, eval_data, psvm_score)
eval_data[psvm_score] = (eval_data[psvm_score] - eval_data[psvm_score].min()) / (eval_data[psvm_score].max() - eval_data[psvm_score].min())

if nsvm_file_path != '':
load_svm_result(nsvm_file_path, eval_data, nsvm_score)
eval_data[nsvm_score] = (eval_data[nsvm_score] - eval_data[nsvm_score].min()) / (eval_data[nsvm_score].max() - eval_data[nsvm_score].min())

return eval_data


def create_custom_df(name, algos, metrics):
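    """Build an empty results dict keyed by 'see' (name='result') or 'qid' (name='rank'),
    with one empty list per (algo, metric) combination."""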
init_map = {
'result' : 'see',
'rank' : 'qid'
}

results = {
init_map[name] : []
}

for algo in algos:
for m in metrics:
results['%s_%s' % (algo, m)] = []

return results


def dump_result_for_metric(df, algos, metric, pretty_map, out_file):
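    """Write a per-metric CSV with the 'see' column plus one pretty-named column per
    algorithm; the metric name is appended to the output file stem."""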
    out = os.path.splitext(out_file)[0]
out_m = '%s_%s.csv' % (out, metric)
cols = ['see']
algo_cols = ['%s_%s' % (algo, metric) for algo in algos]
cols += algo_cols
result_m = df[cols].rename(columns=dict([('%s_%s' % (algo, metric), pretty_map[algo]) for algo in algos]))
result_m.to_csv(out_m, index=False)
14 changes: 14 additions & 0 deletions evaluation/convert.sh
@@ -0,0 +1,14 @@
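#!/bin/bash
# Usage: convert.sh <eta> <pass> <n> <source> <target>
# Runs psvm_to_heckman.py for thresholds 0..n in parallel, then waits for all jobs.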

eta=$1
pass=$2
n=$3
source=$4
target=$5

for i in $(seq -f "%g" 0 $n)
do
python psvm_to_heckman.py -s "$source" -t "$target" -n "$i" -eta "$eta" -npass "$pass" &
done
wait

echo "convert done"
37 changes: 37 additions & 0 deletions evaluation/dump_batch.sh
@@ -0,0 +1,37 @@
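#!/bin/bash
# Usage: dump_batch.sh <dataset_dir> <eta> <npass> <out_dir> <n>
# Dumps train and test click logs for see 0..n via dump_df.py, at most $njobs jobs at a time.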
ds=$1
eta=$2
npass=$3
out=$4
n=$5

njobs=4

procs=0
for seen in $(seq 0 $n);
do
python dump_df.py -f $ds"/heckman_svm."$npass"pass.see"$seen".eta"$eta".train" -o $out -dt train -eta $eta &
echo "train done "$seen

procs=$(( $procs + 1 ))
if [ $procs -eq $njobs ];
then
wait
procs=0
fi
done
wait

procs=0
for seen in $(seq 0 $n);
do
python dump_df.py -f $ds"/heckman_svm."$npass"pass.see"$seen".eta"$eta".test" -o $out -dt test -eta $eta &
echo "test done "$seen

if [ $procs -eq $njobs ];
then
wait
procs=0
fi
done
wait

19 changes: 19 additions & 0 deletions evaluation/dump_batch_3.sh
@@ -0,0 +1,19 @@
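#!/bin/bash
# Usage: dump_batch_3.sh <dataset_dir> <eta> <name> <npass> <n>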
ds=$1
eta=$2
name=$3
npass=$4
n=$5

mkdir -p "data/pickles/"$name"_eta"$eta"_first/"
sh dump_batch.sh $ds"/eta"$eta"/first_run/" $eta $npass "data/pickles/"$name"_eta"$eta"_first/" $n
wait
echo "first run done"

# mkdir -p "data/pickles/"$name"_eta"$eta"_second/"
# sh dump_batch.sh $ds"/eta"$eta"/second_run/" $eta $npass "data/pickles/"$name"_eta"$eta"_second/" $n
# wait
# echo "second run done"

# mkdir -p "data/pickles/"$name"_eta"$eta"_third/"
# sh dump_batch.sh $ds"/eta"$eta"/third_run/" $eta $npass "data/pickles/"$name"_eta"$eta"_third/" $n
# echo "third run done"
56 changes: 56 additions & 0 deletions evaluation/dump_df.py
@@ -0,0 +1,56 @@
import pdb
import argparse
import numpy as np
import pandas as pd

def load_clicks(file_path, dim=700):
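    """Parse a click-log file whose lines look like 'qid C X1:v1 X2:v2 ... S' into a
    DataFrame with columns qid, C, S, X1..Xdim; missing feature values become 0.0."""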
cols = ['qid', 'C', 'S']
cols += ['X'+str(i+1) for i in range(dim)]

    with open(file_path) as f:
        num_lines = sum(1 for _ in f)

data = {}
for c in cols:
data[c] = np.zeros(num_lines)

with open(file_path) as f:
lcount = 0
for line in f:
tokens = line.strip().split(' ')

data['qid'][lcount] = int(tokens[0])
data['C'][lcount] = int(tokens[1])
data['S'][lcount] = int(tokens[-1])

for t in tokens[2:-2]:
col = 'X' + t.split(':')[0]
data[col][lcount] = float(t.split(':')[1])

lcount += 1

df = pd.DataFrame.from_dict(data)
df = df.fillna(0.0)
return df


def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', default='data/test.Ragib_svm_to_anything.see10.1pass.eta1', help='data file path.')
parser.add_argument('-o', default='data/pickles/', help='output dir for pickles.')
parser.add_argument('-dt', default='train', help='train/test.')
parser.add_argument('-eta', default='1.0', help='eta.')
parser.add_argument('-fmt', default='csv', help='file format (pkl/csv).')
args = parser.parse_args()

df = load_clicks(args.f)
see = args.f.split('/')[-1].split('.')[2] # sample: naive_svm.5pass.[see19].eta2.train

if args.fmt == 'pkl':
df.to_pickle(args.o + '/' + args.dt + '_clicks_' + see + '_' + args.eta + '.pkl')
elif args.fmt == 'csv':
df.to_csv(args.o + '/' + args.dt + '_clicks_' + see + '_' + args.eta + '.csv', index=False)
    else:
        raise ValueError('unsupported format: %s' % args.fmt)

if __name__ == "__main__":
main()