Commit: added evaluation from bitbucket repo
Showing 59 changed files with 20,783 additions and 0 deletions.

@@ -0,0 +1,11 @@
*.DS_Store
*.zip
*.JPG
*.jpg
*.csv
*.o
*.txt
*.pyc
*.ipynb_checkpoints
*.vscode
*.png

@@ -0,0 +1,10 @@
#!/bin/bash

cd svm_proprank
make

cd ../lib/pyrankagg
python setup.py install

cd ../../
pip install -r requirements.txt

@@ -0,0 +1,4 @@
rm -rf nsvm_results
rm -rf psvm_results
rm -rf temp
rm -rf data

@@ -0,0 +1,146 @@
import os
import pdb
import argparse

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from pyrankagg.rankagg import FullListRankAggregator

from combine_utils import *


# Each metric takes binary relevance labels (column 'C') and 1-based ranks
# for one query and returns a single score.
metric_map = {
    'arrr'    : lambda rel, rank: (rank[rel.astype(bool)] - 1).mean(),
    'mrr1'    : lambda rel, rank: 1 / (rank[rel.astype(bool)].mean()),
    'mrr2'    : lambda rel, rank: 1 / (rank[rel.astype(bool)].mean() ** 2),
    'ndcg_3'  : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=3),
    'ndcg_5'  : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=5),
    'ndcg_7'  : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=7),
    'ndcg_10' : lambda rel, rank: ndcg_from_ranking(rel, np.argsort(rank), k=10)
}
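
For a concrete sense of two of the metrics defined inline above (the ndcg_* entries call ndcg_from_ranking from letor_metrics, which is not shown in this excerpt), here is a small sanity check with made-up relevance and rank vectors:

import numpy as np

rel = np.array([0, 1, 0, 1, 0])    # binary relevance labels, as in column 'C'
rank = np.array([3, 2, 5, 4, 1])   # 1-based ranks, as produced by .rank(ascending=False)

arrr = (rank[rel.astype(bool)] - 1).mean()   # relevant docs sit at ranks 2 and 4 -> (1 + 3) / 2 = 2.0
mrr1 = 1 / rank[rel.astype(bool)].mean()     # 1 / ((2 + 4) / 2) = 1 / 3 ≈ 0.333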


# Convert each algorithm's scores into 1-based per-query ranks
# (highest score gets rank 1).
def rank(df, algos):
    for name, group in df.groupby(['qid']):
        qid = group['qid'].min()
        for algo in algos:
            df.loc[df['qid'] == qid, rank_map[algo]] = group[score_map[algo]].rank(ascending=False).astype(int)
    return df
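
The per-query ranking above relies on the pandas idiom score -> rank: within a group, .rank(ascending=False) gives the highest score rank 1. A tiny illustration with invented scores (the column names follow score_map/rank_map from combine_utils):

import pandas as pd

group = pd.DataFrame({'qid': [7, 7, 7], 's_score': [0.2, 0.9, 0.5]})
group['s_rank'] = group['s_score'].rank(ascending=False).astype(int)
# s_score 0.9 -> s_rank 1, 0.5 -> s_rank 2, 0.2 -> s_rank 3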


def evaluate(df, algos, metrics):
    results = create_custom_df('rank', algos, metrics)

    for name, group in df.groupby(['qid']):
        qid = group['qid'].min()

        for algo in algos:
            df.loc[df['qid'] == qid, rank_map[algo]] = group[score_map[algo]].rank(ascending=False).astype(int)

        df_qid = df[df['qid'] == qid]
        results['qid'].append(qid)

        for m in metrics:
            for algo in algos:
                rel = np.asarray(df_qid['C'])
                rank = np.asarray(df_qid[rank_map[algo]])
                res = metric_map[m](rel, rank)
                results['%s_%s' % (algo, m)].append(res)

    result_df = pd.DataFrame.from_dict(results)

    return result_df


def combine_weight(eval_data_train, eval_data_test, X_cols, Y_col):
    X_train = eval_data_train[X_cols]
    Y_train = eval_data_train[Y_col]
    clf = LogisticRegression(solver='lbfgs', class_weight='balanced').fit(X_train, Y_train)

    X_test = eval_data_test[X_cols]
    return clf.predict_proba(X_test)[:, 1]
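
combine_weight is a stacking step: a logistic regression is fit on the per-algorithm rank columns of the training queries, and its positive-class probability becomes the fused score for the test queries. A self-contained sketch with made-up rank features (in the real pipeline X_cols holds the prop-svm and heckman rank columns):

import numpy as np
from sklearn.linear_model import LogisticRegression

X_train = np.array([[1, 2], [3, 1], [5, 6], [2, 4]])   # two rank features per document
y_train = np.array([1, 1, 0, 0])                        # click labels: better-ranked documents were clicked

clf = LogisticRegression(solver='lbfgs', class_weight='balanced').fit(X_train, y_train)

X_test = np.array([[1, 1], [6, 5]])
fused = clf.predict_proba(X_test)[:, 1]   # higher probability for the document with better ranks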


def rank_aggregate(eval_data_train, eval_data_test, X_cols, Y_col):
    df = eval_data_test.copy()
    df['rank_agg'] = np.zeros(eval_data_test.shape[0])
    FLRA = FullListRankAggregator()
    for name, group in df.groupby(['qid']):
        qid = group['qid'].min()

        rank_dicts = []
        for algo_rank in X_cols:
            rank_dicts.append(df.loc[df['qid'] == qid, algo_rank].to_dict())

        com_ranks = FLRA.aggregate_ranks(rank_dicts, areScores=False)
        com_ranks = pd.DataFrame.from_dict(com_ranks, orient='index')[0]
        df.loc[df['qid'] == qid, 'rank_agg'] = com_ranks.astype(int)

    # Negate the aggregated ranks so that better (smaller) ranks become
    # higher scores when evaluate() later applies .rank(ascending=False).
    return -df['rank_agg']
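
rank_aggregate instead asks pyrankagg for a consensus ranking over the per-algorithm rank lists; the exact return format of aggregate_ranks is not documented here, but conceptually the aggregation collapses each document's several ranks into one. A hand-rolled Borda-style approximation of that idea (not the pyrankagg implementation):

import numpy as np

prop_svm_rank = {0: 1, 1: 3, 2: 2}   # ranks for the same three documents from two algorithms
heckman_rank  = {0: 2, 1: 3, 2: 1}

avg = {doc: np.mean([prop_svm_rank[doc], heckman_rank[doc]]) for doc in prop_svm_rank}
consensus = {doc: i + 1 for i, doc in enumerate(sorted(avg, key=avg.get))}
# consensus == {0: 1, 2: 2, 1: 3}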


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-hr', default='data/result/svm_eta2_first', help='heckman results csv.')
    parser.add_argument('-pr', default='psvm_results/svm_eta2_first', help='predictions from psvm.')
    parser.add_argument('-nr', default='nsvm_results/svm_eta2_first', help='predictions from nsvm.')
    parser.add_argument('-n', type=int, default=29, help='max threshold to combine')
    parser.add_argument('-eta', type=str, default='2', help='eta.')
    parser.add_argument('-out', default='results/10_pass_combined_eta2_first.csv', help='combined results csv.')
    args = parser.parse_args()

    metrics = ['arrr', 'mrr1', 'ndcg_10']
    algos = ['naive-svm', 'prop-svm', 'heckman']
    algos_to_combine = ['prop-svm', 'heckman']
    combine_methods = ['combinedw', 'combined-agg']
    combine_map = {'combinedw' : combine_weight, 'combined-agg': rank_aggregate}
    all_algos = algos + combine_methods

    results = create_custom_df('result', all_algos, metrics)

    for i in range(args.n + 1):
        print('combining see %d' % i)
        try:
            # Load training scores and combine
            file_heckman_train_result = os.path.join(args.hr, 'train_scores_see%d_%s.csv' % (i, args.eta))
            file_psvm_train_result = os.path.join(args.pr, 'prediction_train_see%d.txt' % (i))
            eval_data_train = combine(file_heckman_train_result, file_psvm_train_result, 's_score')
            eval_data_train = rank(eval_data_train, algos_to_combine)

            # Load test scores and combine
            file_heckman_test_result = os.path.join(args.hr, 'test_scores_see%d_%s.csv' % (i, args.eta))
            file_psvm_test_result = os.path.join(args.pr, 'prediction_test_see%d.txt' % (i))
            file_nsvm_test_result = os.path.join(args.nr, 'naive_prediction_test_see%d.txt' % (i))
            eval_data_test = combine(file_heckman_test_result, file_psvm_test_result, 's_score', file_nsvm_test_result, 'n_score')
            eval_data_test = rank(eval_data_test, algos)
        except Exception:
            # Skip 'see' thresholds whose result files are missing or malformed.
            continue

        # Predict on test scores
        X_cols = [rank_map[algo] for algo in algos_to_combine]

        for comb_name in combine_methods:
            comb_method = combine_map[comb_name]
            eval_data_test[score_map[comb_name]] = comb_method(eval_data_train, eval_data_test, X_cols, 'C')

        # Generate rankings based on the final scores and evaluate them
        result_df = evaluate(eval_data_test, all_algos, metrics)

        results['see'].append(i)
        for m in metrics:
            for algo in all_algos:
                algo_m = '%s_%s' % (algo, m)
                results[algo_m].append(result_df[algo_m].mean())

    df = pd.DataFrame.from_dict(results)

    for m in metrics:
        dump_result_for_metric(df, all_algos, m, pretty_map, args.out)


if __name__ == "__main__":
    main()

@@ -0,0 +1,87 @@
import os
import pdb

import numpy as np
import pandas as pd

from letor_metrics import *


rank_map = {
    'heckman'      : 'h_rank',
    'naive-svm'    : 'n_rank',
    'prop-svm'     : 's_rank',
    'combinedw'    : 'c_rank',
    'combined-agg' : 'agg_rank',
}
score_map = {
    'heckman'      : 'h_score',
    'prop-svm'     : 's_score',
    'naive-svm'    : 'n_score',
    'combinedw'    : 'cw_score',
    'combined-agg' : 'ca_score',
}
pretty_map = {
    'heckman'      : 'Heckman',
    'prop-svm'     : 'PropSVM',
    'naive-svm'    : 'NaiveSVM',
    'combinedw'    : 'CombinedW',
    'combined-agg' : 'RankAgg'
}


def load_svm_result(svm_pred_path, eval_data, score_name):
    with open(svm_pred_path, 'r') as svm_file:
        for name, group in eval_data.groupby(['qid']):
            num_lines = group.shape[0]

            qid = group['qid'].min()
            scores = []
            for i in range(num_lines):
                try:
                    scores.append(float(svm_file.readline().strip()))
                except ValueError:
                    print("ERROR: heckman/svm line mismatch!!")

            eval_data.loc[eval_data['qid'] == qid, score_name] = scores


def combine(heckman_file_path, psvm_file_path, psvm_score='s_score', nsvm_file_path='', nsvm_score='n_score'):
    eval_data = pd.read_csv(heckman_file_path)
    eval_data['qid'] = eval_data['qid'].astype(int)

    # Min-max normalise the SVM scores to [0, 1].
    load_svm_result(psvm_file_path, eval_data, psvm_score)
    eval_data[psvm_score] = (eval_data[psvm_score] - eval_data[psvm_score].min()) / (eval_data[psvm_score].max() - eval_data[psvm_score].min())

    if nsvm_file_path != '':
        load_svm_result(nsvm_file_path, eval_data, nsvm_score)
        eval_data[nsvm_score] = (eval_data[nsvm_score] - eval_data[nsvm_score].min()) / (eval_data[nsvm_score].max() - eval_data[nsvm_score].min())

    return eval_data


def create_custom_df(name, algos, metrics):
    init_map = {
        'result' : 'see',
        'rank'   : 'qid'
    }

    results = {
        init_map[name] : []
    }

    for algo in algos:
        for m in metrics:
            results['%s_%s' % (algo, m)] = []

    return results
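
create_custom_df only pre-builds the dictionary of empty lists that rank(), evaluate() and main() append into, keyed by '<algo>_<metric>' plus either 'qid' or 'see'. For example:

create_custom_df('rank', ['heckman', 'prop-svm'], ['mrr1'])
# -> {'qid': [], 'heckman_mrr1': [], 'prop-svm_mrr1': []}

create_custom_df('result', ['heckman'], ['arrr', 'ndcg_10'])
# -> {'see': [], 'heckman_arrr': [], 'heckman_ndcg_10': []}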


def dump_result_for_metric(df, algos, metric, pretty_map, out_file):
    out = os.path.splitext(out_file)[0]
    out_m = '%s_%s.csv' % (out, metric)
    cols = ['see']
    algo_cols = ['%s_%s' % (algo, metric) for algo in algos]
    cols += algo_cols
    result_m = df[cols].rename(columns=dict([('%s_%s' % (algo, metric), pretty_map[algo]) for algo in algos]))
    result_m.to_csv(out_m, index=False)
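
A worked example of the filename handling: with the default output path results/10_pass_combined_eta2_first.csv and metric mrr1, the extension is stripped and one CSV per metric is written, with the algorithm columns renamed through pretty_map (assuming df already holds the 'see' column and the per-algorithm metric columns):

dump_result_for_metric(df, ['heckman', 'prop-svm'], 'mrr1', pretty_map,
                       'results/10_pass_combined_eta2_first.csv')
# writes results/10_pass_combined_eta2_first_mrr1.csv with columns: see, Heckman, PropSVM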

@@ -0,0 +1,14 @@

eta=$1
pass=$2
n=$3
source=$4
target=$5

for i in $(seq -f "%g" 0 $n)
do
    python psvm_to_heckman.py -s $source -t $target -n $i -eta $eta -npass $pass &
done
wait

echo "convert done"

@@ -0,0 +1,37 @@
ds=$1
eta=$2
npass=$3
out=$4
n=$5

# Run at most $njobs dump_df.py processes in parallel.
njobs=4

procs=0
for seen in $(seq 0 $n);
do
    python dump_df.py -f $ds"/heckman_svm."$npass"pass.see"$seen".eta"$eta".train" -o $out -dt train -eta $eta &
    echo "train done "$seen

    procs=$(( $procs + 1 ))
    if [ $procs -eq $njobs ];
    then
        wait
        procs=0
    fi
done
wait

procs=0
for seen in $(seq 0 $n);
do
    python dump_df.py -f $ds"/heckman_svm."$npass"pass.see"$seen".eta"$eta".test" -o $out -dt test -eta $eta &
    echo "test done "$seen

    procs=$(( $procs + 1 ))
    if [ $procs -eq $njobs ];
    then
        wait
        procs=0
    fi
done
wait

@@ -0,0 +1,19 @@
ds=$1
eta=$2
name=$3
npass=$4
n=$5

mkdir -p "data/pickles/"$name"_eta"$eta"_first/"
sh dump_batch.sh $ds"/eta"$eta"/first_run/" $eta $npass "data/pickles/"$name"_eta"$eta"_first/" $n
wait
echo "first run done"

# mkdir -p "data/pickles/"$name"_eta"$eta"_second/"
# sh dump_batch.sh $ds"/eta"$eta"/second_run/" $eta "data/pickles/"$name"_eta"$eta"_second/"
# wait
# echo "second run done"

# mkdir -p "data/pickles/"$name"_eta"$eta"_third/"
# sh dump_batch.sh $ds"/eta"$eta"/third_run/" $eta "data/pickles/"$name"_eta"$eta"_third/"
# echo "third run done"

@@ -0,0 +1,56 @@
import pdb
import argparse
import numpy as np
import pandas as pd


def load_clicks(file_path, dim=700):
    cols = ['qid', 'C', 'S']
    cols += ['X' + str(i + 1) for i in range(dim)]

    with open(file_path) as f:
        num_lines = sum(1 for line in f)

    data = {}
    for c in cols:
        data[c] = np.zeros(num_lines)

    with open(file_path) as f:
        lcount = 0
        for line in f:
            tokens = line.strip().split(' ')

            # tokens[0] is the query id, tokens[1] the click label C,
            # tokens[-1] the S indicator; the 'index:value' pairs in
            # tokens[2:-2] fill the corresponding feature columns.
            data['qid'][lcount] = int(tokens[0])
            data['C'][lcount] = int(tokens[1])
            data['S'][lcount] = int(tokens[-1])

            for t in tokens[2:-2]:
                col = 'X' + t.split(':')[0]
                data[col][lcount] = float(t.split(':')[1])

            lcount += 1

    df = pd.DataFrame.from_dict(data)
    df = df.fillna(0.0)
    return df
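
The click-log format itself is not documented in this excerpt, but the parser above reads each line as: query id, click label C, sparse 'index:value' feature pairs, and the S indicator as the last token (the token immediately before S is skipped by tokens[2:-2]). A made-up two-line file and how load_clicks would parse it:

# toy_clicks.txt (invented; the real files carry 700 feature dimensions)
# 3 1 1:0.5 4:0.25 17:1.0 0 1
# 3 0 2:0.1 4:0.75 17:0.0 0 0
df = load_clicks('toy_clicks.txt', dim=20)
df[['qid', 'C', 'S', 'X1', 'X4']]
#    qid    C    S   X1    X4
# 0  3.0  1.0  1.0  0.5  0.25
# 1  3.0  0.0  0.0  0.0  0.75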


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', default='data/test.Ragib_svm_to_anything.see10.1pass.eta1', help='data file path.')
    parser.add_argument('-o', default='data/pickles/', help='output dir for pickles.')
    parser.add_argument('-dt', default='train', help='train/test.')
    parser.add_argument('-eta', default='1.0', help='eta.')
    parser.add_argument('-fmt', default='csv', help='file format (pkl/csv).')
    args = parser.parse_args()

    df = load_clicks(args.f)
    see = args.f.split('/')[-1].split('.')[2]  # sample: naive_svm.5pass.[see19].eta2.train

    if args.fmt == 'pkl':
        df.to_pickle(args.o + '/' + args.dt + '_clicks_' + see + '_' + args.eta + '.pkl')
    elif args.fmt == 'csv':
        df.to_csv(args.o + '/' + args.dt + '_clicks_' + see + '_' + args.eta + '.csv', index=False)
    else:
        pass


if __name__ == "__main__":
    main()