From e6c47d3cf05228db42868a5ad9c901ad82f5fa3a Mon Sep 17 00:00:00 2001 From: zohrehovaisi <38776554+zohrehovaisi@users.noreply.github.com> Date: Mon, 20 Apr 2020 00:19:03 -0700 Subject: [PATCH] Add files via upload --- generation/svm.py | 419 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 419 insertions(+) create mode 100644 generation/svm.py diff --git a/generation/svm.py b/generation/svm.py new file mode 100644 index 0000000..765d481 --- /dev/null +++ b/generation/svm.py @@ -0,0 +1,419 @@ + + +#This code generate data when base ranker is SVM-rank, the process is as follows: +#Base ranker learn o 1% of data, and generates a prediction score for the remaining 99% of data +#The 99% of data will be ranked according to the prediction score +#clicks will be generated from this ranked data with multiple sampling times (multiple pass) to augment data +# The final generated data will be written to be fed into naive svm, propensity svm and heckman + + +import random,operator,numpy as np +from copy import deepcopy +import sys + +class searchItem: + + def __init__(self , r , t, a): + self.true_rel=t + self.ranking=r + self.attributes=a + + +class searchResult: + + def __init__(self): + self.result = [] + self.clicked = {} + + def sortByRank(self): + + self.result.sort(key = operator.attrgetter('ranking'), reverse = True) + for i in range(len(self.result)): + self.result[i].ranking = i + + def doPrint(self): + print "searchResult" + for r in self.result: + print "ranking=" + str(r.ranking) + ", true_rel=" + str(r.true_rel) + ", att=" + str(r.attributes) + print "============" + + def getRankOfClicked(self, clicks): + counter = 0 + r=[] + # assert len(clicks) == len(self.result) + for c in clicks: + assert c==0 or c==1 + if c==1: + r.append(self.result[counter].ranking) + + counter = counter+1 + + return r + + + def setPropencityScores(self, clicks, eta): + counter = 0 + r=[] + # assert len(clicks) == len(self.result) + for c in clicks: + assert c==0 or c==1 + if c==1: + p = 1/((1/(1+float(self.result[counter].ranking)))**eta) + self.clicked[counter] = p + counter = counter+1 + + + + def gen_clicks(self, eta): + + # assert len(self.ranking) == len(self.true_rel) + count = len(self.result) + ''' + count = number of documents in search result (for a single query) + ranking = ranking of the documents in search result + relevance = true relevance of the documents provided in data + eta = presentation bias factor + ''' + +#@@@@@@@@@START: noisy click, comment out for noiseless@@@@@@@@@# + #eneg = 0.4 + #for i in range(count): + #if self.result[i].true_rel < 1: # docs with rel=0, will have rel=0.1# + #assert (self.result[i].true_rel == 0) + #self.result[i].true_rel = eneg +#@@@@@@@@@END: noisy click, comment out for noiseless@@@@@@@@@# + + + exam_prob = np.zeros(count) # examination probability + click_prob = np.zeros(count) # click probability + clicks = np.zeros(count, dtype=np.int) # actual clicks + + for i in range(count): + doc_i = self.result[i].ranking + exam_prob[doc_i] = (1.0 / (i + 1)) ** eta + click_prob[doc_i] = exam_prob[doc_i] * self.result[doc_i].true_rel + + if np.sum( + click_prob) <= 0: # if no probability of clicks (possibly because of no relevance), return empty clicks, + return np.array([]) + + + + + for i in range(min(count,observe_num+1)): #from initiall, not generate clicks for bellow cut-off# + doc_i = self.result[i].ranking + clicks[doc_i] = np.random.binomial(n=1, p=click_prob[doc_i]) # generate the click + assert i==doc_i + + + + if np.sum(clicks) <= 0: # if no clicks generated, return empty + return np.array([]) + + self.setPropencityScores(clicks , eta) + + + + return clicks + + + +input_file_name = sys.argv[1] #99% of train data# +prediction= sys.argv[2] #prediction file that contains base ranker scores for 99% of train data# +output_file_name_naive= sys.argv[3] #output file that we write data that will be fed to naive-svm# +output_file_name_propensity= sys.argv[4]#output file that we write data that will be fed to propensity-svm# +output_file_name_heckman= sys.argv[5] #output file where we write data that will be fed to heckman# +sampling_times= int(sys.argv[6]) #number of passes (5)# +observe_num = int(sys.argv[7]) #cut off (1-30)# +eta= float(sys.argv[8]) #eta=position bias severity# + + + +#read test data# +input_file_name_test = "../set1.test.binary.txt" + +file_test = open(input_file_name_test, "r") +D_test={} +for f1 in file_test: + qid=f1[f1.find(":")+1:f1.find(" ",f1.find(":"))] + attributes="" + if f1[-1]== "\n": + attributes=f1[f1.find(" ", f1.find(":"))+1:-1] + else: + attributes = f1[f1.find(" ", f1.find(":"))+1:] + trueR = float(f1[:f1.find(" ")]) + # print(trueR) + if qid not in D_test: + # D[qid]=[] + D_test[qid] = searchResult() + newSearchItem = searchItem (1, trueR, attributes) + D_test[qid].result.append(newSearchItem) + + + + + + + + + + +#read train data, and prediction file# +file_train = open(input_file_name, "r") + +file_prediction = open(prediction, "r") + + + +D={} +for f1 in file_train: + + qid=f1[f1.find(":")+1:f1.find(" ",f1.find(":"))] + attributes="" + if f1[-1]== "\n": + attributes=f1[f1.find(" ", f1.find(":"))+1:-1] + else: + attributes = f1[f1.find(" ", f1.find(":"))+1:] + + + + f2 = file_prediction.readline() + ranking=float(f2[:-1]) + trueR = float(f1[:f1.find(" ")]) + if qid not in D: + D[qid] = searchResult() + + newSearchItem = searchItem (ranking, trueR, attributes) + + D[qid].result.append(newSearchItem) + + + +#### Start Sampling ##### +D_new={} +D_train=D + +D={} + +qids = list(D_train) +#print qids +for sampling in range(sampling_times): # number of passes + for r in range(len(qids)): + #r = random.randint(0,len(qids)-1) #choose a random number to sample# + for i in range(1,len(qids)*100): #assuming that irrespective of how many sampling we are doing, the lenghth of qid would not exceed 100 times its initial length# + newQID = qids[r]+"_"+str(i) + if newQID not in D_new: + D_new[newQID] = deepcopy(D_train[qids[r]]) + break + +#### END Sampling ##### + + + +D_train=D_new + + + +#### START CLICK GENERATION ##### + + +for k in D_train: # for each key, sort the corresponding value + #print k + #D[k].doPrint() + D_train[k].sortByRank() + #print "sorted" + #D[k].doPrint() + + clicks= D_train[k].gen_clicks(eta) + #print clicks + #print D[k].getRankOfClicked(clicks) + #print D[k].clicked + + + + + +#### END CLICK GENERATION ##### + + + + + +#### START Writing Output (this data will be reranked by Heckman, niave svm and propensity svm)##### + + + +#write propensity svm# + + +train_out = open(output_file_name_propensity+".train", "w") +test_out = open(output_file_name_propensity+".test", "w") + + +counter=1 +for k in D_train: + for c in D_train[k].clicked: + train_out.write("1 qid:"+str(counter)+" cost:"+str(D_train[k].clicked[c])+" "+D_train[k].result[c].attributes+"\n") + lastRanking = -1 + for i in range(len(D_train[k].result)): + if i ==c: + continue + assert(D_train[k].result[i].ranking > lastRanking) + lastRanking = D_train[k].result[i].ranking + train_out.write("0 qid:" + str(counter) + " " + D_train[k].result[i].attributes+"\n") + + counter = counter+1 + + + +for k in D_test: + for c in range(len(D_test[k].result)): + if D_test[k].result[c].true_rel == 0: + continue + test_out.write("1 qid:"+str(counter)+" cost:1 "+D_test[k].result[c].attributes+"\n") + + + for i in range(len(D_test[k].result)): + if i ==c: + continue + + test_out.write("0 qid:" + str(counter) + " " + D_test[k].result[i].attributes+"\n") + + counter = counter+1 + + + + + + + +#write naive svm# + +train_out = open(output_file_name_naive+".train", "w") +test_out = open(output_file_name_naive+".test", "w") + + +counter=1 +for k in D_train: + + for c in D_train[k].clicked: + train_out.write("1 qid:" + str(counter) + " cost:1 " + D_train[k].result[c].attributes + "\n") + lastRanking = -1 + for i in range(len(D_train[k].result)): + if i ==c: + continue + assert(D_train[k].result[i].ranking > lastRanking) + lastRanking = D_train[k].result[i].ranking + train_out.write("0 qid:" + str(counter) + " " + D_train[k].result[i].attributes+"\n") + + counter = counter+1 + +for k in D_test: + + for c in range(len(D_test[k].result)): + if D_test[k].result[c].true_rel == 0: + continue + test_out.write("1 qid:"+str(counter)+" cost:1 "+D_test[k].result[c].attributes+"\n") + + for i in range(len(D_test[k].result)): + if i ==c: + continue + + test_out.write("0 qid:" + str(counter) + " " + D_test[k].result[i].attributes+"\n") + + counter = counter+1 + + + + + + + + +#write heckman# + +train_out = open(output_file_name_heckman+".train", "w") +test_out = open(output_file_name_heckman+".test", "w") + + +toCreateNewQUD = True #if want to creat a new qid for a new click, keep it true, if want to have multiple click in a query, keep it false# + + + + +if toCreateNewQUD: + + + counter=1 + for k in D_train: + + for c in D_train[k].clicked: + + seen=-1 + if D_train[k].result[c].ranking <= observe_num: + seen = 1 + else: + assert False + seen = 0 + train_out.write(str(counter) + " 1 " + D_train[k].result[c].attributes + " " + str(D_train[k].result[c].ranking) + " " + str(seen) + "\n") + lastRanking = -1 + for i in range(len(D_train[k].result)): + if i ==c: + continue + assert(D_train[k].result[i].ranking > lastRanking) + lastRanking = D_train[k].result[i].ranking + if D_train[k].result[i].ranking <= observe_num: + seen = 1 + else: + seen = 0 + train_out.write(str(counter) + " 0 " + D_train[k].result[i].attributes + " " + str(D_train[k].result[i].ranking) + " " + str(seen) + "\n") + + counter = counter+1 + + + + + + +else: + + counter=1 + for k in D_train: + + for i in range(len(D_train[k].result)): + if D_train[k].result[i].ranking <= observe_num: + seen = 1 + else: + seen = 0 + if i in D_train[k].clicked: + train_out.write(str(counter) + " 1 " + D_train[k].result[i].attributes + " " + str(D_train[k].result[i].ranking) + " " + str(seen) + "\n") + else: + train_out.write(str(counter) + " 0 " + D_train[k].result[i].attributes + " " + str(D_train[k].result[i].ranking) + " " + str(seen) + "\n") + + counter = counter+1 + + + + + + +for k in D_test: + + for c in range(len(D_test[k].result)): + if D_test[k].result[c].true_rel == 0: + continue + + seen=1 + + test_out.write(str(counter) + " 1 " + D_test[k].result[c].attributes + " " + str(D_test[k].result[c].ranking) + " " + str(seen) + "\n") + + for i in range(len(D_test[k].result)): + if i ==c: + continue + + test_out.write(str(counter) + " 0 " + D_test[k].result[i].attributes + " " + str(D_test[k].result[i].ranking) + " " + str(seen) + "\n") + + counter = counter+1 + +#### End Writing Output ##### +print "done\n"