-
Notifications
You must be signed in to change notification settings - Fork 2
/
multinomial.py
175 lines (137 loc) · 4.56 KB
/
multinomial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import json
import os

import numpy as np

from load_data import make_freq_dict
from bayesian_blocks import bayesian_blocks
def find_prob(x, bins):
    '''
    Parameters:
    -----------
    => x = train data (iterable of values comparable with the bin edges)
    => bins = bin edges from the bayesian blocks

    Function:
    ----------
    For every bin [bins[i], bins[i+1]) collect the training samples that fall
    inside it and turn their frequencies into probabilities that sum to 1
    within that bin. The last bin is closed on the right, so a sample equal
    to the final edge is counted in it.

    Output:
    -------
    returns a list with one dict per bin, mapping sample value -> probability
    of that value within the bin. A bin without samples yields an empty dict.
    '''
    last_bin = len(bins) - 2
    dic_lis = []
    for i in range(len(bins) - 1):
        lo, hi = bins[i], bins[i + 1]
        # Half-open interval, except that the very last bin also owns its
        # right edge so the maximum sample is not dropped.
        members = [
            j for j in x
            if (lo <= j < hi) or (i == last_bin and j == hi)
        ]
        # presumably maps each distinct value to its occurrence count -
        # confirm against load_data.make_freq_dict
        freq = make_freq_dict(members)
        counts = list(freq.values())
        # Compute the normaliser once per bin instead of once per element.
        total = sum(counts)
        dic_lis.append(
            {value: count / total for value, count in zip(freq.keys(), counts)}
        )
    return dic_lis
def find_prob_test(x_test, dic_lis, bins):
    '''
    Parameters:
    -----------
    => x_test = list of counts to look up
    => dic_lis = per-bin probability dicts estimated on the train data
    => bins = bin edges from the bayesian blocks estimated on the train data

    Function:
    ----------
    Looks up the probability of every sample of x_test in the dict of the bin
    it falls into. Bins are half-open [bins[i], bins[i+1]); only the last bin
    also accepts its right edge. A sample that lies inside a bin but was never
    seen in train gets probability 0.0. A sample outside every bin gets no
    entry at all.

    Output:
    -------
    returns a dict mapping each located sample of x_test to its probability
    '''
    final_idx = len(bins) - 2
    prob = {}
    for idx, (left, right) in enumerate(zip(bins[:-1], bins[1:])):
        for sample in x_test:
            inside = left <= sample < right
            on_closing_edge = idx == final_idx and sample == right
            if inside or on_closing_edge:
                # Unseen values fall back to zero probability.
                prob[sample] = dic_lis[idx].get(sample, 0.0)
    return prob
def likeli_mln(x_test, prob):
    '''
    Parameters:
    -----------
    => x_test = iterable of test values
    => prob = dict mapping value -> probability for those values

    Function:
    ----------
    Accumulates value * log(probability) over every entry of x_test.
    Entries that have no probability, or whose probability is 0, are skipped
    and contribute nothing to the sum.

    Output:
    -------
    returns the accumulated log likelihood (0 when nothing contributed)
    '''
    total = 0
    for value in x_test:
        p = prob.get(value, 0)
        if p == 0:
            # Missing or zero-probability values are left out of the sum.
            continue
        total += value * np.log(p)  # x_i log p_i
    return total
def multinomial_bay_block(tr, tes, k, gammas, iter):
    '''
    Parameters:
    -----------
    => tr = train data, indexed by fold (tr[i] is the train set of fold i)
    => tes = test data, indexed by fold (tes[i] is the test set of fold i)
    => k = value of factor in prior function (1-gamma)/(1-gamma**(N//2**k));
           forwarded to bayesian_blocks as `lam`
    => gammas = list of gammas to iterate over
    => iter = number of folds / times the experiment has to be performed

    Function:
    ----------
    For every gamma and every fold: compute bayesian-block bin edges on the
    train data, estimate per-bin probabilities on the train data, and
    evaluate the negative log likelihood of the test data.
    For every gamma the ratio mean/std of the negative log likelihoods over
    the folds is stored, and all ratios are written as JSON to
    ./select_best/mln_mu_sig_<len(tes[0])>.json (the directory is created
    when it does not exist yet).

    Output:
    -------
    returns a list of [gamma, rows] where rows holds one
    [fold, negative log likelihood, number of bins] entry per fold.
    '''
    dumper = {}
    total_likeli = []
    for gamma in gammas:
        likeli_mlnn = []
        for_best = []  # negative log likelihoods used to score this gamma
        for fold in range(iter):
            X_train = tr[fold]
            X_test = tes[fold]
            bin_edges = bayesian_blocks(X_train, fitness='multinomial', lam=k, gamma=gamma)
            dic_lis = find_prob(X_train, bin_edges)
            prob = find_prob_test(X_test, dic_lis, bin_edges)
            neg_likeli = -likeli_mln(X_test, prob)  # negative log likelihood
            likeli_mlnn.append([fold, neg_likeli, len(bin_edges) - 1])
            for_best.append(neg_likeli)
        total_likeli.append([gamma, likeli_mlnn])
        mu = np.mean(for_best)
        sig = np.std(for_best)
        # NOTE(review): sig is 0 when all folds agree (e.g. a single fold),
        # which makes this ratio inf/nan and the JSON non-standard - confirm
        # whether that case needs dedicated handling.
        dumper[gamma] = mu / sig
    out_dir = "./select_best"
    out_path = out_dir + "/mln_mu_sig_" + str(len(tes[0])) + ".json"
    # Create the target directory up front so the results of a long run are
    # not lost to a FileNotFoundError at the very end.
    os.makedirs(out_dir, exist_ok=True)
    with open(out_path, "w") as write_file:
        json.dump(dumper, write_file)
    return total_likeli