-
Notifications
You must be signed in to change notification settings - Fork 0
/
assignment2.py
249 lines (198 loc) · 10.3 KB
/
assignment2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#################################
# Your name: Lior Erenreich
#################################
import numpy as np
import matplotlib.pyplot as plt
import intervals
class Assignment2(object):
"""Assignment 2 skeleton.
Please use these function signatures for this assignment and submit this file, together with the intervals.py.
"""
def sample_from_D(self, m: int):
"""Sample m data samples from D.
Input: m - an integer, the size of the data sample.
Returns: np.ndarray of shape (m,2) :
A two-dimensional array of size m that contains the pairs where drawn from the distribution P.
"""
x = np.sort(np.random.uniform(0, 1, m))
y = np.array([self.random_choose(self.define_x_intersection(x)[i]) for i in range(m)]).reshape(m, )
# Return a 2D array where the first column is x and the second column is y
return np.column_stack((x, y))
def experiment_m_range_erm(self, first: int, last: int, step: int, k: int, T: int):
"""Runs the ERM algorithm.
Calculates the empirical error and the true error.
Plots the average empirical and true errors.
Input: m_first - an integer, the smallest size of the data sample in the range.
m_last - an integer, the largest size of the data sample in the range.
step - an integer, the difference between the size of m in each loop.
k - an integer, the maximum number of intervals.
T - an integer, the number of times the experiment is performed.
Returns: np.ndarray of shape (n_steps,2).
A two-dimensional array that contains the average empirical error
and the average true error for each m in the range accordingly.
"""
# Generate the range of m values
ns = np.arange(first, last + 1, step)
# Run the experiment T times
errs = np.array([self.empirical_true_err(m, k) for m in ns for _ in range(T)])
errs = errs.reshape(len(ns), T, 2)
# Calculate the average empirical and true error
np_array_avg_errs = np.asarray(np.mean(errs, axis=1))
# Plot the empirical and true errors as a function of n
plt.title("Average Empirical and True Errors as a Function of Sample Size")
plt.xlabel("n")
plt.ylabel("Error")
plt.plot(ns, np_array_avg_errs[:, 0], label="Empirical Error")
plt.plot(ns, np_array_avg_errs[:, 1], label="True Error")
plt.legend()
plt.show()
return np_array_avg_errs
def experiment_k_range_erm(self, m, k_first, k_last, step):
"""Finds the best hypothesis for k= 1,2,...,10.
Plots the empirical and true errors as a function of k.
Input: m - an integer, the size of the data sample.
k_first - an integer, the maximum number of intervals in the first experiment.
m_last - an integer, the maximum number of intervals in the last experiment.
step - an integer, the difference between the size of k in each experiment.
Returns: The best k value (an integer) according to the ERM algorithm.
"""
# Create an array of k values to try
k_choices = np.arange(k_first, k_last + 1, step)
# Initialize arrays to store the empirical and true errors for each k
empirical_err = np.zeros(k_choices.shape[0])
true_err = np.zeros(k_choices.shape[0])
# Sample data from the distribution
sample = self.sample_from_D(m)
# Loop over the k values and calculate the ERM intervals and errors
for k in range(k_choices.shape[0]):
ERM_intervals, ERM_empirical_err = intervals.find_best_interval(sample[:, 0], sample[:, 1], k_choices[k])
empirical_err[k] = ERM_empirical_err / m
true_err[k] = self.true_err(ERM_intervals)
# Plot the empirical and true errors as a function of k
plt.scatter(k_choices, true_err)
plt.scatter(k_choices, empirical_err)
plt.plot(k_choices, empirical_err, label='Empirical Error')
plt.plot(k_choices, true_err, label='True Error')
plt.legend()
plt.title('Empirical and True Errors as a Function of k')
plt.xlabel("k")
plt.ylabel("Error")
plt.show()
return np.argmin(empirical_err)
def cross_validation(self, m):
"""Finds a k that gives a good test error.
Input: m - an integer, the size of the data sample.
Returns: The best k value (an integer) found by the cross validation algorithm.
"""
# Generate a sample of size m from the distribution D
sample = self.sample_from_D(m)
# Shuffle the sample
np.random.shuffle(sample)
# Split the sample into training and validation sets (80-20 split)
s1, s2 = sample[:int(m * 0.8)], sample[int(m * 0.8):]
# Sort the training set by x-values
s1 = s1[s1[:, 0].argsort()]
# Initialize variables to keep track of best k and its empirical error
# For each k value (1 to 10), find the best interval hypothesis using ERM algorithm and calculate empirical
# Return the best k value
return np.argmin([self.empirical_err(s2, intervals.find_best_interval(s1[:, 0], s1[:, 1], k)[0]) for k in np.arange(1, 11)]) + 1
#################################
# Additional methods
def define_x_intersection(self, x):
"""
Generate an array of the intersection of intervals 1 and 2
"""
return ~((x > 0.2) & (x < 0.4)) & ~((x > 0.6) & (x < 0.8))
def random_choose(self, param):
"""
Randomly choose a float number from the given list based on the parameter.
Args:
param: a boolean parameter to decide which list to use for random choice
Returns:
A float number from [0.0, 1.0] list based on the given parameter with probabilities of
[0.2, 0.8] or [0.9, 0.1]
"""
return np.random.choice([0.0, 1.0], size=1, p=(lambda: [0.2, 0.8] if param else [0.9, 0.1])())[0]
def empirical_true_err(self, n: int, k: int):
"""
Runs the ERM algorithm and calculates the empirical error and true error
for a given sample size and maximum number of intervals.
Args:
n: An integer, the size of the data sample.
k: An integer, the maximum number of intervals.
Returns:
A tuple of two floats representing the empirical error and the true
error, respectively.
"""
sample = self.sample_from_D(n)
best_intervals, err_cnt = intervals.find_best_interval(sample[:, 0], sample[:, 1], k)
return err_cnt / n, self.true_err(best_intervals)
def true_err(self, intervals):
"""
Calculate the true error of the hypothesis corresponding to the given intervals.
The true error is defined as the sum of the probabilities of the two sets of intervals not covered
by the given intervals. The first set of intervals has label 1 and a high probability, and the second
set of intervals has label 1 and a low probability.
Args:
intervals (list): A list of tuples, where each tuple represents an interval.
Returns:
float: The true error of the hypothesis corresponding to the given intervals.
"""
# Calculate the intersection between the current intervals and the intervals with high and low probability
len_cur_high_p = self.intersection_len(intervals, [(0, 0.2), (0.4, 0.6), (0.8, 1)])
len_cur_low_p = self.intersection_len(intervals, [(0.2, 0.4), (0.6, 0.8)])
# Calculate and return the true error using the lengths of the intervals
return 0.8 * (0.6 - len_cur_high_p) + 0.1 * (0.4 - len_cur_low_p) + 0.2 * len_cur_high_p + 0.9 * len_cur_low_p
def intersection_len(self, l1, l2) -> int:
"""
Calculate the length of the intersection between the intervals in list1 and list2.
Args:
l1 (List[Tuple]): A list of tuples representing intervals.
l2 (List[Tuple]): A list of tuples representing intervals.
Returns:
int: The length of the intersection between the intervals in list1 and list2.
"""
start = np.maximum(np.array(l1)[:, 0].reshape(-1, 1), np.array(l2)[:, 0])
end = np.minimum(np.array(l1)[:, 1].reshape(-1, 1), np.array(l2)[:, 1])
mask = start < end
return np.sum((end - start) * mask)
def empirical_err(self, sample, intervals):
"""
Calculate the empirical error of a given hypothesis, represented by a list of intervals, on a given sample.
Args:
sample (np.ndarray): A two-dimensional array containing pairs drawn from the distribution P.
intervals (List[Tuple]): A list of tuples, where each tuple represents an interval in the hypothesis.
Returns:
float: The empirical error of the hypothesis on the given sample.
"""
return sum([self.zero_one_loss(intervals, x, y) for x, y in sample]) / len(sample)
def zero_one_loss(self, intervals, x, y) -> int:
"""
Calculates the zero-one loss of a hypothesis h(x) with respect to a label y and an input value x.
Args:
intervals (List[Tuple[float, float]]): A list of intervals defining the hypothesis.
x (float): The input value to evaluate the hypothesis.
y (int): The true label corresponding to the input value (0 or 1).
Returns:
int: 0 if h(x) = y, else 1.
"""
x_in_interval = self.is_in_interval(np.array(intervals), x)
if (x_in_interval and y == 1) or (not x_in_interval and y == 0):
return 0
return 1
def is_in_interval(self, intervals, x) -> bool:
"""
Check if x is in the intervals in list_intervals.
Args:
intervals: An array of intervals represented by tuples.
x: A float number in [0,1].
Returns:
True if x is in any of the intervals, False otherwise.
"""
return np.any((intervals[:, 0] <= x) & (x <= intervals[:, 1]))
#################################
if __name__ == '__main__':
ass = Assignment2()
ass.experiment_m_range_erm(10, 100, 5, 3, 100)
ass.experiment_k_range_erm(1500, 1, 10, 1)
ass.cross_validation(1500)