MNT Applies black formatting to most of the code base (scikit-learn#1…
thomasjpfan committed Jun 17, 2021
1 parent 0e7761c commit 82df489
Showing 513 changed files with 59,810 additions and 42,580 deletions.
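
Black rewrites only code style, never behavior: single quotes become double quotes, and call sites that exceed the default 88-column limit are exploded onto one argument per line with a trailing comma, as the diffs below show. A minimal sketch of the transformation using black's Python API (assumes the black package is installed; the snippet is illustrative and not part of this commit):

    import black

    # A call site in the pre-commit style, too long for black's 88-column limit.
    src = (
        "estimator = KMeans(n_clusters=20, algorithm=algorithm, init=init, "
        "n_init=1, max_iter=max_iter, tol=-1, random_state=0)\n"
    )

    # black.format_str reformats a source string the same way `black file.py` would:
    # double quotes, and one argument per line with a trailing comma.
    print(black.format_str(src, mode=black.Mode()))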
.github/scripts/label_title_regex.py (10 changes: 2 additions & 8 deletions)

@@ -15,15 +15,9 @@
 title = issue.title
 
 
-regex_to_labels = [
-    (r"\bDOC\b", "Documentation"),
-    (r"\bCI\b", "Build / CI")
-]
+regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")]
 
-labels_to_add = [
-    label for regex, label in regex_to_labels
-    if re.search(regex, title)
-]
+labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)]
 
 if labels_to_add:
     issue.add_to_labels(*labels_to_add)
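
The reformatted one-liners behave exactly like the multi-line originals; for instance (the issue title below is made up purely for illustration):

    import re

    regex_to_labels = [(r"\bDOC\b", "Documentation"), (r"\bCI\b", "Build / CI")]

    # Hypothetical issue title, used only to show the matching logic.
    title = "DOC fix broken links in the CI guide"

    labels_to_add = [label for regex, label in regex_to_labels if re.search(regex, title)]
    print(labels_to_add)  # ['Documentation', 'Build / CI']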
asv_benchmarks/benchmarks/cluster.py (82 changes: 43 additions & 39 deletions)

@@ -10,16 +10,16 @@ class KMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     Benchmarks for KMeans.
     """
 
-    param_names = ['representation', 'algorithm', 'init']
-    params = (['dense', 'sparse'], ['full', 'elkan'], ['random', 'k-means++'])
+    param_names = ["representation", "algorithm", "init"]
+    params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"])
 
     def setup_cache(self):
         super().setup_cache()
 
     def make_data(self, params):
         representation, algorithm, init = params
 
-        if representation == 'sparse':
+        if representation == "sparse":
             data = _20newsgroups_highdim_dataset(n_samples=8000)
         else:
             data = _blobs_dataset(n_clusters=20)
@@ -29,44 +29,46 @@ def make_data(self, params):
     def make_estimator(self, params):
         representation, algorithm, init = params
 
-        max_iter = 30 if representation == 'sparse' else 100
+        max_iter = 30 if representation == "sparse" else 100
 
-        estimator = KMeans(n_clusters=20,
-                           algorithm=algorithm,
-                           init=init,
-                           n_init=1,
-                           max_iter=max_iter,
-                           tol=-1,
-                           random_state=0)
+        estimator = KMeans(
+            n_clusters=20,
+            algorithm=algorithm,
+            init=init,
+            n_init=1,
+            max_iter=max_iter,
+            tol=-1,
+            random_state=0,
+        )
 
         return estimator
 
     def make_scorers(self):
-        self.train_scorer = (
-            lambda _, __: neg_mean_inertia(self.X,
-                                           self.estimator.predict(self.X),
-                                           self.estimator.cluster_centers_))
-        self.test_scorer = (
-            lambda _, __: neg_mean_inertia(self.X_val,
-                                           self.estimator.predict(self.X_val),
-                                           self.estimator.cluster_centers_))
+        self.train_scorer = lambda _, __: neg_mean_inertia(
+            self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_
+        )
+        self.test_scorer = lambda _, __: neg_mean_inertia(
+            self.X_val,
+            self.estimator.predict(self.X_val),
+            self.estimator.cluster_centers_,
+        )
 
 
 class MiniBatchKMeansBenchmark(Predictor, Transformer, Estimator, Benchmark):
     """
     Benchmarks for MiniBatchKMeans.
     """
 
-    param_names = ['representation', 'init']
-    params = (['dense', 'sparse'], ['random', 'k-means++'])
+    param_names = ["representation", "init"]
+    params = (["dense", "sparse"], ["random", "k-means++"])
 
     def setup_cache(self):
         super().setup_cache()
 
     def make_data(self, params):
         representation, init = params
 
-        if representation == 'sparse':
+        if representation == "sparse":
             data = _20newsgroups_highdim_dataset()
         else:
             data = _blobs_dataset(n_clusters=20)
@@ -76,25 +78,27 @@ def make_data(self, params):
     def make_estimator(self, params):
         representation, init = params
 
-        max_iter = 5 if representation == 'sparse' else 2
+        max_iter = 5 if representation == "sparse" else 2
 
-        estimator = MiniBatchKMeans(n_clusters=20,
-                                    init=init,
-                                    n_init=1,
-                                    max_iter=max_iter,
-                                    batch_size=1000,
-                                    max_no_improvement=None,
-                                    compute_labels=False,
-                                    random_state=0)
+        estimator = MiniBatchKMeans(
+            n_clusters=20,
+            init=init,
+            n_init=1,
+            max_iter=max_iter,
+            batch_size=1000,
+            max_no_improvement=None,
+            compute_labels=False,
+            random_state=0,
+        )
 
         return estimator
 
     def make_scorers(self):
-        self.train_scorer = (
-            lambda _, __: neg_mean_inertia(self.X,
-                                           self.estimator.predict(self.X),
-                                           self.estimator.cluster_centers_))
-        self.test_scorer = (
-            lambda _, __: neg_mean_inertia(self.X_val,
-                                           self.estimator.predict(self.X_val),
-                                           self.estimator.cluster_centers_))
+        self.train_scorer = lambda _, __: neg_mean_inertia(
+            self.X, self.estimator.predict(self.X), self.estimator.cluster_centers_
+        )
+        self.test_scorer = lambda _, __: neg_mean_inertia(
+            self.X_val,
+            self.estimator.predict(self.X_val),
+            self.estimator.cluster_centers_,
+        )
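
For readers unfamiliar with asv: `param_names` and `params` define a grid, and asv runs each benchmark once per combination of values, so the reformatted tuples above are purely cosmetic. A quick illustration (not part of the commit):

    from itertools import product

    # The KMeansBenchmark grid from the diff above; asv benchmarks every combination.
    params = (["dense", "sparse"], ["full", "elkan"], ["random", "k-means++"])
    for representation, algorithm, init in product(*params):
        print(representation, algorithm, init)  # 2 x 2 x 2 = 8 combinations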
asv_benchmarks/benchmarks/common.py (125 changes: 73 additions & 52 deletions)

@@ -14,86 +14,102 @@ def get_from_config():
     """Get benchmarks configuration from the config.json file"""
     current_path = Path(__file__).resolve().parent
 
-    config_path = current_path / 'config.json'
-    with open(config_path, 'r') as config_file:
-        config_file = ''.join(line for line in config_file
-                              if line and '//' not in line)
+    config_path = current_path / "config.json"
+    with open(config_path, "r") as config_file:
+        config_file = "".join(line for line in config_file if line and "//" not in line)
         config = json.loads(config_file)
 
-    profile = os.getenv('SKLBENCH_PROFILE', config['profile'])
+    profile = os.getenv("SKLBENCH_PROFILE", config["profile"])
 
-    n_jobs_vals_env = os.getenv('SKLBENCH_NJOBS')
+    n_jobs_vals_env = os.getenv("SKLBENCH_NJOBS")
     if n_jobs_vals_env:
         n_jobs_vals = eval(n_jobs_vals_env)
     else:
-        n_jobs_vals = config['n_jobs_vals']
+        n_jobs_vals = config["n_jobs_vals"]
     if not n_jobs_vals:
         n_jobs_vals = list(range(1, 1 + cpu_count()))
 
-    cache_path = current_path / 'cache'
+    cache_path = current_path / "cache"
     cache_path.mkdir(exist_ok=True)
-    (cache_path / 'estimators').mkdir(exist_ok=True)
-    (cache_path / 'tmp').mkdir(exist_ok=True)
+    (cache_path / "estimators").mkdir(exist_ok=True)
+    (cache_path / "tmp").mkdir(exist_ok=True)
 
-    save_estimators = os.getenv('SKLBENCH_SAVE_ESTIMATORS',
-                                config['save_estimators'])
-    save_dir = os.getenv('ASV_COMMIT', 'new')[:8]
+    save_estimators = os.getenv("SKLBENCH_SAVE_ESTIMATORS", config["save_estimators"])
+    save_dir = os.getenv("ASV_COMMIT", "new")[:8]
 
     if save_estimators:
-        (cache_path / 'estimators' / save_dir).mkdir(exist_ok=True)
+        (cache_path / "estimators" / save_dir).mkdir(exist_ok=True)
 
-    base_commit = os.getenv('SKLBENCH_BASE_COMMIT', config['base_commit'])
+    base_commit = os.getenv("SKLBENCH_BASE_COMMIT", config["base_commit"])
 
-    bench_predict = os.getenv('SKLBENCH_PREDICT', config['bench_predict'])
-    bench_transform = os.getenv('SKLBENCH_TRANSFORM',
-                                config['bench_transform'])
+    bench_predict = os.getenv("SKLBENCH_PREDICT", config["bench_predict"])
+    bench_transform = os.getenv("SKLBENCH_TRANSFORM", config["bench_transform"])
 
-    return (profile, n_jobs_vals, save_estimators, save_dir, base_commit,
-            bench_predict, bench_transform)
+    return (
+        profile,
+        n_jobs_vals,
+        save_estimators,
+        save_dir,
+        base_commit,
+        bench_predict,
+        bench_transform,
+    )
 
 
 def get_estimator_path(benchmark, directory, params, save=False):
     """Get path of pickled fitted estimator"""
-    path = Path(__file__).resolve().parent / 'cache'
-    path = (path / 'estimators' / directory) if save else (path / 'tmp')
+    path = Path(__file__).resolve().parent / "cache"
+    path = (path / "estimators" / directory) if save else (path / "tmp")
 
-    filename = (benchmark.__class__.__name__
-                + '_estimator_' + '_'.join(list(map(str, params))) + '.pkl')
+    filename = (
+        benchmark.__class__.__name__
+        + "_estimator_"
+        + "_".join(list(map(str, params)))
+        + ".pkl"
+    )
 
     return path / filename
 
 
 def clear_tmp():
     """Clean the tmp directory"""
-    path = Path(__file__).resolve().parent / 'cache' / 'tmp'
+    path = Path(__file__).resolve().parent / "cache" / "tmp"
     for child in path.iterdir():
         child.unlink()
 
 
 class Benchmark(ABC):
     """Abstract base class for all the benchmarks"""
+
     timer = timeit.default_timer  # wall time
     processes = 1
     timeout = 500
 
-    (profile, n_jobs_vals, save_estimators, save_dir, base_commit,
-     bench_predict, bench_transform) = get_from_config()
-
-    if profile == 'fast':
+    (
+        profile,
+        n_jobs_vals,
+        save_estimators,
+        save_dir,
+        base_commit,
+        bench_predict,
+        bench_transform,
+    ) = get_from_config()
+
+    if profile == "fast":
         warmup_time = 0
         repeat = 1
         number = 1
         min_run_count = 1
-        data_size = 'small'
-    elif profile == 'regular':
+        data_size = "small"
+    elif profile == "regular":
         warmup_time = 1
         repeat = (3, 100, 30)
-        data_size = 'small'
-    elif profile == 'large_scale':
+        data_size = "small"
+    elif profile == "large_scale":
         warmup_time = 1
         repeat = 3
         number = 1
-        data_size = 'large'
+        data_size = "large"
 
     @property
     @abstractmethod
@@ -103,6 +119,7 @@ def params(self):
 
 class Estimator(ABC):
     """Abstract base class for all benchmarks of estimators"""
+
     @abstractmethod
     def make_data(self, params):
         """Return the dataset for a combination of parameters"""
@@ -112,8 +129,7 @@ def make_data(self, params):
 
     @abstractmethod
     def make_estimator(self, params):
-        """Return an instance of the estimator for a combination of parameters
-        """
+        """Return an instance of the estimator for a combination of parameters"""
        pass
 
    def skip(self, params):
@@ -137,9 +153,10 @@ def setup_cache(self):
 
             estimator.fit(X, y)
 
-            est_path = get_estimator_path(self, Benchmark.save_dir,
-                                          params, Benchmark.save_estimators)
-            with est_path.open(mode='wb') as f:
+            est_path = get_estimator_path(
+                self, Benchmark.save_dir, params, Benchmark.save_estimators
+            )
+            with est_path.open(mode="wb") as f:
                 pickle.dump(estimator, f)
 
     def setup(self, *params):
@@ -152,9 +169,10 @@ def setup(self, *params):
 
         self.X, self.X_val, self.y, self.y_val = self.make_data(params)
 
-        est_path = get_estimator_path(self, Benchmark.save_dir,
-                                      params, Benchmark.save_estimators)
-        with est_path.open(mode='rb') as f:
+        est_path = get_estimator_path(
+            self, Benchmark.save_dir, params, Benchmark.save_estimators
+        )
+        with est_path.open(mode="rb") as f:
             self.estimator = pickle.load(f)
 
         self.make_scorers()
@@ -166,14 +184,14 @@ def peakmem_fit(self, *args):
         self.estimator.fit(self.X, self.y)
 
     def track_train_score(self, *args):
-        if hasattr(self.estimator, 'predict'):
+        if hasattr(self.estimator, "predict"):
             y_pred = self.estimator.predict(self.X)
         else:
             y_pred = None
         return float(self.train_scorer(self.y, y_pred))
 
     def track_test_score(self, *args):
-        if hasattr(self.estimator, 'predict'):
+        if hasattr(self.estimator, "predict"):
             y_val_pred = self.estimator.predict(self.X_val)
         else:
             y_val_pred = None
@@ -182,18 +200,20 @@ def track_test_score(self, *args):
 
 class Predictor(ABC):
     """Abstract base class for benchmarks of estimators implementing predict"""
+
     if Benchmark.bench_predict:
+
         def time_predict(self, *args):
             self.estimator.predict(self.X)
 
         def peakmem_predict(self, *args):
             self.estimator.predict(self.X)
 
     if Benchmark.base_commit is not None:
+
         def track_same_prediction(self, *args):
-            est_path = get_estimator_path(self, Benchmark.base_commit,
-                                          args, True)
-            with est_path.open(mode='rb') as f:
+            est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
+            with est_path.open(mode="rb") as f:
                 estimator_base = pickle.load(f)
 
             y_val_pred_base = estimator_base.predict(self.X_val)
@@ -208,20 +228,21 @@ def params(self):
 
 
 class Transformer(ABC):
-    """Abstract base class for benchmarks of estimators implementing transform
-    """
+    """Abstract base class for benchmarks of estimators implementing transform"""
+
     if Benchmark.bench_transform:
+
         def time_transform(self, *args):
             self.estimator.transform(self.X)
 
         def peakmem_transform(self, *args):
             self.estimator.transform(self.X)
 
     if Benchmark.base_commit is not None:
+
         def track_same_transform(self, *args):
-            est_path = get_estimator_path(self, Benchmark.base_commit,
-                                          args, True)
-            with est_path.open(mode='rb') as f:
+            est_path = get_estimator_path(self, Benchmark.base_commit, args, True)
+            with est_path.open(mode="rb") as f:
                 estimator_base = pickle.load(f)
 
             X_val_t_base = estimator_base.transform(self.X_val)
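
As a side note, the reformatted `get_estimator_path` still builds the same pickle filenames as before; a hypothetical example (class name and params invented for illustration):

    # Mirrors the filename expression in get_estimator_path above.
    params = ("dense", "full", "random")
    filename = "KMeansBenchmark" + "_estimator_" + "_".join(list(map(str, params))) + ".pkl"
    print(filename)  # KMeansBenchmark_estimator_dense_full_random.pkl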
(Diff truncated: the remaining 510 changed files are not shown here.)
