Skip to content

Commit

Permalink
FIX fetch_california_housing
Browse files Browse the repository at this point in the history
Using a different resource than lib.stat.cmu.edu which has not been fixed yet.
  • Loading branch information
lesteve authored and ogrisel committed Jan 4, 2016
1 parent e464689 commit 3daab8a
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 22 deletions.
13 changes: 5 additions & 8 deletions examples/ensemble/plot_partial_dependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,7 @@


def main():
# fetch California housing dataset
try:
cal_housing = fetch_california_housing()
except HTTPError:
print("Failed downloading california housing data.")
return
cal_housing = fetch_california_housing()

# split 80/20 train-test
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
Expand All @@ -87,7 +82,8 @@ def main():
print

features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
fig, axs = plot_partial_dependence(clf, X_train, features,
feature_names=names,
n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
'for the California housing dataset')
Expand Down Expand Up @@ -118,5 +114,6 @@ def main():
plt.show()


if __name__ == "__main__":
# Needed on Windows because plot_partial_dependence uses multiprocessing
if __name__ == '__main__':
main()
30 changes: 16 additions & 14 deletions sklearn/datasets/california_housing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
# License: BSD 3 clause

from io import BytesIO
import os
from os.path import exists
from os import makedirs
from zipfile import ZipFile
import tarfile

try:
# Python 2
from urllib2 import urlopen
Expand All @@ -39,8 +41,7 @@
from ..externals import joblib


DATA_URL = "http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&"\
"file=index&req=getit&lid=83"
DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
TARGET_FILENAME = "cal_housing.pkz"

# Grab the module-level docstring to use as a description of the
Expand Down Expand Up @@ -90,17 +91,18 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
filepath = _pkl_filepath(data_home, TARGET_FILENAME)
if not exists(filepath):
print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
fhandle = urlopen(DATA_URL)
buf = BytesIO(fhandle.read())
zip_file = ZipFile(buf)
try:
cadata_fd = zip_file.open('cadata.txt', 'r')
cadata = BytesIO(cadata_fd.read())
# skip the first 27 lines (documentation)
cal_housing = np.loadtxt(cadata, skiprows=27)
joblib.dump(cal_housing, filepath, compress=6)
finally:
zip_file.close()
archive_fileobj = BytesIO(urlopen(DATA_URL).read())
fileobj = tarfile.open(
mode="r:gz",
fileobj=archive_fileobj).extractfile(
'CaliforniaHousing/cal_housing.data')

cal_housing = np.loadtxt(fileobj, delimiter=',')
# Columns are not in the same order compared to the previous
# URL resource on lib.stat.cmu.edu
columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
cal_housing = cal_housing[:, columns_index]
joblib.dump(cal_housing, filepath, compress=6)
else:
cal_housing = joblib.load(filepath)

Expand Down

0 comments on commit 3daab8a

Please sign in to comment.