FIX fetch_california_housing

Use a different download source instead of lib.stat.cmu.edu, which is still broken and has not been fixed yet.
hugobowne · Jan 4, 2016 · 3daab8a · 3daab8a
1 parent e464689
commit 3daab8a
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 22 deletions.
diff --git a/examples/ensemble/plot_partial_dependence.py b/examples/ensemble/plot_partial_dependence.py
@@ -60,12 +60,7 @@
 
 
 def main():
- # fetch California housing dataset
- try:
- cal_housing = fetch_california_housing()
- except HTTPError:
- print("Failed downloading california housing data.")
- return
+ cal_housing = fetch_california_housing()
 
  # split 80/20 train-test
  X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
@@ -87,7 +82,8 @@ def main():
  print
 
  features = [0, 5, 1, 2, (5, 1)]
- fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
+ fig, axs = plot_partial_dependence(clf, X_train, features,
+ feature_names=names,
  n_jobs=3, grid_resolution=50)
  fig.suptitle('Partial dependence of house value on nonlocation features\n'
  'for the California housing dataset')
@@ -118,5 +114,6 @@ def main():
  plt.show()
 
 
-if __name__ == "__main__":
+# Needed on Windows because plot_partial_dependence uses multiprocessing
+if __name__ == '__main__':
  main()
diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
@@ -22,9 +22,11 @@
 # License: BSD 3 clause
 
 from io import BytesIO
+import os
 from os.path import exists
 from os import makedirs
-from zipfile import ZipFile
+import tarfile
+
 try:
  # Python 2
  from urllib2 import urlopen
@@ -39,8 +41,7 @@
 from ..externals import joblib
 
 
-DATA_URL = "http://lib.stat.cmu.edu/modules.php?op=modload&name=Downloads&"\
- "file=index&req=getit&lid=83"
+DATA_URL = "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
 TARGET_FILENAME = "cal_housing.pkz"
 
 # Grab the module-level docstring to use as a description of the
@@ -90,17 +91,18 @@ def fetch_california_housing(data_home=None, download_if_missing=True):
  filepath = _pkl_filepath(data_home, TARGET_FILENAME)
  if not exists(filepath):
  print('downloading Cal. housing from %s to %s' % (DATA_URL, data_home))
- fhandle = urlopen(DATA_URL)
- buf = BytesIO(fhandle.read())
- zip_file = ZipFile(buf)
- try:
- cadata_fd = zip_file.open('cadata.txt', 'r')
- cadata = BytesIO(cadata_fd.read())
- # skip the first 27 lines (documentation)
- cal_housing = np.loadtxt(cadata, skiprows=27)
- joblib.dump(cal_housing, filepath, compress=6)
- finally:
- zip_file.close()
+ archive_fileobj = BytesIO(urlopen(DATA_URL).read())
+ fileobj = tarfile.open(
+ mode="r:gz",
+ fileobj=archive_fileobj).extractfile(
+ 'CaliforniaHousing/cal_housing.data')
+
+ cal_housing = np.loadtxt(fileobj, delimiter=',')
+ # Columns are not in the same order compared to the previous
+ # URL resource on lib.stat.cmu.edu
+ columns_index = [8, 7, 2, 3, 4, 5, 6, 1, 0]
+ cal_housing = cal_housing[:, columns_index]
+ joblib.dump(cal_housing, filepath, compress=6)
  else:
  cal_housing = joblib.load(filepath)