sparse_reindex, homogenize and tests

maxwell-lv · May 12, 2011 · 87f7fa2 · 87f7fa2
1 parent 3490a91
commit 87f7fa2
Show file tree

Hide file tree

Showing 5 changed files with 168 additions and 7 deletions.
diff --git a/Makefile b/Makefile
@@ -1,3 +1,5 @@
+# Remove the build/ and dist/ directories; the leading '-' on the recipe
+# tells make to carry on even if rm reports an error.
+clean:
+ -rm -rf build dist
 
 sparse: pandas/lib/src/sparse.pyx
  -python build_cython.py build_ext --inplace

diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py
@@ -446,6 +446,33 @@ def reindex(self, new_index, method=None):
  return SparseSeries(new_values, index=new_index,
  fill_value=self.fill_value)
 
+ def sparse_reindex(self, new_index):
+ """
+ Conform sparse values to new SparseIndex
+
+ Parameters
+ ----------
+ new_index : {BlockIndex, IntIndex}
+
+ Returns
+ -------
+ reindexed : SparseSeries
+ """
+ assert(isinstance(new_index, splib.SparseIndex))
+
+ new_values = self.sp_index.to_int_index().reindex(self.sp_values,
+ self.fill_value,
+ new_index)
+
+ # indexer = self.sp_index.get_indexer(new_index)
+
+ # new_values = self.sp_values.take(indexer)
+ # new_values[indexer == -1] = self.fill_value
+
+ return SparseSeries(new_values, index=self.index,
+ sparse_index=new_index,
+ fill_value=self.fill_value)
+
  def count(self):
  sp_values = self.sp_values
  valid_spvals = np.isfinite(sp_values).sum()
@@ -688,8 +715,8 @@ def _reindex_columns(self, columns):
 
 from pandas.core.panel import WidePanel
 
-from line_profiler import LineProfiler
-prof = LineProfiler()
+# from line_profiler import LineProfiler
+# prof = LineProfiler()
 
 def stack_sparse_frame(frame):
  """
@@ -725,6 +752,37 @@ def stack_sparse_frame(frame):
  lp = LongPanel(stacked_values.reshape((nobs, 1)), ['foo'], index)
  return lp.sort('major')
 
+def homogenize(series_dict):
+ """
+ Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex
+ corresponding to the locations where they all have data
+
+ Notes
+ -----
+ Using the dumbest algorithm I could think of. Should put some more thought
+ into this
+
+ """
+ index = None
+
+ need_reindex = False
+
+ for _, series in series_dict.iteritems():
+ if index is None:
+ index = series.sp_index
+ elif not series.sp_index.equals(index):
+ need_reindex = True
+ index = index.intersect(series.sp_index)
+
+ if need_reindex:
+ output = {}
+ for name, series in series_dict.iteritems():
+ output[name] = series.sparse_reindex(index)
+ else:
+ output = series_dict
+
+ return output
+
 class SparseWidePanel(WidePanel):
  """
 

diff --git a/pandas/core/tests/test_sparse.py b/pandas/core/tests/test_sparse.py
@@ -13,7 +13,7 @@
  assert_frame_equal)
 from numpy.testing import assert_equal
 
-from pandas import DataFrame, DateRange, WidePanel
+from pandas import Series, DataFrame, DateRange, WidePanel
 from pandas.core.datetools import BDay
 from pandas.core.series import remove_na
 from pandas.core.sparse import (IntIndex, BlockIndex,
@@ -389,6 +389,45 @@ def _compare_with_series(sps, new_index):
  sp_zero = SparseSeries([], index=[], fill_value=0)
  _compare_with_series(sp, np.arange(10))
 
+ def test_sparse_reindex(self):
+ length = 10
+
+ def _check(values, index1, index2, fill_value):
+ first_series = SparseSeries(values, sparse_index=index1,
+ fill_value=fill_value)
+ reindexed = first_series.sparse_reindex(index2)
+ self.assert_(reindexed.sp_index is index2)
+
+ int_indices1 = index1.to_int_index().indices
+ int_indices2 = index2.to_int_index().indices
+
+ expected = Series(values, index=int_indices1)
+ expected = expected.reindex(int_indices2).fillna(fill_value)
+ assert_almost_equal(expected.values, reindexed.sp_values)
+
+ def _check_with_fill_value(values, first, second, fill_value=nan):
+ i_index1 = IntIndex(length, first)
+ i_index2 = IntIndex(length, second)
+
+ b_index1 = i_index1.to_block_index()
+ b_index2 = i_index2.to_block_index()
+
+ _check(values, i_index1, i_index2, fill_value)
+ _check(values, b_index1, b_index2, fill_value)
+
+ def _check_all(values, first, second):
+ _check_with_fill_value(values, first, second, fill_value=nan)
+ _check_with_fill_value(values, first, second, fill_value=0)
+
+ index1 = [2, 4, 5, 6, 8, 9]
+ values1 = np.arange(6.)
+
+ _check_all(values1, index1, [2, 4, 5])
+ _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9])
+ _check_all(values1, index1, [0, 1])
+ _check_all(values1, index1, [0, 1, 7, 8, 9])
+ _check_all(values1, index1, [])
+
  def test_repr(self):
  bsrepr = repr(self.bseries)
  isrepr = repr(self.iseries)

diff --git a/pandas/lib/src/sparse.pyx b/pandas/lib/src/sparse.pyx
@@ -98,6 +98,9 @@ cdef class IntIndex(SparseIndex):
  if not isinstance(other, IntIndex):
  return False
 
+ if self is other:
+ return True
+
  same_length = self.length == other.length
  same_indices = np.array_equal(self.indices, other.indices)
  return same_length and same_indices
@@ -198,6 +201,37 @@ cdef class IntIndex(SparseIndex):
  else:
  return -1
 
+    cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
+                          float64_t fill_value, SparseIndex other_):
+        """
+        Map values stored at this index's locations onto another index
+
+        Parameters
+        ----------
+        values : ndarray of float64
+            values[k] is the value stored at location self.indices[k]
+        fill_value : float64
+            Used for every location of `other_` not present in this index
+        other_ : SparseIndex
+            Target index; converted with to_int_index()
+
+        Returns
+        -------
+        result : ndarray of float64, one entry per point of `other_`
+
+        Notes
+        -----
+        Both location arrays are walked once, front to back, so this
+        relies on them being sorted in increasing order.
+        """
+        cdef:
+            pyst i = 0, j = 0
+            IntIndex other
+            ndarray[float64_t, ndim=1] result
+            ndarray[int32_t, ndim=1] sinds, oinds
+
+        other = other_.to_int_index()
+
+        oinds = other.indices
+        sinds = self.indices
+
+        # Start from an all-fill result and overwrite the matches
+        result = np.empty(other.npoints, dtype=np.float64)
+        result.fill(fill_value)
+
+        for 0 <= i < other.npoints:
+            # Test the bound before reading sinds[j]: j can reach
+            # self.npoints here (and equals it at once when this index
+            # has no points), and sinds[j] is then out of range
+            while j < self.npoints and oinds[i] > sinds[j]:
+                j += 1
+
+            # Nothing left in this index; the rest keeps fill_value
+            if j == self.npoints:
+                break
+
+            # After the loop oinds[i] <= sinds[j]; only an exact match
+            # carries a stored value across
+            if oinds[i] == sinds[j]:
+                result[i] = values[j]
+                j += 1
+
+        return result
+
  cpdef put(self, ndarray[float64_t, ndim=1] values,
  ndarray[int32_t, ndim=1] indices, object to_put):
  pass
@@ -322,6 +356,9 @@ cdef class BlockIndex(SparseIndex):
  if not isinstance(other, BlockIndex):
  return False
 
+ if self is other:
+ return True
+
  same_length = self.length == other.length
  same_blocks = (np.array_equal(self.blocs, other.blocs) and
  np.array_equal(self.blengths, other.blengths))
@@ -466,6 +503,30 @@ cdef class BlockIndex(SparseIndex):
 
  return -1
 
+    cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
+                          float64_t fill_value, SparseIndex other_):
+        """
+        Map values stored at this index's locations onto another index
+
+        Parameters
+        ----------
+        values : ndarray of float64
+            Values stored under this index
+        fill_value : float64
+            Used for every location of `other_` not present in this index
+        other_ : SparseIndex
+            Target index
+
+        Returns
+        -------
+        result : ndarray of float64, one entry per point of `other_`
+        """
+        # No block-aware merge is implemented here. Convert to the
+        # integer-location form and reuse IntIndex.reindex, which also
+        # converts `other_` itself, so any SparseIndex is accepted.
+        return self.to_int_index().reindex(values, fill_value, other_)
+
  cpdef put(self, ndarray[float64_t, ndim=1] values,
  ndarray[int32_t, ndim=1] indices, object to_put):
  pass

diff --git a/pandas/lib/tests/test_libsparse.py b/pandas/lib/tests/test_libsparse.py
@@ -3,10 +3,14 @@
 from pandas import Series
 
 import nose
+from numpy import nan
 import numpy as np
 import operator
 from numpy.testing import assert_almost_equal, assert_equal
 
+from pandas.core.sparse import SparseSeries
+from pandas import DataFrame
+
 from pandas.lib.sparse import IntIndex, BlockIndex
 import pandas.lib.sparse as splib
 
@@ -245,7 +249,6 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
  self.assert_(isinstance(xbindex, BlockIndex))
  self.assert_(xbindex.equals(xindex))
  self.assert_(ybindex.equals(yindex))
-
  check_cases(_check_case)
 
  def test_to_int_index(self):
@@ -267,11 +270,9 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
  check_cases(_check_case)
 
  def test_make_union(self):
+ # TODO
  pass
 
-from pandas.core.sparse import SparseSeries
-from pandas import DataFrame
-
 class TestSparseOperators(TestCase):
 
  def _nan_op_tests(self, sparse_op, python_op):