Skip to content

Commit

Permalink
sparse_reindex, homogenize and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed May 12, 2011
1 parent 3490a91 commit 87f7fa2
Show file tree
Hide file tree
Showing 5 changed files with 168 additions and 7 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Remove the build/ and dist/ output directories. The leading '-' tells make
# to keep going even if the command exits with an error.
clean:
-rm -rf build dist

# Build the extension modules in place via build_cython.py; sparse.pyx is
# listed as the prerequisite. Errors from the build command are ignored ('-').
sparse: pandas/lib/src/sparse.pyx
-python build_cython.py build_ext --inplace
Expand Down
62 changes: 60 additions & 2 deletions pandas/core/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,33 @@ def reindex(self, new_index, method=None):
return SparseSeries(new_values, index=new_index,
fill_value=self.fill_value)

def sparse_reindex(self, new_index):
    """
    Conform sparse values to new SparseIndex

    Parameters
    ----------
    new_index : {BlockIndex, IntIndex}
        Sparse index that the stored values should be aligned to

    Returns
    -------
    reindexed : SparseSeries
        Keeps this series' index and fill_value and uses new_index as its
        sparse index. Locations in new_index that have no stored value in
        the current sparse index hold fill_value

    Raises
    ------
    TypeError
        If new_index is not a SparseIndex
    """
    # Explicit check instead of assert: asserts are stripped under python -O,
    # which would let a bad argument reach the compiled reindex routine
    if not isinstance(new_index, splib.SparseIndex):
        raise TypeError('new_index must be a SparseIndex, got %s'
                        % type(new_index).__name__)

    # Route through the integer representation of the current sparse index
    int_index = self.sp_index.to_int_index()
    new_values = int_index.reindex(self.sp_values, self.fill_value,
                                   new_index)

    return SparseSeries(new_values, index=self.index,
                        sparse_index=new_index,
                        fill_value=self.fill_value)

def count(self):
sp_values = self.sp_values
valid_spvals = np.isfinite(sp_values).sum()
Expand Down Expand Up @@ -688,8 +715,8 @@ def _reindex_columns(self, columns):

from pandas.core.panel import WidePanel

from line_profiler import LineProfiler
prof = LineProfiler()
# from line_profiler import LineProfiler
# prof = LineProfiler()

def stack_sparse_frame(frame):
"""
Expand Down Expand Up @@ -725,6 +752,37 @@ def stack_sparse_frame(frame):
lp = LongPanel(stacked_values.reshape((nobs, 1)), ['foo'], index)
return lp.sort('major')

def homogenize(series_dict):
    """
    Bring a collection of SparseSeries (with NaN fill_value) onto one shared
    SparseIndex: the intersection of their individual sparse indexes, i.e.
    the locations where every series has data

    Parameters
    ----------
    series_dict : dict-like providing iteritems()
        Maps name -> SparseSeries

    Returns
    -------
    homogenized : dict
        A new dict of reindexed series, or series_dict itself (same object)
        when all sparse indexes already agree

    Notes
    -----
    Deliberately simple two-pass approach: the first pass intersects the
    sparse indexes, the second reindexes every series. There is room for a
    smarter algorithm here
    """
    shared = None
    mismatch = False

    for _, ser in series_dict.iteritems():
        if shared is None:
            # first series seen seeds the running intersection
            shared = ser.sp_index
            continue

        if ser.sp_index.equals(shared):
            continue

        mismatch = True
        shared = shared.intersect(ser.sp_index)

    if not mismatch:
        # nothing differs; hand back the caller's own mapping untouched
        return series_dict

    return dict((name, ser.sparse_reindex(shared))
                for name, ser in series_dict.iteritems())

class SparseWidePanel(WidePanel):
"""
Expand Down
41 changes: 40 additions & 1 deletion pandas/core/tests/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
assert_frame_equal)
from numpy.testing import assert_equal

from pandas import DataFrame, DateRange, WidePanel
from pandas import Series, DataFrame, DateRange, WidePanel
from pandas.core.datetools import BDay
from pandas.core.series import remove_na
from pandas.core.sparse import (IntIndex, BlockIndex,
Expand Down Expand Up @@ -389,6 +389,45 @@ def _compare_with_series(sps, new_index):
sp_zero = SparseSeries([], index=[], fill_value=0)
_compare_with_series(sp, np.arange(10))

def test_sparse_reindex(self):
    """
    sparse_reindex should agree with a dense Series reindex + fillna, for
    both IntIndex and BlockIndex and for NaN and 0 fill values
    """
    length = 10

    def _verify(values, source, target, fill_value):
        series = SparseSeries(values, sparse_index=source,
                              fill_value=fill_value)
        result = series.sparse_reindex(target)
        # the very same index object must end up on the result
        self.assert_(result.sp_index is target)

        source_locs = source.to_int_index().indices
        target_locs = target.to_int_index().indices

        # dense reference computation
        dense = Series(values, index=source_locs)
        expected = dense.reindex(target_locs).fillna(fill_value)
        assert_almost_equal(expected.values, result.sp_values)

    def _verify_both_kinds(values, first, second, fill_value=nan):
        int_source = IntIndex(length, first)
        int_target = IntIndex(length, second)

        block_source = int_source.to_block_index()
        block_target = int_target.to_block_index()

        _verify(values, int_source, int_target, fill_value)
        _verify(values, block_source, block_target, fill_value)

    source_locs = [2, 4, 5, 6, 8, 9]
    source_vals = np.arange(6.)

    targets = [
        [2, 4, 5],
        [2, 3, 4, 5, 6, 7, 8, 9],
        [0, 1],
        [0, 1, 7, 8, 9],
        [],
    ]

    for target_locs in targets:
        for fill_value in (nan, 0):
            _verify_both_kinds(source_vals, source_locs, target_locs,
                               fill_value=fill_value)

def test_repr(self):
bsrepr = repr(self.bseries)
isrepr = repr(self.iseries)
Expand Down
61 changes: 61 additions & 0 deletions pandas/lib/src/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ cdef class IntIndex(SparseIndex):
if not isinstance(other, IntIndex):
return False

if self is other:
return True

same_length = self.length == other.length
same_indices = np.array_equal(self.indices, other.indices)
return same_length and same_indices
Expand Down Expand Up @@ -198,6 +201,37 @@ cdef class IntIndex(SparseIndex):
else:
return -1

cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
                      float64_t fill_value, SparseIndex other_):
    """
    Align values stored at this index's locations to another sparse index

    Parameters
    ----------
    values : float64 ndarray
        One value per entry of self.indices
    fill_value : float64
        Placed at locations of other_ that this index does not contain
    other_ : SparseIndex
        Target index; converted with to_int_index() before use

    Returns
    -------
    result : float64 ndarray of length other.npoints

    Notes
    -----
    Single forward merge over both index arrays; this relies on both being
    sorted in increasing order (NOTE(review): confirm that invariant holds
    for every IntIndex)
    """
    cdef:
        pyst i = 0, j = 0
        IntIndex other
        ndarray[float64_t, ndim=1] result
        ndarray[int32_t, ndim=1] sinds, oinds

    other = other_.to_int_index()

    oinds = other.indices
    sinds = self.indices

    # start from all-fill; only matching locations get overwritten below
    result = np.empty(other.npoints, dtype=np.float64)
    result.fill(fill_value)

    for 0 <= i < other.npoints:
        # The bounds test must come first so that sinds[j] is never read
        # once j has run off the end (or when this index is empty)
        while j < self.npoints and sinds[j] < oinds[i]:
            j += 1

        if j == self.npoints:
            # no source locations left; the rest keeps fill_value
            break

        if oinds[i] == sinds[j]:
            result[i] = values[j]
            j += 1
        # otherwise oinds[i] < sinds[j]: location missing here, keep fill

    return result

cpdef put(self, ndarray[float64_t, ndim=1] values,
ndarray[int32_t, ndim=1] indices, object to_put):
pass
Expand Down Expand Up @@ -322,6 +356,9 @@ cdef class BlockIndex(SparseIndex):
if not isinstance(other, BlockIndex):
return False

if self is other:
return True

same_length = self.length == other.length
same_blocks = (np.array_equal(self.blocs, other.blocs) and
np.array_equal(self.blengths, other.blengths))
Expand Down Expand Up @@ -466,6 +503,30 @@ cdef class BlockIndex(SparseIndex):

return -1

cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values,
                      float64_t fill_value, SparseIndex other_):
    """
    Align values stored at this index's locations to another sparse index

    Parameters
    ----------
    values : float64 ndarray
        One value per sparse point of this index
    fill_value : float64
        Placed at locations of other_ that this index does not contain
    other_ : SparseIndex
        Target index

    Returns
    -------
    result : float64 ndarray with one entry per sparse point of other_

    Notes
    -----
    The previous block-wise body was unfinished: it allocated the result
    without filling it, never returned it, and its scan over the blocks had
    no upper bound. Until a proper block-wise merge is written, delegate to
    the integer-index implementation
    """
    # TODO: a block-aware merge could avoid expanding to integer locations
    return self.to_int_index().reindex(values, fill_value, other_)

cpdef put(self, ndarray[float64_t, ndim=1] values,
ndarray[int32_t, ndim=1] indices, object to_put):
pass
Expand Down
9 changes: 5 additions & 4 deletions pandas/lib/tests/test_libsparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
from pandas import Series

import nose
from numpy import nan
import numpy as np
import operator
from numpy.testing import assert_almost_equal, assert_equal

from pandas.core.sparse import SparseSeries
from pandas import DataFrame

from pandas.lib.sparse import IntIndex, BlockIndex
import pandas.lib.sparse as splib

Expand Down Expand Up @@ -245,7 +249,6 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
self.assert_(isinstance(xbindex, BlockIndex))
self.assert_(xbindex.equals(xindex))
self.assert_(ybindex.equals(yindex))

check_cases(_check_case)

def test_to_int_index(self):
Expand All @@ -267,11 +270,9 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen):
check_cases(_check_case)

def test_make_union(self):
# TODO: placeholder only; this test makes no assertions yet and always passes
pass

from pandas.core.sparse import SparseSeries
from pandas import DataFrame

class TestSparseOperators(TestCase):

def _nan_op_tests(self, sparse_op, python_op):
Expand Down

0 comments on commit 87f7fa2

Please sign in to comment.