Skip to content

Commit

Permalink
ARROW-7022, ARROW-7023: [Python] fix handling of pandas Index and Per…
Browse files Browse the repository at this point in the history
…iod/Interval extension arrays in pa.array

Fixes https://issues.apache.org/jira/browse/ARROW-7022. While working on this, I noticed another bug that this change also fixes (for which I opened https://issues.apache.org/jira/browse/ARROW-7023).

Closes apache#5753 from jorisvandenbossche/ARROW-7022-arrow-array-extension and squashes the following commits:

b2f0eb5 <Joris Van den Bossche> do not fallback to ndarray for pandas ExtensionArray
53ee108 <Joris Van den Bossche> fix error message
57b7a50 <Joris Van den Bossche> ARROW-7022, ARROW-7023:  fix handling of pandas Index and Period/Interval extension arrays in pa.array

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
jorisvandenbossche authored and pitrou committed Nov 5, 2019
1 parent e0cc9c4 commit e0e8e53
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 7 deletions.
10 changes: 5 additions & 5 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,9 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
elif _is_array_like(obj):
if mask is not None:
# out argument unused
mask = get_series_values(mask, &is_pandas_object)
mask = get_values(mask, &is_pandas_object)

values = get_series_values(obj, &is_pandas_object)
values = get_values(obj, &is_pandas_object)
if is_pandas_object and from_pandas is None:
c_from_pandas = True

Expand Down Expand Up @@ -1725,9 +1725,9 @@ cdef dict _array_classes = {
}


cdef object get_series_values(object obj, bint* is_series):
if pandas_api.is_series(obj):
result = obj.values
cdef object get_values(object obj, bint* is_series):
if pandas_api.is_series(obj) or pandas_api.is_index(obj):
result = pandas_api.get_values(obj)
is_series[0] = True
elif isinstance(obj, np.ndarray):
result = obj
Expand Down
25 changes: 25 additions & 0 deletions python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ cdef class _PandasAPIShim(object):
object _datetimetz_type, _extension_array, _extension_dtype
object _array_like_types
bint has_sparse
bint _pd024

def __init__(self):
self._tried_importing_pandas = False
Expand Down Expand Up @@ -89,6 +90,8 @@ cdef class _PandasAPIShim(object):
else:
self.has_sparse = True

self._pd024 = self._loose_version >= LooseVersion('0.24')

cdef inline _check_import(self, bint raise_=True):
if self._tried_importing_pandas:
if not self._have_pandas and raise_:
Expand Down Expand Up @@ -197,6 +200,28 @@ cdef class _PandasAPIShim(object):
else:
return False

cpdef is_index(self, obj):
    """
    Check whether obj is an instance of the Index class held in
    ``self._index``.

    Returns False without inspecting obj when
    ``self._have_pandas_internal()`` is false.
    """
    if not self._have_pandas_internal():
        return False
    return isinstance(obj, self._index)

cpdef get_values(self, obj):
    """
    Return the underlying array values of a pandas Series or Index, in
    the format (np.ndarray or pandas ExtensionArray) that we need.

    The caller is expected to pass a pandas Series or Index; this is not
    checked here.

    ``obj.array`` is returned when the dtype is an Interval or Period
    dtype and the ``_pd024`` flag is set; in every other case
    ``obj.values`` is returned.
    """
    self._check_import()
    dtype_namespace = self.pd.api.types
    extension_dtypes = (dtype_namespace.IntervalDtype,
                        dtype_namespace.PeriodDtype)
    if isinstance(obj.dtype, extension_dtypes) and self._pd024:
        # only since pandas 0.24, interval and period are stored as
        # such in Series
        return obj.array
    return obj.values

def assert_frame_equal(self, *args, **kwargs):
self._check_import()
return self._pd.util.testing.assert_frame_equal
Expand Down
11 changes: 11 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,6 +1203,17 @@ def test_pandas_null_sentinels_raise_error():
assert result.null_count == (1 if ty != 'null' else 2)


@pytest.mark.pandas
def test_pandas_null_sentinels_index():
    """Converting a pandas Index must match a from_pandas=True conversion."""
    # ARROW-7023 - ensure that when passing a pandas Index, "from_pandas"
    # semantics are used
    import pandas as pd

    index_with_nan = pd.Index([1, 2, np.nan], dtype=object)
    converted = pa.array(index_with_nan)
    reference = pa.array([1, 2, np.nan], from_pandas=True)
    assert converted.equals(reference)


def test_array_from_numpy_datetimeD():
arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]')

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/test_feather.py
Original file line number Diff line number Diff line change
Expand Up @@ -542,7 +542,7 @@ def test_unsupported(self):

# period
df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
self._assert_error_on_write(df, ValueError)
self._assert_error_on_write(df, TypeError)

# non-strings
df = pd.DataFrame({'a': ['a', 1, 2.0]})
Expand Down
43 changes: 42 additions & 1 deletion python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2576,7 +2576,7 @@ def test_convert_unsupported_type_error_message():
})

expected_msg = 'Conversion failed for column a with type period'
with pytest.raises(pa.ArrowInvalid, match=expected_msg):
with pytest.raises(TypeError, match=expected_msg):
pa.Table.from_pandas(df)


Expand Down Expand Up @@ -3222,6 +3222,47 @@ def test_array_protocol():
assert result.equals(expected2)


class DummyExtensionType(pa.PyExtensionType):
    """Parameterless extension type backed by int64 storage."""

    def __init__(self):
        storage_type = pa.int64()
        pa.PyExtensionType.__init__(self, storage_type)

    def __reduce__(self):
        # The type takes no constructor arguments, so it is rebuilt by
        # calling the class with an empty argument tuple.
        return (DummyExtensionType, ())


def PandasArray__arrow_array__(self, type=None):
    """
    Stand-in ``__arrow_array__`` implementation that ignores both of its
    arguments and always returns the same DummyExtensionType array over
    the int64 values [1, 2, 3].
    """
    # hardcode dummy return regardless of self - we only want to check that
    # this method is correctly called
    int_storage = pa.array([1, 2, 3], type=pa.int64())
    dummy_type = DummyExtensionType()
    return pa.ExtensionArray.from_storage(dummy_type, int_storage)


def test_array_protocol_pandas_extension_types(monkeypatch):
    """
    Check that a patched-in ``__arrow_array__`` is used for Period and
    Interval data passed as a bare array, a Series, an Index, or a
    DataFrame column.
    """
    # ARROW-7022 - ensure protocol works for Period / Interval extension dtypes

    if LooseVersion(pd.__version__) < '0.24.0':
        pytest.skip(reason='Period/IntervalArray only introduced in 0.24')

    int_storage = pa.array([1, 2, 3], type=pa.int64())
    expected = pa.ExtensionArray.from_storage(
        DummyExtensionType(), int_storage)

    # raising=False: the attribute may not exist on the class yet
    for array_cls in (pd.arrays.PeriodArray, pd.arrays.IntervalArray):
        monkeypatch.setattr(array_cls, "__arrow_array__",
                            PandasArray__arrow_array__, raising=False)

    period_values = pd.period_range(
        "2012-01-01", periods=3, freq="D").array
    interval_values = pd.interval_range(1, 4).array

    for values in (period_values, interval_values):
        # bare extension array
        assert pa.array(values).equals(expected)
        # wrapped in a Series, then in an Index
        for wrap in (pd.Series, pd.Index):
            assert pa.array(wrap(values)).equals(expected)
        # as a DataFrame column converted through pa.table
        column = pa.table(pd.DataFrame({'a': values})).column('a')
        assert column.chunk(0).equals(expected)


# ----------------------------------------------------------------------
# Pandas ExtensionArray support

Expand Down

0 comments on commit e0e8e53

Please sign in to comment.