Skip to content

Commit

Permalink
BUG: where coerces numeric to str incorrectly
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Jan 18, 2015
1 parent c567701 commit f677011
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 3 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.16.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ Bug Fixes
- Bug in read_csv when using skiprows on a file with CR line endings with the c engine. (:issue:`9079`)
- isnull now detects ``NaT`` in PeriodIndex (:issue:`9129`)
- Bug in groupby ``.nth()`` with a multiple column groupby (:issue:`8979`)
- Bug where ``DataFrame.where`` and ``Series.where`` incorrectly coerce numerics to strings (:issue:`9280`)
- Bug where ``DataFrame.where`` and ``Series.where`` raise ``ValueError`` when a list-like of strings is passed (:issue:`9280`)

- Fixed division by zero error for ``Series.kurt()`` when all values are equal (:issue:`9197`)

Expand Down
15 changes: 14 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas.lib as lib
import pandas.tslib as tslib
from pandas import compat
from pandas.compat import StringIO, BytesIO, range, long, u, zip, map
from pandas.compat import StringIO, BytesIO, range, long, u, zip, map, string_types

from pandas.core.config import get_option

Expand Down Expand Up @@ -1322,6 +1322,19 @@ def _possibly_downcast_to_dtype(result, dtype):
return result


def _maybe_convert_string_to_object(values):
    """
    Convert a string, or a string-dtype ndarray, to an object-dtype ndarray.

    This keeps numpy from handling the data as a fixed-width str dtype.

    Parameters
    ----------
    values : object
        A string (any of ``string_types``) is wrapped in a length-1 object
        array; an ndarray of bytes or unicode dtype is cast to object dtype;
        anything else is left alone.

    Returns
    -------
    An object-dtype ``np.ndarray`` for string-like input, otherwise
    ``values`` itself, unchanged.
    """
    if isinstance(values, string_types):
        return np.array([values], dtype=object)
    # dtype.kind is 'S' for bytes and 'U' for unicode. Checking the kind
    # avoids the np.string_ / np.unicode_ aliases, which NumPy 2.0 removed.
    if isinstance(values, np.ndarray) and values.dtype.kind in ('S', 'U'):
        return values.astype(object)
    return values


def _lcd_dtypes(a_dtype, b_dtype):
""" return the lcd dtype to hold these types """

Expand Down
6 changes: 5 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3292,7 +3292,11 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
if self.ndim == 1:

# try to set the same dtype as ourselves
new_other = np.array(other, dtype=self.dtype)
try:
new_other = np.array(other, dtype=self.dtype)
except ValueError:
new_other = np.array(other)

if not (new_other == np.array(other)).all():
other = np.array(other)

Expand Down
4 changes: 3 additions & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
ABCSparseSeries, _infer_dtype_from_scalar,
_is_null_datelike_scalar, _maybe_promote,
is_timedelta64_dtype, is_datetime64_dtype,
_possibly_infer_to_datetimelike, array_equivalent)
_possibly_infer_to_datetimelike, array_equivalent,
_maybe_convert_string_to_object)
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer)
from pandas.core.categorical import Categorical, _maybe_to_categorical, _is_categorical
Expand Down Expand Up @@ -1052,6 +1053,7 @@ def where(self, other, cond, align=True, raise_on_error=True,
values = values.T
is_transposed = not is_transposed

other = _maybe_convert_string_to_object(other)

# our where function
def func(c, v, o):
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -947,6 +947,34 @@ def test_2d_datetime64(self):
tm.assert_almost_equal(result, expected)


class TestMaybe(tm.TestCase):
    """Tests for ``com._maybe_convert_string_to_object``."""

    def test_maybe_convert_string_to_array(self):
        # a bare string is expected to come back as a length-1 object array
        result = com._maybe_convert_string_to_object('x')
        tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object))
        self.assertEqual(result.dtype, object)

        # a non-string scalar is expected to come back equal to the input
        # (assertEqual: the assertEquals alias is deprecated and was removed
        # in Python 3.12)
        result = com._maybe_convert_string_to_object(1)
        self.assertEqual(result, 1)

        # str-dtype array -> object dtype
        arr = np.array(['x', 'y'], dtype=str)
        result = com._maybe_convert_string_to_object(arr)
        tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object))
        self.assertEqual(result.dtype, object)

        # unicode
        arr = np.array(['x', 'y']).astype('U')
        result = com._maybe_convert_string_to_object(arr)
        tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object))
        self.assertEqual(result.dtype, object)

        # object
        arr = np.array(['x', 2], dtype=object)
        result = com._maybe_convert_string_to_object(arr)
        tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object))
        self.assertEqual(result.dtype, object)


if __name__ == '__main__':
    # Script entry point: hand this module to nose with the flags below and
    # keep the interpreter alive afterwards (exit=False).
    nose.runmodule(
        exit=False,
        argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
    )
22 changes: 22 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1886,6 +1886,28 @@ def test_ix_setitem(self):
self.assertEqual(self.series[d1], 4)
self.assertEqual(self.series[d2], 6)

def test_where_numeric_with_string(self):
    # GH 9280: replacing entries of an integer Series with strings through
    # ``where`` should give object dtype and keep the untouched ints as ints
    s = pd.Series([1, 2, 3])
    mask = s > 1

    # the same expectations hold for a scalar, a list and an ndarray
    replacements = ('X', ['X', 'Y', 'Z'], np.array(['X', 'Y', 'Z']))
    expected_types = (str, int, int)

    for other in replacements:
        w = s.where(mask, other)
        for pos, expected_type in enumerate(expected_types):
            self.assertTrue(isinstance(w[pos], expected_type))
        self.assertTrue(w.dtype == 'object')

def test_setitem_boolean(self):
mask = self.series > self.series.median()

Expand Down

0 comments on commit f677011

Please sign in to comment.