diff --git a/doc/source/io.rst b/doc/source/io.rst
index 4cee1f7d9510b..c9872c22e2f23 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1107,8 +1107,11 @@ is ``None``. To explicity force ``Series`` parsing, pass ``typ=series``
 - ``keep_default_dates`` : boolean, default True. If parsing dates, then parse the default datelike columns
 - ``numpy`` : direct decoding to numpy arrays. default is False; Note that the JSON ordering **MUST** be the same for each term if ``numpy=True``
-- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function
-  when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
+- ``precise_float`` : boolean, default ``False``. Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (``False``) is to use fast but less precise builtin functionality
+- ``date_unit`` : string, the timestamp unit to detect if converting dates. Default
+  None. By default the timestamp precision will be detected; if this is not
+  desired, then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision
+  to seconds, milliseconds, microseconds or nanoseconds respectively.

 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parsable.

@@ -1168,6 +1171,25 @@ I like my string indicies

   sij.index
   sij.columns

+My dates have been written in nanoseconds, so they need to be read back in
+nanoseconds
+
+.. ipython:: python
+
+   json = dfj2.to_json(date_unit='ns')
+
+   # Try to parse timestamps as milliseconds -> won't work
+   dfju = pd.read_json(json, date_unit='ms')
+   dfju
+
+   # Let pandas detect the correct precision
+   dfju = pd.read_json(json)
+   dfju
+
+   # Or specify that all timestamps are in nanoseconds
+   dfju = pd.read_json(json, date_unit='ns')
+   dfju
+
 .. ipython:: python
    :suppress:

diff --git a/doc/source/release.rst b/doc/source/release.rst
index e0532b6be136f..a8a801252d68c 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -134,6 +134,9 @@ pandas 0.13
     local variable was undefined (:issue:`4381`)
   - In ``to_json``, raise if a passed ``orient`` would cause loss of data because
     of a duplicate index (:issue:`4359`)
+  - In ``to_json``, fix date handling so that milliseconds are the default
+    timestamp unit, as the docstring says (:issue:`4362`).
+  - JSON NaT handling fixed: NaTs are now serialised to `null` (:issue:`4498`)
   - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
   - Fixed bug with ``values`` raising an error on a DataFrame with duplicate
     columns and mixed dtypes, surfaced in (:issue:`4377`)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 2ee7f791c671f..ece7d460c0d33 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -535,7 +535,7 @@ def to_clipboard(self):
         clipboard.to_clipboard(self)

     def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
-                double_precision=10, force_ascii=True):
+                double_precision=10, force_ascii=True, date_unit='ms'):
         """
         Convert the object to a JSON string.
@@ -566,11 +566,15 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', - columns : dict like {column -> {index -> value}} - values : just the values array - date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601) - default is epoch + date_format : string, default 'epoch' + type of date conversion, 'epoch' for timestamp, 'iso' for ISO8601 double_precision : The number of decimal places to use when encoding floating point values, default 10. force_ascii : force encoded string to be ASCII, default True. + date_unit : string, default 'ms' (milliseconds) + The time unit to encode to, governs timestamp and ISO8601 + precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, + microsecond, and nanosecond respectively. Returns ------- @@ -580,8 +584,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', """ from pandas.io import json - return json.to_json(path_or_buf=path_or_buf, obj=self, orient=orient, date_format=date_format, - double_precision=double_precision, force_ascii=force_ascii) + return json.to_json( + path_or_buf=path_or_buf, + obj=self, orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit) # install the indexerse for _name, _indexer in indexing.get_indexers_list(): diff --git a/pandas/io/json.py b/pandas/io/json.py index 78d1bc83d6107..eb59b5c88933d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -1,9 +1,8 @@ # pylint: disable-msg=E1101,W0613,W0603 -from pandas.compat import StringIO, long -from pandas import compat -import os +from pandas.compat import long +from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime from pandas.io.common import get_filepath_or_buffer import pandas.json as _json @@ -12,32 +11,39 @@ import numpy as np from pandas.tslib import iNaT -import pandas.lib as lib ### interface to/from ### -def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True): + +def to_json(path_or_buf, obj, orient=None, date_format='epoch', + double_precision=10, force_ascii=True, date_unit='ms'): if isinstance(obj, Series): - s = SeriesWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, - ensure_ascii=force_ascii).write() + s = SeriesWriter( + obj, orient=orient, date_format=date_format, + double_precision=double_precision, ensure_ascii=force_ascii, + date_unit=date_unit).write() elif isinstance(obj, DataFrame): - s = FrameWriter(obj, orient=orient, date_format=date_format, double_precision=double_precision, - ensure_ascii=force_ascii).write() + s = FrameWriter( + obj, orient=orient, date_format=date_format, + double_precision=double_precision, ensure_ascii=force_ascii, + date_unit=date_unit).write() else: raise NotImplementedError if isinstance(path_or_buf, compat.string_types): - with open(path_or_buf,'w') as fh: + with open(path_or_buf, 'w') as fh: fh.write(s) elif path_or_buf is None: return s else: path_or_buf.write(s) + class Writer(object): - def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): + def __init__(self, obj, orient, date_format, double_precision, + ensure_ascii, date_unit): self.obj = obj if orient is None: @@ -47,38 +53,23 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii): self.date_format = date_format self.double_precision = double_precision self.ensure_ascii = ensure_ascii + self.date_unit = date_unit self.is_copy = False self._format_axes() - 
self._format_dates() - - def _needs_to_date(self, obj): - return obj.dtype == 'datetime64[ns]' - - def _format_dates(self): - raise NotImplementedError def _format_axes(self): raise NotImplementedError - def _format_to_date(self, data): - - # iso - if self.date_format == 'iso': - return data.apply(lambda x: x.isoformat()) - - # int64 - else: - return data.astype(np.int64) - - def copy_if_needed(self): - """ copy myself if necessary """ - if not self.is_copy: - self.obj = self.obj.copy() - self.is_copy = True - def write(self): - return dumps(self.obj, orient=self.orient, double_precision=self.double_precision, ensure_ascii=self.ensure_ascii) + return dumps( + self.obj, + orient=self.orient, + double_precision=self.double_precision, + ensure_ascii=self.ensure_ascii, + date_unit=self.date_unit, + iso_dates=self.date_format == 'iso') + class SeriesWriter(Writer): _default_orient = 'index' @@ -87,17 +78,7 @@ def _format_axes(self): if not self.obj.index.is_unique and self.orient == 'index': raise ValueError("Series index must be unique for orient=" "'%s'" % self.orient) - if self._needs_to_date(self.obj.index): - self.copy_if_needed() - self.obj.index = self._format_to_date(self.obj.index.to_series()) - def _format_dates(self): - if self.obj.dtype == 'datetime64[ns]': - self.obj = self._format_to_date(self.obj) - - def _format_bools(self): - if self._needs_to_bool(self.obj): - self.obj = self._format_to_bool(self.obj) class FrameWriter(Writer): _default_orient = 'columns' @@ -113,39 +94,10 @@ def _format_axes(self): raise ValueError("DataFrame columns must be unique for orient=" "'%s'." % self.orient) - if self.orient == 'columns': - axis = 'index' - elif self.orient == 'index': - axis = 'columns' - else: - return - - a = getattr(self.obj,axis) - if self._needs_to_date(a): - self.copy_if_needed() - setattr(self.obj,axis,self._format_to_date(a.to_series())) - - def _format_dates(self): - dtypes = self.obj.dtypes - if len(dtypes[dtypes == 'datetime64[ns]']): - - # need to create a new object - d = {} - - for i, (col, c) in enumerate(self.obj.iteritems()): - - if c.dtype == 'datetime64[ns]': - c = self._format_to_date(c) - - d[i] = c - - d = DataFrame(d,index=self.obj.index) - d.columns = self.obj.columns - self.obj = d def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False): + numpy=False, precise_float=False, date_unit=None): """ Convert JSON string to pandas object @@ -176,18 +128,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, values : just the values array typ : type of object to recover (series or frame), default 'frame' - dtype : if True, infer dtypes, if a dict of column to dtype, then use those, - if False, then don't infer dtypes at all, default is True, - apply only to the data - convert_axes : boolean, try to convert the axes to the proper dtypes, default is True - convert_dates : a list of columns to parse for dates; If True, then try to parse datelike columns - default is True - keep_default_dates : boolean, default True. If parsing dates, - then parse the default datelike columns - numpy : direct decoding to numpy arrays. default is False.Note that the JSON ordering MUST be the same - for each term if numpy=True. - precise_float : boolean, default False. Set to enable usage of higher precision (strtod) function - when decoding string to double values. 
Default (False) is to use fast but less precise builtin functionality
+    dtype : boolean or dict, default True
+        If True, infer dtypes, if a dict of column to dtype, then use those,
+        if False, then don't infer dtypes at all, applies only to the data.
+    convert_axes : boolean, default True
+        Try to convert the axes to the proper dtypes.
+    convert_dates : boolean or list, default True
+        List of columns to parse for dates; if True, then try to parse
+        datelike columns.
+    keep_default_dates : boolean, default True
+        If parsing dates, then parse the default datelike columns
+    numpy : boolean, default False
+        Direct decoding to numpy arrays. Note that the JSON ordering MUST be
+        the same for each term if numpy=True.
+    precise_float : boolean, default False
+        Set to enable usage of higher precision (strtod) function when
+        decoding string to double values. Default (False) is to use fast but
+        less precise builtin functionality
+    date_unit : string, default None
+        The timestamp unit to detect if converting dates. The default
+        behaviour is to try to detect the correct precision, but if this is
+        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
+        only seconds, milliseconds, microseconds or nanoseconds respectively.

     Returns
     -------
@@ -208,20 +170,28 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
     obj = None
     if typ == 'frame':
-        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
+        obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit).parse()

     if typ == 'series' or obj is None:
         if not isinstance(dtype,bool):
             dtype = dict(data = dtype)
-        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy).parse()
+        obj = SeriesParser(json, orient, dtype, convert_axes, convert_dates, keep_default_dates, numpy, precise_float, date_unit).parse()

     return obj

+
 class Parser(object):
+    _STAMP_UNITS = ('s', 'ms', 'us', 'ns')
+    _MIN_STAMPS = {
+        's': long(31536000),
+        'ms': long(31536000000),
+        'us': long(31536000000000),
+        'ns': long(31536000000000000)}
+
     def __init__(self, json, orient, dtype=True, convert_axes=True,
                  convert_dates=True, keep_default_dates=False, numpy=False,
-                 precise_float=False):
+                 precise_float=False, date_unit=None):
         self.json = json

         if orient is None:
@@ -233,10 +203,20 @@ def __init__(self, json, orient, dtype=True, convert_axes=True,
         if orient == "split":
             numpy = False

+        if date_unit is not None:
+            date_unit = date_unit.lower()
+            if date_unit not in self._STAMP_UNITS:
+                raise ValueError('date_unit must be one of %s' %
+                                 (self._STAMP_UNITS,))
+            self.min_stamp = self._MIN_STAMPS[date_unit]
+        else:
+            self.min_stamp = self._MIN_STAMPS['s']
+
         self.numpy = numpy
         self.precise_float = precise_float
-        self.convert_axes = convert_axes 
+        self.convert_axes = convert_axes
         self.convert_dates = convert_dates
+        self.date_unit = date_unit
         self.keep_default_dates = keep_default_dates
         self.obj = None

@@ -356,21 +336,23 @@ def _try_convert_to_date(self, data):

         # ignore numbers that are out of range
-        if issubclass(new_data.dtype.type,np.number):
-            if not ((new_data == iNaT) | (new_data > long(31536000000000000))).all():
+        if issubclass(new_data.dtype.type, np.number):
+            in_range = (isnull(new_data.values) | (new_data > self.min_stamp) |
+                        (new_data.values == iNaT))
+            if not in_range.all():
                 return data, False

-        try:
-            new_data = to_datetime(new_data)
-        except:
+        date_units = (self.date_unit,) if
self.date_unit else self._STAMP_UNITS + for date_unit in date_units: try: - new_data = to_datetime(new_data.astype('int64')) + new_data = to_datetime(new_data, errors='raise', + unit=date_unit) + except OverflowError: + continue except: - - # return old, noting more we can do - return data, False - - return new_data, True + break + return new_data, True + return data, False def _try_convert_dates(self): raise NotImplementedError diff --git a/pandas/io/tests/test_json/data/tsframe_iso_v012.json b/pandas/io/tests/test_json/data/tsframe_iso_v012.json new file mode 100644 index 0000000000000..bd9ff885ad23a --- /dev/null +++ b/pandas/io/tests/test_json/data/tsframe_iso_v012.json @@ -0,0 +1 @@ +{"A":{"2000-01-03T00:00:00":1.56808523,"2000-01-04T00:00:00":-0.2550111,"2000-01-05T00:00:00":1.51493992,"2000-01-06T00:00:00":-0.02765498,"2000-01-07T00:00:00":0.05951614},"B":{"2000-01-03T00:00:00":0.65727391,"2000-01-04T00:00:00":-0.08072427,"2000-01-05T00:00:00":0.11805825,"2000-01-06T00:00:00":0.44679743,"2000-01-07T00:00:00":-2.69652057},"C":{"2000-01-03T00:00:00":1.81021139,"2000-01-04T00:00:00":-0.03202878,"2000-01-05T00:00:00":1.629455,"2000-01-06T00:00:00":0.33192641,"2000-01-07T00:00:00":1.28163262},"D":{"2000-01-03T00:00:00":-0.17251653,"2000-01-04T00:00:00":-0.17581665,"2000-01-05T00:00:00":-1.31506612,"2000-01-06T00:00:00":-0.27885413,"2000-01-07T00:00:00":0.34703478},"date":{"2000-01-03T00:00:00":"1992-01-06T18:21:32.120000","2000-01-04T00:00:00":"1992-01-06T18:21:32.120000","2000-01-05T00:00:00":"1992-01-06T18:21:32.120000","2000-01-06T00:00:00":"2013-01-01T00:00:00","2000-01-07T00:00:00":"1992-01-06T18:21:32.120000"}} \ No newline at end of file diff --git a/pandas/io/tests/test_json/data/tsframe_v012.json b/pandas/io/tests/test_json/data/tsframe_v012.json new file mode 100644 index 0000000000000..d4474c767855c --- /dev/null +++ b/pandas/io/tests/test_json/data/tsframe_v012.json @@ -0,0 +1 @@ +{"A":{"946857600000000000":1.56808523,"946944000000000000":-0.2550111,"947030400000000000":1.51493992,"947116800000000000":-0.02765498,"947203200000000000":0.05951614},"B":{"946857600000000000":0.65727391,"946944000000000000":-0.08072427,"947030400000000000":0.11805825,"947116800000000000":0.44679743,"947203200000000000":-2.69652057},"C":{"946857600000000000":1.81021139,"946944000000000000":-0.03202878,"947030400000000000":1.629455,"947116800000000000":0.33192641,"947203200000000000":1.28163262},"D":{"946857600000000000":-0.17251653,"946944000000000000":-0.17581665,"947030400000000000":-1.31506612,"947116800000000000":-0.27885413,"947203200000000000":0.34703478},"date":{"946857600000000000":694722092120000000,"946944000000000000":694722092120000000,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000},"modified":{"946857600000000000":694722092120000000,"946944000000000000":null,"947030400000000000":694722092120000000,"947116800000000000":1356998400000000000,"947203200000000000":694722092120000000}} \ No newline at end of file diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index cd0e56db84256..1f79f5670cc75 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -1,11 +1,7 @@ - # pylint: disable-msg=W0612,E1101 -from copy import deepcopy -from datetime import datetime, timedelta -from pandas.compat import range, lrange, StringIO, cPickle as pickle +from pandas.compat import range, lrange, StringIO from pandas import compat from 
pandas.io.common import URLError
-import operator
 import os
 import unittest

@@ -37,6 +33,8 @@ class TestPandasContainer(unittest.TestCase):

     def setUp(self):
+        self.dirpath = tm.get_data_path()
+
         self.ts = tm.makeTimeSeries()
         self.ts.name = 'ts'

@@ -119,7 +117,8 @@ def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_
                 check_dtype=False
             if not convert_axes and df.index.dtype.type == np.datetime64:
-                unser.index = DatetimeIndex(unser.index.values.astype('i8'))
+                unser.index = DatetimeIndex(
+                    unser.index.values.astype('i8') * 1e6)
             if orient == "records":
                 # index is not captured in this orientation
                 assert_almost_equal(df.values, unser.values)
@@ -280,6 +279,29 @@ def test_frame_to_json_except(self):
         df = DataFrame([1, 2, 3])
         self.assertRaises(ValueError, df.to_json, orient="garbage")

+    def test_v12_compat(self):
+        df = DataFrame(
+            [[1.56808523, 0.65727391, 1.81021139, -0.17251653],
+             [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
+             [1.51493992, 0.11805825, 1.629455, -1.31506612],
+             [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
+             [0.05951614, -2.69652057, 1.28163262, 0.34703478]],
+            columns=['A', 'B', 'C', 'D'],
+            index=pd.date_range('2000-01-03', '2000-01-07'))
+        df['date'] = pd.Timestamp('19920106 18:21:32.12')
+        df.ix[3, 'date'] = pd.Timestamp('20130101')
+        df['modified'] = df['date']
+        df.ix[1, 'modified'] = pd.NaT
+
+        v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
+        df_unser = pd.read_json(v12_json)
+        assert_frame_equal(df, df_unser)
+
+        df_iso = df.drop(['modified'], axis=1)
+        v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
+        df_unser_iso = pd.read_json(v12_iso_json)
+        assert_frame_equal(df_iso, df_unser_iso)
+
     def test_series_non_unique_index(self):
         s = Series(['a', 'b'], index=[1, 1])

@@ -295,11 +318,10 @@ def test_series_from_json_to_json(self):

         def _check_orient(series, orient, dtype=None, numpy=False):
             series = series.sort_index()
-            unser = read_json(series.to_json(orient=orient), typ='series',
-                              orient=orient, numpy=numpy, dtype=dtype)
+            unser = read_json(series.to_json(orient=orient),
+                              typ='series', orient=orient, numpy=numpy,
+                              dtype=dtype)
             unser = unser.sort_index()
-            #if series.index.dtype.type == np.datetime64:
-            #    unser.index = DatetimeIndex(unser.index.values.astype('i8'))
             if orient == "records" or orient == "values":
                 assert_almost_equal(series.values, unser.values)
             else:
@@ -380,12 +402,12 @@ def test_axis_dates(self):
         # frame
         json = self.tsframe.to_json()
         result = read_json(json)
-        assert_frame_equal(result,self.tsframe)
+        assert_frame_equal(result, self.tsframe)

         # series
         json = self.ts.to_json()
-        result = read_json(json,typ='series')
-        assert_series_equal(result,self.ts)
+        result = read_json(json, typ='series')
+        assert_series_equal(result, self.ts)

     def test_convert_dates(self):

@@ -395,39 +417,84 @@ def test_convert_dates(self):

         json = df.to_json()
         result = read_json(json)
-        assert_frame_equal(result,df)
+        assert_frame_equal(result, df)

         df['foo'] = 1.
- json = df.to_json() - result = read_json(json,convert_dates=False) + json = df.to_json(date_unit='ns') + result = read_json(json, convert_dates=False) expected = df.copy() expected['date'] = expected['date'].values.view('i8') expected['foo'] = expected['foo'].astype('int64') - assert_frame_equal(result,expected) + assert_frame_equal(result, expected) # series - ts = Series(Timestamp('20130101'),index=self.ts.index) + ts = Series(Timestamp('20130101'), index=self.ts.index) json = ts.to_json() - result = read_json(json,typ='series') - assert_series_equal(result,ts) + result = read_json(json, typ='series') + assert_series_equal(result, ts) + + def test_date_format_frame(self): + df = self.tsframe.copy() + + def test_w_date(date, date_unit=None): + df['date'] = Timestamp(date) + df.ix[1, 'date'] = pd.NaT + df.ix[5, 'date'] = pd.NaT + if date_unit: + json = df.to_json(date_format='iso', date_unit=date_unit) + else: + json = df.to_json(date_format='iso') + result = read_json(json) + assert_frame_equal(result, df) + + test_w_date('20130101 20:43:42.123') + test_w_date('20130101 20:43:42', date_unit='s') + test_w_date('20130101 20:43:42.123', date_unit='ms') + test_w_date('20130101 20:43:42.123456', date_unit='us') + test_w_date('20130101 20:43:42.123456789', date_unit='ns') + + self.assertRaises(ValueError, df.to_json, date_format='iso', + date_unit='foo') + + def test_date_format_series(self): + def test_w_date(date, date_unit=None): + ts = Series(Timestamp(date), index=self.ts.index) + ts.ix[1] = pd.NaT + ts.ix[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format='iso', date_unit=date_unit) + else: + json = ts.to_json(date_format='iso') + result = read_json(json, typ='series') + assert_series_equal(result, ts) + + test_w_date('20130101 20:43:42.123') + test_w_date('20130101 20:43:42', date_unit='s') + test_w_date('20130101 20:43:42.123', date_unit='ms') + test_w_date('20130101 20:43:42.123456', date_unit='us') + test_w_date('20130101 20:43:42.123456789', date_unit='ns') - def test_date_format(self): + ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index) + self.assertRaises(ValueError, ts.to_json, date_format='iso', + date_unit='foo') + def test_date_unit(self): df = self.tsframe.copy() - df['date'] = Timestamp('20130101') - df_orig = df.copy() + df['date'] = Timestamp('20130101 20:43:42') + df.ix[1, 'date'] = Timestamp('19710101 20:43:42') + df.ix[2, 'date'] = Timestamp('21460101 20:43:42') + df.ix[4, 'date'] = pd.NaT - json = df.to_json(date_format='iso') - result = read_json(json) - assert_frame_equal(result,df_orig) + for unit in ('s', 'ms', 'us', 'ns'): + json = df.to_json(date_format='epoch', date_unit=unit) - # make sure that we did in fact copy - assert_frame_equal(df,df_orig) + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) - ts = Series(Timestamp('20130101'),index=self.ts.index) - json = ts.to_json(date_format='iso') - result = read_json(json,typ='series') - assert_series_equal(result,ts) + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) def test_weird_nested_json(self): diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py index ff684e30b206d..831a426ee8307 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/test_json/test_ujson.py @@ -1,5 +1,4 @@ -import unittest -from unittest import TestCase +from unittest import TestCase try: import json @@ -13,20 +12,17 @@ import datetime import calendar import re 
-import random import decimal from functools import partial from pandas.compat import range, zip, StringIO, u -from pandas import compat import pandas.json as ujson import pandas.compat as compat import numpy as np -from pandas.util.testing import assert_almost_equal from numpy.testing import (assert_array_equal, assert_array_almost_equal_nulp, assert_approx_equal) -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, NaT, DatetimeIndex import pandas.util.testing as tm @@ -327,38 +323,58 @@ def test_encodeFalseConversion(self): self.assertEquals(input, json.loads(output)) self.assertEquals(output, json.dumps(input)) self.assertEquals(input, ujson.decode(output)) - pass - # def test_encodeDatetimeConversion(self): - # ts = time.time() - # input = datetime.datetime.fromtimestamp(ts) - # output = ujson.encode(input) - # expected = calendar.timegm(input.utctimetuple()) - # self.assertEquals(int(expected), json.loads(output)) - # self.assertEquals(int(expected), ujson.decode(output)) - # pass + def test_encodeDatetimeConversion(self): + ts = time.time() + input = datetime.datetime.fromtimestamp(ts) + output = ujson.encode(input, date_unit='s') + expected = calendar.timegm(input.utctimetuple()) + self.assertEquals(int(expected), json.loads(output)) + self.assertEquals(int(expected), ujson.decode(output)) + + def test_encodeDateConversion(self): + ts = time.time() + input = datetime.date.fromtimestamp(ts) + + output = ujson.encode(input, date_unit='s') + tup = (input.year, input.month, input.day, 0, 0, 0) - # def test_encodeDateConversion(self): - # ts = time.time() - # input = datetime.date.fromtimestamp(ts) + expected = calendar.timegm(tup) + self.assertEquals(int(expected), json.loads(output)) + self.assertEquals(int(expected), ujson.decode(output)) + + def test_nat(self): + input = NaT + assert ujson.encode(input) == 'null', "Expected null" - # output = ujson.encode(input) - # tup = ( input.year, input.month, input.day, 0, 0, 0 ) + def test_npy_nat(self): + from distutils.version import LooseVersion + if LooseVersion(np.__version__) < '1.7.0': + raise nose.SkipTest - # expected = calendar.timegm(tup) - # self.assertEquals(int(expected), json.loads(output)) - # self.assertEquals(int(expected), ujson.decode(output)) + input = np.datetime64('NaT') + assert ujson.encode(input) == 'null', "Expected null" - def test_datetime_nanosecond_unit(self): - from datetime import datetime + def test_datetime_units(self): from pandas.lib import Timestamp - val = datetime.now() + val = datetime.datetime.now() stamp = Timestamp(val) - roundtrip = ujson.decode(ujson.encode(val)) + roundtrip = ujson.decode(ujson.encode(val, date_unit='s')) + self.assert_(roundtrip == stamp.value // 1e9) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='ms')) + self.assert_(roundtrip == stamp.value // 1e6) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='us')) + self.assert_(roundtrip == stamp.value / 1e3) + + roundtrip = ujson.decode(ujson.encode(val, date_unit='ns')) self.assert_(roundtrip == stamp.value) + self.assertRaises(ValueError, ujson.encode, val, date_unit='foo') + def test_encodeToUTF8(self): _skip_if_python_ver(2, 5) input = "\xe6\x97\xa5\xd1\x88" @@ -1267,17 +1283,17 @@ def testIndex(self): self.assert_(i.equals(outp)) def test_datetimeindex(self): - from pandas.tseries.index import date_range, DatetimeIndex + from pandas.tseries.index import date_range rng = date_range('1/1/2000', periods=20) - encoded = ujson.encode(rng) + encoded = ujson.encode(rng, 
date_unit='ns')
         decoded = DatetimeIndex(np.array(ujson.decode(encoded)))
         self.assert_(rng.equals(decoded))

         ts = Series(np.random.randn(len(rng)), index=rng)
-        decoded = Series(ujson.decode(ujson.encode(ts)))
+        decoded = Series(ujson.decode(ujson.encode(ts, date_unit='ns')))
         idx_values = decoded.index.values.astype(np.int64)
         decoded.index = DatetimeIndex(idx_values)
         tm.assert_series_equal(ts, decoded)
diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c
index bebaf89de341d..f28ed137383c6 100644
--- a/pandas/src/ujson/python/objToJSON.c
+++ b/pandas/src/ujson/python/objToJSON.c
@@ -38,20 +38,24 @@ Numeric decoder derived from from TCL library
 #include "py_defines.h"
 #include <stdio.h>
 #include <datetime.h>
 #include <numpy/arrayobject.h>
 #include <ultrajson.h>
+#include <numpy/arrayscalars.h>
+#include <np_datetime.h>
+#include <np_datetime_strings.h>

-#define EPOCH_ORD 719163
 static PyObject* type_decimal;

 #define NPY_JSON_BUFSIZE 32768

-static PyObject* cls_dataframe;
-static PyObject* cls_series;
-static PyObject* cls_index;
+static PyTypeObject* cls_dataframe;
+static PyTypeObject* cls_series;
+static PyTypeObject* cls_index;
+static PyTypeObject* cls_nat;

 typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, void *outValue, size_t *_outLen);

@@ -63,7 +67,6 @@ typedef struct __NpyArrContext
 {
   PyObject *array;
   char* dataptr;
-  int was_datetime64;
   int curdim;    // current dimension in array's order
   int stridedim; // dimension we are striding over
   int inc;       // stride dimension increment (+/- 1)
   npy_intp stride;
   npy_intp ndim;
   npy_intp index[NPY_MAXDIMS];
-  PyArray_GetItemFunc* getitem;

   char** rowLabels;
   char** columnLabels;
@@ -96,7 +98,7 @@ typedef struct __TypeContext

   JSINT64 longValue;

-  char *citemName;
+  char *cStr;
   NpyArrContext *npyarr;
   int transpose;
   char** rowLabels;
@@ -112,6 +114,9 @@ typedef struct __PyObjectEncoder
   // pass through the NpyArrContext when encoding multi-dimensional arrays
   NpyArrContext* npyCtxtPassthru;

+  int datetimeIso;
+  PANDAS_DATETIMEUNIT datetimeUnit;
+
   // output format style for pandas data types
   int outputFormat;
   int originalOutputFormat;
@@ -144,7 +149,8 @@ void initObjToJSON(void)
 int initObjToJSON(void)
 #endif
 {
-  PyObject *mod_frame;
+  PyObject *mod_pandas;
+  PyObject *mod_tslib;
   PyObject* mod_decimal = PyImport_ImportModule("decimal");
   type_decimal = PyObject_GetAttrString(mod_decimal, "Decimal");
   Py_INCREF(type_decimal);
@@ -152,13 +158,20 @@ int initObjToJSON(void)

   PyDateTime_IMPORT;

-  mod_frame = PyImport_ImportModule("pandas.core.frame");
-  if (mod_frame)
+  mod_pandas = PyImport_ImportModule("pandas");
+  if (mod_pandas)
   {
-    cls_dataframe = PyObject_GetAttrString(mod_frame, "DataFrame");
-    cls_index = PyObject_GetAttrString(mod_frame, "Index");
-    cls_series = PyObject_GetAttrString(mod_frame, "Series");
-    Py_DECREF(mod_frame);
+    cls_dataframe = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "DataFrame");
+    cls_index = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Index");
+    cls_series = (PyTypeObject*) PyObject_GetAttrString(mod_pandas, "Series");
+    Py_DECREF(mod_pandas);
+  }
+
+  mod_tslib = PyImport_ImportModule("pandas.tslib");
+  if (mod_tslib)
+  {
+    cls_nat = (PyTypeObject*) PyObject_GetAttrString(mod_tslib, "NaTType");
+    Py_DECREF(mod_tslib);
   }

   /* Initialise numpy API */
@@ -187,9 +200,9 @@ static void *PyLongToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size

 static void *NpyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
 {
-  PyObject *obj = (PyObject *) _obj;
-
PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); - return NULL; + PyObject *obj = (PyObject *) _obj; + PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DOUBLE)); + return NULL; } static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) @@ -217,41 +230,83 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si return PyString_AS_STRING(newObj); } -static void *NpyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) { - PyObject *obj = (PyObject *) _obj; - PyArray_CastScalarToCtype(obj, outValue, PyArray_DescrFromType(NPY_DATETIME)); + int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; + + if (((PyObjectEncoder*) tc->encoder)->datetimeIso) + { + PRINTMARK(); + *_outLen = (size_t) get_datetime_iso_8601_strlen(0, base); + GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); + if (!GET_TC(tc)->cStr) + { + PyErr_NoMemory(); + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; + return NULL; + } + + if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, 0, base, -1, NPY_UNSAFE_CASTING)) + { + PRINTMARK(); + *_outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; + } + else + { + PRINTMARK(); + PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); + PyObject_Free(GET_TC(tc)->cStr); + return NULL; + } + } + else + { + PRINTMARK(); + *((JSINT64*)outValue) = pandas_datetimestruct_to_datetime(base, dts); return NULL; + } +} + +static void *NpyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +{ + PRINTMARK(); + pandas_datetimestruct dts; + PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; + + pandas_datetime_to_datetimestruct(obj->obval, obj->obmeta.base, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } -static void *PyDateTimeToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { pandas_datetimestruct dts; PyObject *obj = (PyObject *) _obj; - dts.year = PyDateTime_GET_YEAR(obj); - dts.month = PyDateTime_GET_MONTH(obj); - dts.day = PyDateTime_GET_DAY(obj); - dts.hour = PyDateTime_DATE_GET_HOUR(obj); - dts.min = PyDateTime_DATE_GET_MINUTE(obj); - dts.sec = PyDateTime_DATE_GET_SECOND(obj); - dts.us = PyDateTime_DATE_GET_MICROSECOND(obj); - dts.ps = dts.as = 0; - *((JSINT64*)outValue) = (JSINT64) pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); - return NULL; + + if (!convert_pydatetime_to_datetimestruct(obj, &dts, NULL, 1)) + { + PRINTMARK(); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + } + else + { + if (!PyErr_Occurred()) + { + PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string"); + } + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; + return NULL; + } } -static void *PyDateToINT64(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) +static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { pandas_datetimestruct dts; PyObject *obj = (PyObject *) _obj; - dts.year = PyDateTime_GET_YEAR(obj); - dts.month = PyDateTime_GET_MONTH(obj); - dts.day = PyDateTime_GET_DAY(obj); - dts.hour = dts.min = dts.sec = dts.ps = dts.as = 0; - *((JSINT64*)outValue) = (JSINT64) 
pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts); - return NULL; + pandas_datetime_to_datetimestruct(PyLong_AsLongLong(obj), PANDAS_FR_ns, &dts); + return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } //============================================================================= @@ -265,7 +320,6 @@ int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { PyArrayObject *obj; - PyArray_Descr *dtype; NpyArrContext *npyarr; if (GET_TC(tc)->newObj) @@ -290,17 +344,7 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) return; } - // uber hack to support datetime64[ns] arrays - if (PyArray_DESCR(obj)->type_num == NPY_DATETIME) { - npyarr->was_datetime64 = 1; - dtype = PyArray_DescrFromType(NPY_INT64); - obj = (PyArrayObject *) PyArray_CastToType(obj, dtype, 0); - } else { - npyarr->was_datetime64 = 0; - } - npyarr->array = (PyObject*) obj; - npyarr->getitem = (PyArray_GetItemFunc*) PyArray_DESCR(obj)->f->getitem; npyarr->dataptr = PyArray_DATA(obj); npyarr->ndim = PyArray_NDIM(obj) - 1; npyarr->curdim = 0; @@ -338,10 +382,6 @@ void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) if (npyarr) { - if (npyarr->was_datetime64) { - Py_XDECREF(npyarr->array); - } - if (GET_TC(tc)->itemValue != npyarr->array) { Py_XDECREF(GET_TC(tc)->itemValue); @@ -384,6 +424,11 @@ int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); npyarr = GET_TC(tc)->npyarr; + if (PyErr_Occurred()) + { + return 0; + } + if (GET_TC(tc)->itemValue != npyarr->array) { Py_XDECREF(GET_TC(tc)->itemValue); @@ -395,7 +440,7 @@ int NpyArr_iterNextItem(JSOBJ _obj, JSONTypeContext *tc) return 0; } - GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); + GET_TC(tc)->itemValue = PyArray_ToScalar(npyarr->dataptr, npyarr->array); npyarr->dataptr += npyarr->stride; npyarr->index[npyarr->stridedim]++; @@ -408,6 +453,12 @@ int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); npyarr = GET_TC(tc)->npyarr; + if (PyErr_Occurred()) + { + PRINTMARK(); + return 0; + } + if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) { // innermost dimension, start retrieving item values @@ -720,8 +771,8 @@ char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->index = 0; - GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); - if (!GET_TC(tc)->citemName) + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); + if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -731,7 +782,7 @@ void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t index; - if (!GET_TC(tc)->citemName) + if (!GET_TC(tc)->cStr) { return 0; } @@ -740,13 +791,13 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); } else @@ -762,10 +813,6 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) { - if (GET_TC(tc)->citemName) - { - PyObject_Free(GET_TC(tc)->citemName); - } PRINTMARK(); } @@ -776,8 +823,8 @@ JSOBJ Index_iterGetValue(JSOBJ 
obj, JSONTypeContext *tc) char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->citemName); - return GET_TC(tc)->citemName; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -787,9 +834,9 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series - if (!GET_TC(tc)->citemName) + if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -799,7 +846,7 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t index; - if (!GET_TC(tc)->citemName) + if (!GET_TC(tc)->cStr) { return 0; } @@ -808,19 +855,19 @@ int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->citemName, "name", sizeof(char)*5); + memcpy(GET_TC(tc)->cStr, "name", sizeof(char)*5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "name"); } else if (index == 1) { - memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); } else @@ -838,10 +885,6 @@ void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; enc->outputFormat = enc->originalOutputFormat; - if (GET_TC(tc)->citemName) - { - PyObject_Free(GET_TC(tc)->citemName); - } PRINTMARK(); } @@ -852,8 +895,8 @@ JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->citemName); - return GET_TC(tc)->citemName; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -863,9 +906,9 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; GET_TC(tc)->index = 0; - GET_TC(tc)->citemName = PyObject_Malloc(20 * sizeof(char)); + GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); enc->outputFormat = VALUES; // for contained series & index - if (!GET_TC(tc)->citemName) + if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -875,7 +918,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { Py_ssize_t index; - if (!GET_TC(tc)->citemName) + if (!GET_TC(tc)->cStr) { return 0; } @@ -884,19 +927,19 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) Py_XDECREF(GET_TC(tc)->itemValue); if (index == 0) { - memcpy(GET_TC(tc)->citemName, "columns", sizeof(char)*8); + memcpy(GET_TC(tc)->cStr, "columns", sizeof(char)*8); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "columns"); } else if (index == 1) { - memcpy(GET_TC(tc)->citemName, "index", sizeof(char)*6); + memcpy(GET_TC(tc)->cStr, "index", sizeof(char)*6); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "index"); } else if (index == 2) { - memcpy(GET_TC(tc)->citemName, "data", sizeof(char)*5); + memcpy(GET_TC(tc)->cStr, "data", 
sizeof(char)*5); GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); } else @@ -914,10 +957,6 @@ void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder* enc = (PyObjectEncoder*) tc->encoder; enc->outputFormat = enc->originalOutputFormat; - if (GET_TC(tc)->citemName) - { - PyObject_Free(GET_TC(tc)->citemName); - } PRINTMARK(); } @@ -928,8 +967,8 @@ JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { - *outLen = strlen(GET_TC(tc)->citemName); - return GET_TC(tc)->citemName; + *outLen = strlen(GET_TC(tc)->cStr); + return GET_TC(tc)->cStr; } //============================================================================= @@ -1023,15 +1062,12 @@ void NpyArr_freeLabels(char** labels, npy_intp len) char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_intp num) { // NOTE this function steals a reference to labels. - PyArray_Descr *dtype = NULL; PyArrayObject* labelsTmp = NULL; PyObject* item = NULL; npy_intp i, stride, len; - // npy_intp bufsize = 32768; char** ret; char *dataptr, *cLabel, *origend, *origst, *origoffset; char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc* getitem; PRINTMARK(); if (PyArray_SIZE(labels) < num) @@ -1058,20 +1094,12 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in origend = enc->end; origoffset = enc->offset; - if (PyArray_DESCR(labels)->type_num == NPY_DATETIME) { - dtype = PyArray_DescrFromType(NPY_INT64); - labelsTmp = labels; - labels = (PyArrayObject *) PyArray_CastToType(labels, dtype, 0); - Py_DECREF(labelsTmp); - } - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = PyArray_DESCR(labels)->f->getitem; for (i = 0; i < num; i++) { - item = getitem(dataptr, labels); + item = PyArray_ToScalar(dataptr, labels); if (!item) { NpyArr_freeLabels(ret, num); @@ -1150,7 +1178,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) pc->index = 0; pc->size = 0; pc->longValue = 0; - pc->citemName = NULL; + pc->cStr = NULL; pc->npyarr = NULL; pc->rowLabels = NULL; pc->columnLabels = NULL; @@ -1158,6 +1186,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) pc->rowLabelsLen = 0; pc->columnLabelsLen = 0; + if (PyIter_Check(obj)) { PRINTMARK(); @@ -1194,9 +1223,32 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else + if (PyArray_IsScalar(obj, Datetime)) + { + PRINTMARK(); + if (((PyDatetimeScalarObject*) obj)->obval == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + + PRINTMARK(); + pc->PyTypeToJSON = NpyDateTimeToJSON; + if (enc->datetimeIso) + { + tc->type = JT_UTF8; + } + else + { + tc->type = JT_LONG; + } + return; + } + else if (PyInt_Check(obj)) { PRINTMARK(); + #ifdef _LP64 pc->PyTypeToJSON = PyIntToINT64; tc->type = JT_LONG; #else @@ -1205,6 +1257,14 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else + if (PyArray_IsScalar(obj, Bool)) + { + PRINTMARK(); + PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); + tc->type = (GET_TC(tc)->longValue) ? 
JT_TRUE : JT_FALSE; + return; + } + else if (PyArray_IsScalar(obj, Integer)) { PRINTMARK(); @@ -1266,24 +1326,27 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyArray_IsScalar(obj, Datetime)) - { - PRINTMARK(); - pc->PyTypeToJSON = NpyDateTimeToINT64; tc->type = JT_LONG; - return; - } - else - if (PyDateTime_Check(obj)) - { - PRINTMARK(); - pc->PyTypeToJSON = PyDateTimeToINT64; tc->type = JT_LONG; - return; - } - else - if (PyDate_Check(obj)) + if (PyDateTime_Check(obj) || PyDate_Check(obj)) { + if (PyObject_TypeCheck(obj, cls_nat)) + { + PRINTMARK(); + tc->type = JT_NULL; + return; + } + PRINTMARK(); - pc->PyTypeToJSON = PyDateToINT64; tc->type = JT_LONG; + pc->PyTypeToJSON = PyDateTimeToJSON; + if (enc->datetimeIso) + { + PRINTMARK(); + tc->type = JT_UTF8; + } + else + { + PRINTMARK(); + tc->type = JT_LONG; + } return; } else @@ -1348,7 +1411,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_index)) + if (PyObject_TypeCheck(obj, cls_index)) { if (enc->outputFormat == SPLIT) { @@ -1373,7 +1436,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_series)) + if (PyObject_TypeCheck(obj, cls_series)) { if (enc->outputFormat == SPLIT) { @@ -1392,7 +1455,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); tc->type = JT_OBJECT; pc->columnLabelsLen = PyArray_SIZE(obj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -1438,7 +1501,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } else - if (PyObject_TypeCheck(obj, (PyTypeObject*) cls_dataframe)) + if (PyObject_TypeCheck(obj, cls_dataframe)) { if (enc->outputFormat == SPLIT) { @@ -1482,7 +1545,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) PRINTMARK(); tc->type = JT_OBJECT; pc->rowLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); if (!pc->rowLabels) { goto INVALID; @@ -1507,7 +1570,7 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(obj, "index"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); @@ -1573,12 +1636,14 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { - Py_XDECREF(GET_TC(tc)->newObj); - NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); - NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); + PRINTMARK(); + 
Py_XDECREF(GET_TC(tc)->newObj); + NpyArr_freeLabels(GET_TC(tc)->rowLabels, GET_TC(tc)->rowLabelsLen); + NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); - PyObject_Free(tc->prv); - tc->prv = NULL; + PyObject_Free(GET_TC(tc)->cStr); + PyObject_Free(tc->prv); + tc->prv = NULL; } const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) @@ -1639,7 +1704,7 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) { - static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "encode_html_chars", "orient", NULL}; + static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "encode_html_chars", "orient", "date_unit", "iso_dates", NULL}; char buffer[65536]; char *ret; @@ -1649,6 +1714,8 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) int idoublePrecision = 10; // default double precision setting PyObject *oencodeHTMLChars = NULL; char *sOrient = NULL; + char *sdateFormat = NULL; + PyObject *oisoDates = 0; PyObjectEncoder pyEncoder = { @@ -1677,11 +1744,13 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) JSONObjectEncoder* encoder = (JSONObjectEncoder*) &pyEncoder; pyEncoder.npyCtxtPassthru = NULL; + pyEncoder.datetimeIso = 0; + pyEncoder.datetimeUnit = PANDAS_FR_ms; pyEncoder.outputFormat = COLUMNS; PRINTMARK(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOs", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssO", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, &oisoDates)) { return NULL; } @@ -1736,6 +1805,40 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) } } + if (sdateFormat != NULL) + { + if (strcmp(sdateFormat, "s") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_s; + } + else + if (strcmp(sdateFormat, "ms") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_ms; + } + else + if (strcmp(sdateFormat, "us") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_us; + } + else + if (strcmp(sdateFormat, "ns") == 0) + { + pyEncoder.datetimeUnit = PANDAS_FR_ns; + } + else + { + PyErr_Format (PyExc_ValueError, "Invalid value '%s' for option 'date_unit'", sdateFormat); + return NULL; + } + } + + if (oisoDates != NULL && PyObject_IsTrue(oisoDates)) + { + pyEncoder.datetimeIso = 1; + } + + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; PRINTMARK(); ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer)); @@ -1743,11 +1846,13 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) if (PyErr_Occurred()) { + PRINTMARK(); return NULL; } if (encoder->errorMsg) { + PRINTMARK(); if (ret != buffer) { encoder->free (ret); diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index bb3512e532b0e..698d9a81af387 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1276,7 +1276,7 @@ cdef inline _get_datetime64_nanos(object val): else: return ival -cdef inline int64_t cast_from_unit(object unit, object ts): +cdef inline int64_t cast_from_unit(object unit, object ts) except -1: """ return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ if unit == 'D': diff --git a/setup.py b/setup.py index 58513c9d4077d..956b9b13db2ce 100755 --- a/setup.py +++ b/setup.py @@ -475,7 +475,8 @@ def pxd(name): ext.sources[0] = root + suffix ujson_ext = Extension('pandas.json', - 
depends=['pandas/src/ujson/lib/ultrajson.h'], + depends=['pandas/src/ujson/lib/ultrajson.h', + 'pandas/src/numpy_helper.h'], sources=['pandas/src/ujson/python/ujson.c', 'pandas/src/ujson/python/objToJSON.c', 'pandas/src/ujson/python/JSONtoObj.c', @@ -531,7 +532,8 @@ def pxd(name): 'tests/data/*.xls', 'tests/data/*.xlsx', 'tests/data/*.table', - 'tests/data/*.html'], + 'tests/data/*.html', + 'tests/test_json/data/*.json'], 'pandas.tools': ['tests/*.csv'], 'pandas.tests': ['data/*.pickle', 'data/*.csv'],
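
For reviewers, a minimal usage sketch of the behaviour this patch enables (not part of the diff; it assumes a pandas build with the change applied, and the frame, column name, and timestamps are illustrative, not taken from the patch):

```python
import pandas as pd

# Sub-second timestamps make the chosen date_unit observable.
df = pd.DataFrame({'A': [1.0, 2.0]},
                  index=pd.to_datetime(['2013-01-01 20:43:42.123456789',
                                        '2013-01-02 20:43:42.123456789']))

# to_json now accepts date_unit; 'ms' stays the documented default, and
# date_format='iso' honours the same precision when formatting strings.
json_ns = df.to_json(date_format='epoch', date_unit='ns')
json_iso = df.to_json(date_format='iso', date_unit='us')

# read_json with date_unit=None (the default) detects the precision by
# magnitude, using the per-unit minimum stamps defined on Parser.
detected = pd.read_json(json_ns)

# Forcing a unit restricts parsing to that precision; an unknown unit
# raises ValueError via the Parser._STAMP_UNITS check above.
forced = pd.read_json(json_ns, date_unit='ns')
```

Encoding the same frame with `date_unit='ms'` would truncate everything below milliseconds at write time, which is the read-side failure mode the io.rst example above ("won't work") demonstrates.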