Skip to content

Commit

Permalink
BUG: fix GH8989 to parse nanoseconds with %f format
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Dec 5, 2014
1 parent ea2489d commit d6e4337
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 13 deletions.
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,14 +78,15 @@ Enhancements
- Added ``gbq.generate_bq_schema`` function to the gbq module (:issue:`8325`).
- ``Series`` now works with map objects the same way as generators (:issue:`8909`).
- Added context manager to ``HDFStore`` for automatic closing (:issue:`8791`).
- ``to_datetime`` gains an ``exact`` keyword to allow for a format to not require an exact match for a provided format string (if it's ``False``). ``exact`` defaults to ``True`` (meaning that exact matching is still the default) (:issue:`8904`)

.. _whatsnew_0152.performance:

Performance
~~~~~~~~~~~
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)

- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and the new ``exact=False`` for kw with non-exact matching (:issue:`8904`)
- Performance boost for ``to_datetime`` conversions with a passed ``format=`` keyword, including non-exact matching with ``exact=False`` (:issue:`8904`)

.. _whatsnew_0152.experimental:

Expand Down Expand Up @@ -143,6 +144,7 @@ Bug Fixes

- Report a ``TypeError`` when invalid/no parameters are passed in a groupby (:issue:`8015`)
- Regression in DatetimeIndex iteration with a Fixed/Local offset timezone (:issue:`8890`)
- Bug in ``to_datetime`` when parsing nanoseconds using the ``%f`` format (:issue:`8989`)
- Bug in packaging pandas with ``py2app/cx_Freeze`` (:issue:`8602`, :issue:`8831`)
- Bug in ``groupby`` signatures that didn't include \*args or \*\*kwargs (:issue:`8733`).
- ``io.data.Options`` now raises ``RemoteDataError`` when no expiry dates are available from Yahoo and when it receives no data from Yahoo (:issue:`8761`), (:issue:`8783`).
Expand Down
18 changes: 17 additions & 1 deletion pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4125,14 +4125,30 @@ def test_to_datetime_format_time(self):

def test_to_datetime_with_non_exact(self):
    """Check that ``to_datetime(..., exact=False)`` matches the format
    anywhere inside each input string.

    The expected values are built by extracting the date-like token from
    each string with a regex and parsing that token with the same format
    under the default (exact) matching.
    """
    # GH 8904: the ``exact`` keyword
    if sys.version_info < (2, 7):
        raise nose.SkipTest('on python version < 2.7')

    # One sample of each shape is enough for a correctness check; the
    # previous ``* 10000`` Series was overwritten before ever being used.
    s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
                '19MAY11 00:00:00Z'])
    result = to_datetime(s, format='%d%b%y', exact=False)

    # Raw string so the regex escapes are not treated as string escapes;
    # the pattern itself is unchanged.
    expected = to_datetime(s.str.extract(r'(\d+\w+\d+)'), format='%d%b%y')
    assert_series_equal(result, expected)

def test_parse_nanoseconds_with_formula(self):

# GH8989
# trunctaing the nanoseconds when a format was provided
for v in ["2012-01-01 09:00:00.000000001",
"2012-01-01 09:00:00.000001",
"2012-01-01 09:00:00.001",
"2012-01-01 09:00:00.001000",
"2012-01-01 09:00:00.001000000",
]:
expected = pd.to_datetime(v)
result = pd.to_datetime(v, format="%Y-%m-%d %H:%M:%S.%f")
self.assertEqual(result,expected)

def test_to_datetime_format_weeks(self):
data = [
['2009324', '%Y%W%w', Timestamp('2009-08-13')],
Expand Down
7 changes: 4 additions & 3 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,10 +194,11 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
box : boolean, default True
If True returns a DatetimeIndex, if False returns ndarray of values
format : string, default None
strftime to parse time, eg "%d/%m/%Y"
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
all the way up to nanoseconds
exact : boolean, True by default
if True, require an exact format match
if False, search for a matching format non-exclusive to the endpoints
If True, require an exact format match.
If False, allow the format to match anywhere in the target string.
coerce : force errors to NaT (False by default)
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
(e.g. a unix timestamp), which is an integer/float number
Expand Down
18 changes: 11 additions & 7 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2137,8 +2137,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
Py_ssize_t i, n = len(values)
pandas_datetimestruct dts
ndarray[int64_t] iresult
int year, month, day, minute, hour, second, fraction, weekday, julian, tz
int year, month, day, minute, hour, second, weekday, julian, tz
int week_of_year, week_of_year_start
int64_t us, ns
object val, group_key, ampm, found
dict found_key

Expand Down Expand Up @@ -2237,7 +2238,7 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe

year = 1900
month = day = 1
hour = minute = second = fraction = 0
hour = minute = second = ns = us = 0
tz = -1
# Default to -1 to signify that values not known; not critical to have,
# though
Expand Down Expand Up @@ -2302,9 +2303,11 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
second = int(found_dict['S'])
elif parse_code == 10:
s = found_dict['f']
# Pad to always return microseconds.
s += "0" * (6 - len(s))
fraction = int(s)
# Pad to always return nanoseconds
s += "0" * (9 - len(s))
us = long(s)
ns = us % 1000
us = us / 1000
elif parse_code == 11:
weekday = locale_time.f_weekday.index(found_dict['A'].lower())
elif parse_code == 12:
Expand Down Expand Up @@ -2369,7 +2372,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coe
dts.hour = hour
dts.min = minute
dts.sec = second
dts.us = fraction
dts.us = us
dts.ps = ns * 1000

iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
try:
Expand Down Expand Up @@ -4311,7 +4315,7 @@ class TimeRE(dict):
base.__init__({
# The " \d" part of the regex is to make %c from ANSI C work
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
'f': r"(?P<f>[0-9]{1,9})",
'H': r"(?P<H>2[0-3]|[0-1]\d|\d)",
'I': r"(?P<I>1[0-2]|0[1-9]|[1-9])",
'j': r"(?P<j>36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])",
Expand Down
2 changes: 1 addition & 1 deletion vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def date_range(start=None, end=None, periods=None, freq=None):
setup = common_setup + """
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
"""
timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
timeseries_with_format_no_exact = Benchmark("to_datetime(s,format='%d%b%y',exact=False)", \
setup, start_date=datetime(2014, 11, 26))
timeseries_with_format_replace = Benchmark("to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')", \
setup, start_date=datetime(2014, 11, 26))
Expand Down

0 comments on commit d6e4337

Please sign in to comment.