Skip to content

Commit

Permalink
PERF: add exact kw to to_datetime to enable faster regex format parsi…
Browse files Browse the repository at this point in the history
…ng for datetimes (GH8904)
  • Loading branch information
jreback committed Dec 5, 2014
1 parent 526f33c commit ea2489d
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 19 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.15.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ Performance
~~~~~~~~~~~
- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)

- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and a new ``exact=False`` kw for non-exact matching (:issue:`8904`)

.. _whatsnew_0152.experimental:

Experimental
Expand Down
10 changes: 10 additions & 0 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -4123,6 +4123,16 @@ def test_to_datetime_format_time(self):
for s, format, dt in data:
self.assertEqual(to_datetime(s, format=format), dt)

def test_to_datetime_with_non_exact(self):
    """Check ``to_datetime(..., exact=False)`` against an explicit extract.

    The input mixes a bare ``%d%b%y`` date with values that carry extra
    text before or after the date.  Parsing with ``exact=False`` must give
    the same result as first pulling the date token out with a regex and
    then parsing that token with the same format.
    """
    if sys.version_info < (2, 7):
        raise nose.SkipTest('on python version < 2.7')

    s = Series(['19MAY11', 'foobar19MAY11', '19MAY11:00:00:00',
                '19MAY11 00:00:00Z'] * 10000)
    result = to_datetime(s, format='%d%b%y', exact=False)

    # Raw string so that \d and \w reach the regex engine untouched; in a
    # plain literal they are invalid escape sequences.
    expected = to_datetime(s.str.extract(r'(\d+\w+\d+)'), format='%d%b%y')
    assert_series_equal(result, expected)

def test_to_datetime_format_weeks(self):
data = [
['2009324', '%Y%W%w', Timestamp('2009-08-13')],
Expand Down
7 changes: 5 additions & 2 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)

def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
format=None, coerce=False, unit='ns',
format=None, exact=True, coerce=False, unit='ns',
infer_datetime_format=False):
"""
Convert argument to datetime.
Expand All @@ -195,6 +195,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
If True returns a DatetimeIndex, if False returns ndarray of values
format : string, default None
strftime to parse time, eg "%d/%m/%Y"
exact : boolean, True by default
If True, require an exact format match.
If False, allow the format to match anywhere in the target string.
coerce : force errors to NaT (False by default)
unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
(e.g. a unix timestamp), which is an integer/float number
Expand Down Expand Up @@ -273,7 +276,7 @@ def _convert_listlike(arg, box, format):
if result is None:
try:
result = tslib.array_strptime(
arg, format, coerce=coerce
arg, format, exact=exact, coerce=coerce
)
except (tslib.OutOfBoundsDatetime):
if errors == 'raise':
Expand Down
62 changes: 45 additions & 17 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2123,13 +2123,24 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
return ts.astype('timedelta64[ns]')

def array_strptime(ndarray[object] values, object fmt, coerce=False):
def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
"""
Parameters
----------
values : ndarray of string-like objects
fmt : string-like regex
exact : matches must be exact if True, search if False
coerce : if invalid values found, coerce to NaT
"""

cdef:
Py_ssize_t i, n = len(values)
pandas_datetimestruct dts
ndarray[int64_t] iresult
int year, month, day, minute, hour, second, fraction, weekday, julian
object val
int year, month, day, minute, hour, second, fraction, weekday, julian, tz
int week_of_year, week_of_year_start
object val, group_key, ampm, found
dict found_key

global _TimeRE_cache, _regex_cache
with _cache_lock:
Expand Down Expand Up @@ -2198,19 +2209,32 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
else:
val = str(val)

found = format_regex.match(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r" %
(values[i], fmt))
if len(val) != found.end():
if coerce:
iresult[i] = iNaT
continue
raise ValueError("unconverted data remains: %s" %
values[i][found.end():])
# exact matching
if exact:
found = format_regex.match(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r (match)" %
(values[i], fmt))
if len(val) != found.end():
if coerce:
iresult[i] = iNaT
continue
raise ValueError("unconverted data remains: %s" %
values[i][found.end():])

# search
else:
found = format_regex.search(val)
if not found:
if coerce:
iresult[i] = iNaT
continue
raise ValueError("time data %r does not match format %r (search)" %
(values[i], fmt))

year = 1900
month = day = 1
hour = minute = second = fraction = 0
Expand Down Expand Up @@ -4368,10 +4392,14 @@ _TimeRE_cache = TimeRE()
_CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
_regex_cache = {}

def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
"""Calculate the Julian day based on the year, week of the year, and day of
the week, with week_start_day representing whether the week of the year
assumes the week starts on Sunday or Monday (6 or 0)."""

cdef:
int first_weekday, week_0_length, days_to_week

first_weekday = datetime_date(year, 1, 1).weekday()
# If we are dealing with the %U directive (week starts on Sunday), it's
# easier to just shift the view to Sunday being the first day of the
Expand Down
8 changes: 8 additions & 0 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
start_date=datetime(2012, 7, 1))

# Benchmarks for format parsing of strings that may carry trailing text after
# the date: one parses directly with exact=False, the other strips the
# trailing part with a regex replace and then parses with the same format.
setup = common_setup + """
s = Series(['19MAY11','19MAY11:00:00:00']*100000)
"""
timeseries_with_format = Benchmark(
    "to_datetime(s,format='%d%b%y',exact=False)",
    setup, start_date=datetime(2014, 11, 26))
# Raw literal: \S is an invalid escape sequence in a plain string.  The
# resulting statement text is unchanged.
timeseries_with_format_replace = Benchmark(
    r"to_datetime(s.str.replace(':\S+$',''),format='%d%b%y')",
    setup, start_date=datetime(2014, 11, 26))

# ---- infer_freq
# infer_freq

Expand Down

0 comments on commit ea2489d

Please sign in to comment.