PERF: add exact kw to to_datetime to enable faster regex format parsing for datetimes (GH8904)
blbradley · Dec 5, 2014 · ea2489d · ea2489d
1 parent 526f33c
commit ea2489d
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 19 deletions.
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -85,6 +85,8 @@ Performance
 ~~~~~~~~~~~
 - Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
 
+- Performance boost for ``to_datetime`` conversions with a passed ``format=`` kw, and a new ``exact=False`` kw for non-exact matching (:issue:`8904`)
+
 .. _whatsnew_0152.experimental:
 
 Experimental

diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
@@ -4123,6 +4123,16 @@ def test_to_datetime_format_time(self):
         for s, format, dt in data:
             self.assertEqual(to_datetime(s, format=format), dt)
 
+    def test_to_datetime_with_non_exact(self):
+        # exact=False must find the date anywhere inside each string
+        if sys.version_info < (2, 7):
+            raise nose.SkipTest('on python version < 2.7')
+
+        s = Series(['19MAY11','foobar19MAY11','19MAY11:00:00:00','19MAY11 00:00:00Z']*10000)
+        result = to_datetime(s,format='%d%b%y',exact=False)
+        expected = to_datetime(s.str.extract(r'(\d+\w+\d+)'),format='%d%b%y')
+        assert_series_equal(result, expected)
+
     def test_to_datetime_format_weeks(self):
         data = [
                 ['2009324', '%Y%W%w', Timestamp('2009-08-13')],

diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py
@@ -174,7 +174,7 @@ def _guess_datetime_format_for_array(arr, **kwargs):
         return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
 
 def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
-                format=None, coerce=False, unit='ns',
+                format=None, exact=True, coerce=False, unit='ns',
                 infer_datetime_format=False):
     """
     Convert argument to datetime.
@@ -195,6 +195,9 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
         If True returns a DatetimeIndex, if False returns ndarray of values
     format : string, default None
         strftime to parse time, eg "%d/%m/%Y"
+    exact : boolean, True by default
+        If True, require an exact format match.
+        If False, allow the format to match anywhere in the target string.
     coerce : force errors to NaT (False by default)
     unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
         (e.g. a unix timestamp), which is an integer/float number
@@ -273,7 +276,7 @@ def _convert_listlike(arg, box, format):
                 if result is None:
                     try:
                         result = tslib.array_strptime(
-                            arg, format, coerce=coerce
+                            arg, format, exact=exact, coerce=coerce
                         )
                     except (tslib.OutOfBoundsDatetime):
                         if errors == 'raise':

diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
@@ -2123,13 +2123,24 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce):
         raise ValueError("Invalid type for timedelta scalar: %s" % type(ts))
     return ts.astype('timedelta64[ns]')
 
-def array_strptime(ndarray[object] values, object fmt, coerce=False):
+def array_strptime(ndarray[object] values, object fmt, bint exact=True, bint coerce=False):
+    """
+    Parameters
+    ----------
+    values : ndarray of string-like objects
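+    fmt : string-like format specification (strptime-style directives)
+    exact : if True, fmt must match the entire string; if False, search for it anywhere in the string
+    coerce : if True, values that do not match are set to NaT instead of raising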
+    """
+
     cdef:
         Py_ssize_t i, n = len(values)
         pandas_datetimestruct dts
         ndarray[int64_t] iresult
-        int year, month, day, minute, hour, second, fraction, weekday, julian
-        object val
+        int year, month, day, minute, hour, second, fraction, weekday, julian, tz
+        int week_of_year, week_of_year_start
+        object val, group_key, ampm, found
+        dict found_key
 
     global _TimeRE_cache, _regex_cache
     with _cache_lock:
@@ -2198,19 +2209,32 @@ def array_strptime(ndarray[object] values, object fmt, coerce=False):
             else:
                 val = str(val)
 
-        found = format_regex.match(val)
-        if not found:
-            if coerce:
-                iresult[i] = iNaT
-                continue
-            raise ValueError("time data %r does not match format %r" %
-                             (values[i], fmt))
-        if len(val) != found.end():
-            if coerce:
-                iresult[i] = iNaT
-                continue
-            raise ValueError("unconverted data remains: %s" %
-                              values[i][found.end():])
+        # exact matching
+        if exact:
+            found = format_regex.match(val)
+            if not found:
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("time data %r does not match format %r (match)" %
+                                 (values[i], fmt))
+            if len(val) != found.end():
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("unconverted data remains: %s" %
+                                  values[i][found.end():])
+
+        # search
+        else:
+            found = format_regex.search(val)
+            if not found:
+                if coerce:
+                    iresult[i] = iNaT
+                    continue
+                raise ValueError("time data %r does not match format %r (search)" %
+                                 (values[i], fmt))
+
         year = 1900
         month = day = 1
         hour = minute = second = fraction = 0
@@ -4368,10 +4392,14 @@ _TimeRE_cache = TimeRE()
 _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache
 _regex_cache = {}
 
-def _calc_julian_from_U_or_W(year, week_of_year, day_of_week, week_starts_Mon):
+cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon):
     """Calculate the Julian day based on the year, week of the year, and day of
     the week, with week_start_day representing whether the week of the year
     assumes the week starts on Sunday or Monday (6 or 0)."""
+
+    cdef:
+        int first_weekday,  week_0_length, days_to_week
+
     first_weekday = datetime_date(year, 1, 1).weekday()
     # If we are dealing with the %U directive (week starts on Sunday), it's
     # easier to just shift the view to Sunday being the first day of the

diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py
@@ -156,6 +156,14 @@ def date_range(start=None, end=None, periods=None, freq=None):
     Benchmark('to_datetime(strings,format="%Y%m%d")', setup,
               start_date=datetime(2012, 7, 1))
 
+setup = common_setup + """
+s = Series(['19MAY11','19MAY11:00:00:00']*100000)
+"""
+timeseries_with_format = Benchmark("to_datetime(s,format='%d%b%y',exact=False)",
+     setup, start_date=datetime(2014, 11, 26))  # searches for the date inside each string
+timeseries_with_format_replace = Benchmark(r"to_datetime(s.str.replace(r':\S+$',''),format='%d%b%y')",
+     setup, start_date=datetime(2014, 11, 26))  # strips the ':...' suffix first, then parses exactly
+
 # ---- infer_freq
 # infer_freq