Skip to content

Commit

Permalink
PERF: Improve performance of CustomBusinessDay
Browse files Browse the repository at this point in the history
  • Loading branch information
bjonen committed Oct 4, 2014
1 parent fccd7fe commit 68f6268
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 52 deletions.
1 change: 1 addition & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,7 @@ Performance
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
- Performance benchmarking of groupby for large values of ngroups (:issue:`6787`)
- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonthEnd`` and ``CustomBusinessMonthBegin`` (:issue:`8236`)



Expand Down
117 changes: 74 additions & 43 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,12 @@ def _should_cache(self):
return self.isAnchored() and self._cacheable

def _params(self):
attrs = [(k, v) for k, v in compat.iteritems(vars(self))
if (k not in ['kwds', 'name', 'normalize',
'busdaycalendar']) and (k[0] != '_')]
attrs.extend(list(self.kwds.items()))
all_paras = dict(list(vars(self).items()) + list(self.kwds.items()))
if 'holidays' in all_paras and not all_paras['holidays']:
all_paras.pop('holidays')
exclude = ['kwds', 'name','normalize', 'calendar']
attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude ) and (k[0] != '_')]
attrs = sorted(set(attrs))

params = tuple([str(self.__class__)] + attrs)
return params

Expand Down Expand Up @@ -547,38 +547,57 @@ class CustomBusinessDay(BusinessDay):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : HolidayCalendar instance
instance of AbstractHolidayCalendar that provide the list of holidays
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'C'

def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')

if 'calendar' in kwds:
holidays = kwds['calendar'].holidays()
else:
holidays = kwds.get('holidays', [])
calendar, holidays = self.get_calendar(weekmask=weekmask,
holidays=holidays,
calendar=calendar)
# CustomBusinessDay instances are identified by the following
# two attributes (see DateOffset._params()):
# holidays, weekmask

self.kwds['weekmask'] = self.weekmask = weekmask
self.kwds['holidays'] = self.holidays = holidays
self.kwds['calendar'] = self.calendar = calendar

def get_calendar(self, weekmask, holidays, calendar):
'''Generate busdaycalendar'''
if isinstance(calendar, np.busdaycalendar):
if not holidays:
holidays = tuple(calendar.holidays)
elif not isinstance(holidays, tuple):
holidays = tuple(holidays)
else:
# trust that calendar.holidays and holidays are
# consistent
pass
return calendar, holidays

if holidays is None:
holidays = []
try:
holidays = holidays + calendar.holidays().tolist()
except AttributeError:
pass
holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in
holidays]
self.holidays = tuple(sorted(holidays))
self.kwds['holidays'] = self.holidays
holidays = tuple(sorted(holidays))

self._set_busdaycalendar()
kwargs = {'weekmask': weekmask}
if holidays:
kwargs['holidays'] = holidays

def _set_busdaycalendar(self):
if self.holidays:
kwargs = {'weekmask':self.weekmask,'holidays':self.holidays}
else:
kwargs = {'weekmask':self.weekmask}
try:
self.busdaycalendar = np.busdaycalendar(**kwargs)
busdaycalendar = np.busdaycalendar(**kwargs)
except:
# Check we have the required numpy version
from distutils.version import LooseVersion
Expand All @@ -589,17 +608,23 @@ def _set_busdaycalendar(self):
np.__version__)
else:
raise
return busdaycalendar, holidays

def __getstate__(self):
"""Return a pickleable state"""
state = self.__dict__.copy()
del state['busdaycalendar']
del state['calendar']
return state

def __setstate__(self, state):
"""Reconstruct an instance from a pickled state"""
self.__dict__ = state
self._set_busdaycalendar()
calendar, holidays = self.get_calendar(weekmask=self.weekmask,
holidays=self.holidays,
calendar=None)
self.kwds['calendar'] = self.calendar = calendar
self.kwds['holidays'] = self.holidays = holidays
self.kwds['weekmask'] = state['weekmask']

@apply_wraps
def apply(self, other):
Expand All @@ -613,7 +638,7 @@ def apply(self, other):
np_dt = np.datetime64(date_in.date())

np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll,
busdaycal=self.busdaycalendar)
busdaycal=self.calendar)

dt_date = np_incr_dt.astype(datetime)
result = datetime.combine(dt_date, date_in.time())
Expand All @@ -635,7 +660,6 @@ def _to_dt64(dt, dtype='datetime64'):
# > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
# numpy.datetime64('2013-05-01T02:00:00.000000+0200')
# Thus astype is needed to cast datetime to datetime64[D]

if getattr(dt, 'tzinfo', None) is not None:
i8 = tslib.pydt_to_i8(dt)
dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo)
Expand All @@ -649,7 +673,7 @@ def onOffset(self, dt):
if self.normalize and not _is_normalized(dt):
return False
day64 = self._to_dt64(dt,'datetime64[D]')
return np.is_busday(day64, busdaycal=self.busdaycalendar)
return np.is_busday(day64, busdaycal=self.calendar)


class MonthOffset(SingleConstructorOffset):
Expand Down Expand Up @@ -767,7 +791,6 @@ def onOffset(self, dt):
_prefix = 'BMS'



class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
"""
**EXPERIMENTAL** DateOffset of one custom business month
Expand All @@ -788,18 +811,22 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'CBM'
def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, **kwds)
self.m_offset = MonthEnd()
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
weekmask=weekmask, holidays=holidays,
calendar=calendar, **kwds)
self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds)
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar

@apply_wraps
def apply(self,other):
Expand All @@ -817,11 +844,11 @@ def apply(self,other):
n -= 1
elif other > cur_cmend and n <= -1:
n += 1
new = cur_mend + n * MonthEnd()

new = cur_mend + n * self.m_offset
result = self.cbday.rollback(new)
return result

class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
"""
**EXPERIMENTAL** DateOffset of one custom business month
Expand All @@ -842,18 +869,22 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'CBMS'
def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds)
self.m_offset = MonthBegin(normalize=normalize)
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
weekmask=weekmask, holidays=holidays,
calendar=calendar, **kwds)
self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds)
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar

@apply_wraps
def apply(self,other):
Expand All @@ -872,8 +903,8 @@ def apply(self,other):
n += 1
elif dt_in < cur_cmbegin and n >= 1:
n -= 1
new = cur_mbegin + n * MonthBegin()

new = cur_mbegin + n * self.m_offset
result = self.cbday.rollforward(new)
return result

Expand Down
Binary file added pandas/tseries/tests/data/cday-0.14.1.pickle
Binary file not shown.
44 changes: 40 additions & 4 deletions pandas/tseries/tests/test_offsets.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
from pandas.compat import range
Expand All @@ -22,6 +23,7 @@
from pandas.tseries.tools import parse_time_string
import pandas.tseries.offsets as offsets

from pandas.io.pickle import read_pickle
from pandas.tslib import NaT, Timestamp
import pandas.tslib as tslib
from pandas.util.testing import assertRaisesRegexp
Expand Down Expand Up @@ -848,6 +850,24 @@ def test_calendar(self):
dt = datetime(2014, 1, 17)
assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21))

def test_roundtrip_pickle(self):
    """Check that the fixture offsets survive a pickle round trip.

    Each offset is passed through ``self.round_trip_pickle`` and the
    result must compare equal to the object that went in.
    """
    for original in (self.offset, self.offset2, self.offset * 2):
        restored = self.round_trip_pickle(original)
        self.assertEqual(restored, original)

def test_pickle_compat_0_14_1(self):
    """Compare a freshly built offset with the stored 0.14.1 pickle.

    Loads ``cday-0.14.1.pickle`` from the test data directory and asserts
    that it equals a ``CDay`` built with four copies of 2013-01-01 as
    holidays.
    """
    repeated_holidays = [datetime(2013, 1, 1) for _ in range(4)]

    data_dir = tm.get_data_path()
    pickle_path = os.path.join(data_dir, 'cday-0.14.1.pickle')
    legacy_cday = read_pickle(pickle_path)

    current_cday = CDay(holidays=repeated_holidays)
    self.assertEqual(current_cday, legacy_cday)


class CustomBusinessMonthBase(object):
_multiprocess_can_split_ = True

Expand Down Expand Up @@ -894,6 +914,15 @@ def test_offsets_compare_equal(self):
offset2 = self._object()
self.assertFalse(offset1 != offset2)

def test_roundtrip_pickle(self):
    """Check that offsets built from ``self._object`` survive pickling.

    Covers the default offset, one constructed with ``2``, and the
    default offset multiplied by 2; each must compare equal to itself
    after going through ``self.round_trip_pickle``.
    """
    candidates = (self._object(), self._object(2), self._object() * 2)
    for original in candidates:
        restored = self.round_trip_pickle(original)
        self.assertEqual(restored, original)


class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base):
_object = CBMonthEnd

Expand Down Expand Up @@ -1006,8 +1035,12 @@ def test_holidays(self):

def test_datetimeindex(self):
from pandas.tseries.holiday import USFederalHolidayCalendar
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,31))
hcal = USFederalHolidayCalendar()
freq = CBMonthEnd(calendar=hcal)

self.assertEqual(DatetimeIndex(start='20120101',end='20130101',
freq=freq).tolist()[0],
datetime(2012,1,31))

class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base):
_object = CBMonthBegin
Expand Down Expand Up @@ -1120,8 +1153,11 @@ def test_holidays(self):
self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3))

def test_datetimeindex(self):
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,3))
hcal = USFederalHolidayCalendar()
cbmb = CBMonthBegin(calendar=hcal)
self.assertEqual(DatetimeIndex(start='20120101', end='20130101',
freq=cbmb).tolist()[0],
datetime(2012,1,3))


def assertOnOffset(offset, date, expected):
Expand Down
35 changes: 30 additions & 5 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,20 @@ def date_range(start=None, end=None, periods=None, freq=None):
setup = common_setup + """
import datetime as dt
import pandas as pd
import pandas.tseries.holiday
import numpy as np
date = dt.datetime(2011,1,1)
dt64 = np.datetime64('2011-01-01 09:00Z')
hcal = pd.tseries.holiday.USFederalHolidayCalendar()
day = pd.offsets.Day()
year = pd.offsets.YearBegin()
cday = pd.offsets.CustomBusinessDay()
cme = pd.offsets.CustomBusinessMonthEnd()
cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal)
cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal)
cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)
"""
timeseries_day_incr = Benchmark("date + day",setup)

Expand All @@ -306,15 +311,26 @@ def date_range(start=None, end=None, periods=None, freq=None):
timeseries_custom_bday_incr = \
Benchmark("date + cday",setup)

timeseries_custom_bday_decr = \
Benchmark("date - cday",setup)

timeseries_custom_bday_apply = \
Benchmark("cday.apply(date)",setup)

timeseries_custom_bday_apply_dt64 = \
Benchmark("cday.apply(dt64)",setup)

# Increment by n
timeseries_custom_bday_incr_n = \
Benchmark("date + 10 * cday",setup)
timeseries_custom_bday_cal_incr = \
Benchmark("date + 1 * cdayh",setup)

timeseries_custom_bday_cal_decr = \
Benchmark("date - 1 * cdayh",setup)

timeseries_custom_bday_cal_incr_n = \
Benchmark("date + 10 * cdayh",setup)

timeseries_custom_bday_cal_incr_neg_n = \
Benchmark("date - 10 * cdayh",setup)

# Increment custom business month
timeseries_custom_bmonthend_incr = \
Expand All @@ -323,6 +339,16 @@ def date_range(start=None, end=None, periods=None, freq=None):
timeseries_custom_bmonthend_incr_n = \
Benchmark("date + 10 * cme",setup)

timeseries_custom_bmonthend_decr_n = \
Benchmark("date - 10 * cme",setup)

timeseries_custom_bmonthbegin_incr_n = \
Benchmark("date + 10 * cmb",setup)

timeseries_custom_bmonthbegin_decr_n = \
Benchmark("date - 10 * cmb",setup)


#----------------------------------------------------------------------
# month/quarter/year start/end accessors

Expand Down Expand Up @@ -357,4 +383,3 @@ def iter_n(iterable, n=None):
timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup)

timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup)

0 comments on commit 68f6268

Please sign in to comment.