# train_model.py
# Fits a lifetimes BetaGeoFitter churn model on event data read from a Parquet file via DuckDB.
import argparse
import duckdb
import pandas
from lifetimes import BetaGeoFitter
# Shared in-memory DuckDB connection; create_features runs all of its queries through it.
con = duckdb.connect(database=':memory:', read_only=False)
# Lift pandas' display limits so that printed DataFrames show every row and column.
pandas.set_option('display.max_rows', None)
pandas.set_option('display.max_columns', None)
def parse_args():
    """Parse the command line and return the resulting namespace.

    Two optional flags are recognised:

    * ``-p`` / ``--parquet``: path to the Parquet file holding the time series.
    * ``-e`` / ``--event``: name of the event to model.

    Both default to ``None`` when omitted; the caller is responsible for
    checking that they were supplied.
    """
    cli = argparse.ArgumentParser(description='Build a Churn Model')
    option_specs = (
        (('-p', '--parquet'), "the path to the Parquet file with the time series data."),
        (('-e', '--event'), 'the event we are going to use.'),
    )
    for flags, help_text in option_specs:
        cli.add_argument(*flags, help=help_text)
    return cli.parse_args()
def _sql_literal(value):
    """Return *value* rendered as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a path or event name containing
    an apostrophe cannot terminate the literal early (and cannot inject SQL).
    """
    return "'" + str(value).replace("'", "''") + "'"


def create_features(db, event=None, model_path='churn_model.pkl'):
    """Build per-user frequency/recency/T features and fit a BetaGeoFitter.

    The Parquet file is scanned through the module-level DuckDB connection
    ``con``. For every user that triggered ``event`` the following are derived:

    * ``freq``    - number of distinct event timestamps minus one,
    * ``recency`` - time between the user's first and last event,
    * ``t``       - time between the user's first event and the latest
      timestamp found anywhere in the file.

    Users with ``freq`` of 0 are dropped, the model is fitted on the remaining
    rows, its summary is printed and the fitted model is saved to disk.

    Args:
        db: Path to the Parquet file; must expose ``user_id``, ``timestamp``
            and ``event`` columns (these are the columns the queries read).
        event: Event name to filter on. When ``None`` the value is read from
            the module-level ``args.event`` set by the ``__main__`` block,
            which is how the function originally obtained it.
        model_path: Destination file for the saved model.

    Returns:
        None. The results are the printed output and the saved model file.
    """
    if event is None:
        # Backward compatibility: fall back to the global populated in __main__.
        event = args.event

    # Quote the path once and reuse it; raw concatenation broke on apostrophes.
    source = "parquet_scan(" + _sql_literal(db) + ")"

    max_date = con.execute("select max(timestamp) from " + source).fetchall()[0][0]

    # "create or replace" lets the function run more than once on the shared
    # connection; a plain "create view" fails when the view already exists.
    initial_view = "create or replace view filtered as select distinct user_id, timestamp as date " \
                   "from " + source + " where event=" + _sql_literal(event)
    first_iteration = "create or replace view first_iteration as select user_id, count(date)-1 as freq, " \
                      "min(date) as first, max(date) as last from filtered " \
                      "group by user_id "
    features = "create or replace view features as select user_id, freq, last - first as recency, " \
               "DATE " + _sql_literal(max_date) + " - first as T from first_iteration"

    con.execute(initial_view)
    con.execute(first_iteration)
    con.execute(features)

    # HACK: the intervals arrive in the DataFrame with wrong values (probably a
    # DuckDB bug), so each interval is split into year/month/day parts in SQL
    # and converted into a day count below.
    con.execute(
        "select user_id, freq, "
        "extract('year' from recency) as recency_year, "
        "extract('month' from recency) as recency_month, "
        "extract('day' from recency) as recency_day, "
        "extract('year' from t) as t_year, "
        "extract('month' from t) as t_month, "
        "extract('day' from t) as t_day "
        "from features")
    db_result = con.df()

    # Convert the parts to days: 365 days per year and an average of
    # 30.436875 days per month, rounded to whole days.
    days_per_year = 365
    days_per_month = 30.436875
    db_result['t'] = round(
        db_result['t_year'] * days_per_year
        + db_result['t_month'] * days_per_month
        + db_result['t_day'], 0)
    db_result['recency'] = round(
        db_result['recency_year'] * days_per_year
        + db_result['recency_month'] * days_per_month
        + db_result['recency_day'], 0)
    db_result = db_result.drop(
        ['recency_year', 'recency_day', 'recency_month', 't_year', 't_month', 't_day'], axis=1)
    print(db_result)

    # Only users with at least one repeat event are used for fitting.
    filtered_freqs = db_result[db_result['freq'] > 0]

    model = BetaGeoFitter()
    model.fit(filtered_freqs['freq'], filtered_freqs['recency'], filtered_freqs['t'])
    print(model.summary)
    model.save_model(model_path)
if __name__ == '__main__':
    # Script entry point: validate the command line, then build the model.
    # `args` is deliberately bound at module level because create_features
    # may read args.event from it.
    args = parse_args()
    if args.parquet is None:
        print("Where are the Events dude?")
        # Exit with a non-zero status so callers can detect the failure;
        # quit() is an interactive helper and exited with status 0.
        raise SystemExit(1)
    if args.event is None:
        print("What event should I use dude?")
        raise SystemExit(1)
    create_features(args.parquet)