-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRFM.R
162 lines (135 loc) · 5.02 KB
/
RFM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
setwd("/Users/jaoming/Downloads")
library(tidyverse) # already includes tidyr, readr, stringr
library(readxl)
data = read_xlsx("EXAMPLE_BEVERAGE_DATA_THAILAND.xlsx")
# number of unique values in each column
data %>%
filter(channel == "Hypermarkets") %>%
summarise_all(n_distinct)
# number of unique storenames
data %>%
filter(channel == "Hypermarkets") %>%
select(STORENAME) %>%
distinct()
# cleaning of data
clean_data = data %>%
select(HOUSEID,
date1,
date2,
Date3,
weekday,
channel,
category,
STORENAME,
spend,
DEMOG_REGION,
BRAND_E1,
packdesc,
PACKAGE)
# data with the demograghics
demo_data = data %>%
select(HOUSEID,
date1,
date2,
Date3,
weekday,
channel,
DEMOG_HHINCOME:DEMOG_WORKINGSTATUS)
clean_data$spend = as.double(clean_data$spend)
cleaner_data = clean_data %>%
mutate(Date_1 = as.Date(sapply(str_split(clean_data$date1, pattern = " w/e "), FUN = function(x) {x[2]})),
Date_2 = as.Date(sapply(str_split(clean_data$date2, pattern = " w/e "), FUN = function(x) {x[2]})),
Date_3 = as.Date(Date3)) %>%
select(HOUSEID, Date_1, Date_2, Date_3, spend, weekday, DEMOG_REGION, BRAND_E1, channel, category)
cleaner_data$weekday = factor(cleaner_data$weekday,
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
ordered = TRUE)
cleaner_data = cleaner_data %>%
mutate(realdate = Date_1 + as.integer(weekday) - 7) %>% # getting the real date
select(HOUSEID, realdate, spend, DEMOG_REGION, BRAND_E1, channel, category)
# RFM (Recency Frequency Monetary)
# Finding out how many transactions were made per House ID
total_visits = cleaner_data %>%
group_by(HOUSEID) %>%
summarise(TotalVisits = n()) %>%
arrange(desc(TotalVisits)) # this will show the top 10 House ID's that purchase this product the most
ggplot(total_visits, aes(TotalVisits)) +
geom_histogram(binwidth = 100)
# Finding out how much each House ID spends
total_spending = cleaner_data %>%
group_by(HOUSEID) %>%
summarise(TotalSpending = sum(spend)) %>%
arrange(desc(TotalSpending))
# plot
ggplot(total_spending, aes(TotalSpending)) +
geom_histogram(binwidth = 2500)
# Finding out how long it has been since the family bought the product (taking reference from the start of 2019)
ReferenceDate = as.Date("2019/01/01") # since this date is far later than the ones shown in the file
last_visit = cleaner_data %>%
mutate(recency = abs(realdate - ReferenceDate)) %>%
group_by(HOUSEID) %>%
summarise(Recency = min(recency)) %>%
arrange(Recency)
# plot
ggplot(last_visit, aes(Recency)) +
geom_histogram(binwidth = 50)
## Ranking for RFM
number_of_houses = length(unique(cleaner_data$HOUSEID))
last_visit = last_visit %>% arrange(Recency)
total_spending = total_spending %>% arrange(desc(TotalSpending))
total_visits = total_visits %>% arrange(desc(TotalVisits))
# map qunatiles splits all the values into 5 groups based on the order it was arranged in
map_quantiles <- function(vect, num_groups=5) {
ranks <- order(vect)
result <- rep(0,length(vect))
one_unit <- floor(length(vect) / num_groups)
for (index in 1:num_groups) {
if (index == num_groups) {
result[(index - 1) * one_unit < ranks] <- index
} else {
result[(index - 1) * one_unit < ranks & ranks <= index * one_unit] <- index
}
}
return(result)
}
total_visits$rfm2 = map_quantiles(total_visits$TotalVisits)
total_spending$rfm3 = map_quantiles(total_spending$TotalSpending)
last_visit$rfm1 = 6 - map_quantiles(last_visit$Recency) # because lower the value means more recent
# creating the rfm dataframe
final_rfm = as.data.frame(unique(cleaner_data$HOUSEID))
colnames(final_rfm) = "HOUSEID"
final_rfm = final_rfm %>%
left_join(last_visit, by = "HOUSEID") %>%
left_join(total_visits, by = "HOUSEID") %>%
left_join(total_spending, by = "HOUSEID") %>%
mutate(rfmscore = (rfm1*100)+(rfm2*10)+(rfm3))
# adding the area the family shops at into the dataframe
data %>% group_by(HOUSEID, DEMOG_AREA) %>% summarise(n = n())
# shows that each HOUSEID only goes to one DEMOG_AREA
house_to_area = data %>%
group_by(HOUSEID, DEMOG_AREA) %>%
summarise(n = n()) %>%
select(HOUSEID, DEMOG_AREA)
demog_rfm = final_rfm %>% left_join(house_to_area, by = "HOUSEID")
# Others
# Finding out the spread of sources (where do people usually go to purchase this product)
channel_spread = data %>%
group_by(channel) %>%
summarise(n = n()) %>%
arrange(desc(n))
ggplot(channel_spread, aes(x = reorder(channel, n), y = n)) +
geom_col() +
labs(x = 'Channel',
y = 'Count') +
coord_flip()
# Finding out which store people go to the most
store_spread = data %>%
group_by(STORENAME) %>%
summarise(n = n()) %>%
arrange(desc(n))
# show the top 10
ggplot(store_spread[1:10,], aes(x = reorder(STORENAME, n), y = n)) +
geom_col() +
labs(x = 'Store',
y = 'Count') +
coord_flip()