-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathcbc-data-handling.R
348 lines (324 loc) · 13.9 KB
/
cbc-data-handling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
# Copyright 2019 Google LLC.
# SPDX-License-Identifier: Apache-2.0
# CBC data handling and creation routines (other than import-export!)
# FUNCTIONS:
# generateMNLrandomTab() : Create a design matrix from a list of attributes
# pickMNLwinningCards() : Given utilities, choose the winning tasks from design matrix
# generateRNDpws() : Create random utilities for simulated respondents
# fillCBCwinners() :
# expandCBCwinners() :
#############################################
# generateMNLrandomTab()
#############################################
# function(attrLevels, cards=3, respondents=200, trials=12,
# balanced.sample=TRUE, best.of=100, verbose=TRUE, no.output=FALSE)
#
# Create a "TAB" style CBC design
#
# WARNING: for didactic & simulation rather than survey purposes. Designs are
# slightly optimized for level balance, but not for other concerns such as
# D-efficiency and optimal balanced overlap.
#
# PARAMETERS
# attrLevels = list of how many levels to create for each attribute
# (length = # of attributes; each entry = # of levels)
# cards = concepts shown per CBC task
# respondents = how many sets of trials to generate
# trials = how many trials to generate for each "respondent"
# balanced.sample = whether to minimize reuse of concept levels
# TRUE = No overlap within trials (if possible). Do not resample
# a level within attribute in a trial
# FALSE = Just generate cards randomly with level resampling
# according to uniform distribution
# best.of = how many designs to evaluate; function will return the single
# best-balanced design among them
# verbose = show output as it goes
# no.output = whether to run silently
#
# NOTE:
# in v0.2, all respondents must have the same number of cards and trials
# see "estimateMNLfromDesign()" for more details
#
# SAMPLE CODE
# a design with 10 attributes of 3-7 levels
# tmp.attrs <- c(4,3,4,5,5,3,4,5,2,7)
# create random CBC cards for that design
# tmp.tab <- generateMNLrandomTab(tmp.attrs, respondents=200, trials=8)
# convert those to a design-coded dummy matrix
# tmp.des <- convertSSItoDesign(tmp.tab)
#
#' Simple experimental design for choice-based conjoint analysis
#'
#' @description
#' Create an experimental design for an exactly rectangular version of
#' choice-based conjoint analysis (same number of concepts and trials
#' for every respondent).
#' This function is recommended primarily for simple surveys and
#' didactic purposes; it does not do advanced optimization.
#' It will attempt to find level balance through iterative selection across
#' multiple randomized designs.
#'
#' @param attrLevels A vector of integers, where each integer is the number of
#' levels for its respective attribute. For example, if a product has 3
#' brands, 4 performance levels, and 3 price points, this would be
#' \code{c(3, 4, 3)}.
#' @param cards The number of concepts to show at one time.
#' @param respondents The number of designs to generate.
#' @param trials How many tasks will be given to each respondent
#' @param balanced.sample Whether to attempt to do level balancing across
#' attributes, i.e., to have fewer duplications of any level on a given
#' task.
#' @param best.of How many iterations to consider before selecting the
#' optimized one.
#' @param verbose Outputs more information as it works.
#' @param no.output Whether to suppress all output.
#'
#' @returns A data frame for the experimental design, with a row for
#' each respondent * trial * concept, with columns for the attribute levels.
#'
#' @seealso [generateRNDpws] to create a set of random utilities for
#' a specific attribute list, [pickMNLwinningCards] to "answer" the survey
#' design according to utilities.
#'
generateMNLrandomTab <- function(attrLevels, cards=3, respondents=200,
trials=12, balanced.sample=TRUE, best.of=50,
verbose=TRUE, no.output=FALSE)
{
cat("Searching for a balanced design ...\n")
# create a matrix to hold the result
rnd.tab <- matrix(0,ncol=length(attrLevels),nrow=cards*respondents*trials)
# and figure the starting point for a "best" design
att.base <- 1/rep(attrLevels,attrLevels) # ideal balance proportions
hold.mss <- sum(att.base^2)
# and one to hold trials along the way
rnd.trial <- matrix(0,ncol=length(attrLevels),nrow=cards*respondents*trials)
for (i in 1:best.of)
{
# generate a candidate trial
# try to balance the attribute levels ?
if (balanced.sample) {
# figure out which attr levels can be balanced and which can't
which.balanced <- which(attrLevels >= cards)
which.replace <- which(attrLevels < cards)
# generate the columns that can be balanced
# (sample without replacement from attribute levels)
if (any(which.balanced)) {
# generate balanced sets for every combination where possible
for (ii in 1:(respondents*trials))
{
samp.row <- sapply(attrLevels[which.balanced],
sample, cards, replace=FALSE)
rnd.trial[((ii-1)*cards+1):(ii*cards), which.balanced] <- samp.row
}
}
# and now the columns that can't balance (sample with replacement)
if (any(which.replace)) {
samp.replace <- sapply(attrLevels[which.replace],
sample, cards*trials*respondents,replace=TRUE)
rnd.trial[,which.replace] <- samp.replace
}
} else {
# just generate them all randomly
rnd.trial <- sapply(attrLevels,
sample, cards*trials*respondents, replace=TRUE)
}
# is the candidate better than the previous candidate ?
# convert it to a design matrix
new.des <- convertSSItoDesign(as.data.frame(rnd.trial), no.output=TRUE)
# calculate how much it differs from the ideal and compare to held trial
new.mss <- sum((colMeans(new.des)-att.base)^2)
if (new.mss < hold.mss) {
rnd.tab <- rnd.trial
hold.mss <- new.mss
if (verbose & (i>1)) {
cat("Improved design found on trial: ", i ," SSE = " , new.mss , "\n")
}
}
}
rnd.tab <- data.frame(rnd.tab)
if (!is.null(names(attrLevels))) {
names(rnd.tab) <- names(attrLevels)
}
return(rnd.tab)
}
#############################################
# pickMNLwinningCards
#############################################
# function(design.mat, pws=rep(0, ncol(design.mat)), cards=3,
# verbose=TRUE, vec.format="WIN")
#
# Given a design matrix and vector of part worths, returns a vector of which
# card wins each comparison set
# NOTE: In v0.2, this function is primarily for test purposes -- it only uses
# aggregate level part worths, plus optional random noise
#
# PARAMETERS
# design.mat : the design matrix in dummy-coded (0/1) format
# pws : list of part worths to use
# NOTE: currently uses aggregate-level PWs only
# cards : number of cards per trial
# noise : add randomness into the choices?
# p.noise : fraction of choices that will be noisy, if noise==TRUE
# verbose : output more as it goes
# vec.format : format of the output, either "WIN" == 0/1 | "ANS" = 1,2,3, etc
#
# OUTPUT
# a vector of the winning cards for the design
#' Choose the winning concepts in a CBC design matrix, according to utilities
#'
#' @param design.mat A design matrix as created by \code{generateMNLrandomTab}
#' @param pws A vector of utility coefficients for each column in
#' \code{design.mat}. If missing, these are set to 0 and all winners will be
#' drawn randomly.
#' @param cards How many concepts are seen on each trial in the design matric.
#' @param noise Whether to randomly make some choices (instead of using the
#' utilties)
#' @param p.noise The odds of randomly choosing a winner (instead of using
#' the utilities)
#' @param use.MNL Not used currently, placeholder for future.
#' @param verbose Whether to output information as it runs.
#' @param vec.format "WIN" will mark the winning row in the design matrix
#' with a "1" and all others with "0". This would match a typical regression
#' model to estimate coefficients from the observations.
#' "ANS" will put the row number of the winning concept into the rows for
#' each trial; some other software expects that format.
#'
#' @return A vector identifying which concept won each trial, according to the
#' design matrix and supplied utilities.
#' @seealso [generateMNLrandomTab] to create a design matrix,
#' [generateRNDpws] to create a random set of utilities.
#'
pickMNLwinningCards <- function(design.mat, pws=rep(0, ncol(design.mat)),
cards=3, noise=FALSE, p.noise=0.3,
use.MNL=TRUE, ## to do -- choose according to MNL roulette
verbose=TRUE, vec.format="WIN")
{
# how many sets of comparisons are there?
n.trial <- nrow(design.mat)/cards
vec.win <- rep(0, n.trial)
## pws.mat <- as.matrix(pws) ## ** TO DO: handle vector OR a matrix
pws.mat <- pws
# iterate over every set of comparisons and find the winner
# ** TO DO
# ** would be nice to vectorize this -- [chris] see preference share prediction routines in portfolio model code
for (i in 1:n.trial)
{
card.mat <- design.mat[((i-1)*cards+1):(i*cards),]
card.util <- exp(pws.mat %*% t(card.mat))
which.win <- which(card.util==max(card.util))
if (noise) {
if (runif(1) < p.noise) {
which.win <- sample(1:cards, 1)
}
}
if (length(which.win) > 1) {
which.win <- sample(which.win, 1) # choose among multiple winners randomly
}
vec.win[i] <- which.win
if (verbose) {
if (i/2000 == floor(i/2000)) {
cat("Processing trial: ",i,"\n")
}
}
}
# expand this to the desired format (WIN==1/0 format by card; ANS=card number)
if (vec.format == "WIN") { # cards marked 1/0 according to whether they won or not
vec.tmp <- ifelse(rep(vec.win,each=cards)==rep(c(1:cards),n.trial),1,0)
vec.win <- vec.tmp
} else if (vec.format == "ANS") {
vec.tmp <- rep(vec.win,each=cards)
vec.win <- vec.tmp
} else {
warning("Parameter 'vec.format' specified incorrectly (should be 'WIN' or 'ANS'). Raw card list returned (1 card per set).")
}
return(vec.win)
}
#############################################
# generateRNDpws
#############################################
# function(attrs)
#
# given a vector of attributes/features, create random zero-sum part worths
#
# PARAMETERS
# attrs : a vector where each element represents how many levels there are for the corresponding attribute
# e.g., c(2,3,2) would be 3 attributes, with 2 levels, 3 levels, and 2 levels respectively
# mu : mean of the randomly drawn part-worths, if you want to ensure some of them are far from zero
# of course the return vector will still be zero-sum
# stdev : sd of the randomly drawn part-worths, if you want to ensure some different scale factor
#
# OUTPUT
# a vector of zero-centered (within attribute) random normal partworths (mean=mu, sd=stdev)
#
#' Generate some zero-centered part worths (utilities) for CBC design
#'
#' @param attrs A vector with the attribute/level design, as noted for
#' \code{generateMNLrandomTab}#'
#' @param mu The mean of the random utilities
#' @param sdev Standard deviation of the utilities
#'
#' @returns A vector of random values that sum to 0 for each subset of values
#' that correspond to one attribute in a CBC attribute/level design.
#'
#' @seealso [generateMNLrandomTab] to create a design matrix,
#' [pickMNLwinningCards] to select the winning concepts from it, given
#' some set of utilities (as from this function).
generateRNDpws <- function(attrs, mu=0, sdev=1)
{
pw.vec <- NULL
for (i in 1:length(attrs))
{
if (attrs[i] > 1) { # more than 1 feature so OK to generate
pw.car <- rnorm(attrs[i]-1, mu, sdev)
pw.cdr <- -1*sum(pw.car) # the final pw needed to make zero-sum
pw.vec <- c(pw.vec, sample(c(pw.car, pw.cdr))) # shuffle and add to list
} else if (attrs[1] < 1) { # attribute with <1 level
warning("Attribute level is missing (# features < 1).")
} else { # exactly 1 level so PW must be 0
pw.vec <- c(pw.vec, 0)
}
}
return(pw.vec)
}
#########################
# fillCBCwinners()
#########################
#
# utility function. In a vector of CBC choices, replaces any NAs with random choices
#
# PARAMETERS
# win.cards = vector of CBC winning cards, e.g., c(1,3,2,3,2,1,1,3,2...)
# OUTPUT
# vector with NA values replaced by random choices in [1..max(win.card)]
#
fillCBCwinners <- function(win.cards) {
if (any(!is.na(win.cards))) {
win.cards[is.na(win.cards)] <- sample(max(win.cards, na.rm=T), sum(is.na(win.cards)), replace=TRUE)
} else {
warning ("All cards in fillCBCwinners are NA. Assuming cards per trial = 3.")
win.cards[is.na(win.cards)] <- sample(3, sum(is.na(win.cards)), replace=TRUE)
}
return(win.cards)
}
#########################
# expandCBCwinners()
#########################
#
# utility function. In a vector of CBC choices, replaces any NAs with random choices
#
# PARAMETERS
# win.cards = vector of CBC winning cards, e.g., c(1,3,2,3,2,1,1,3,2...)
# cards = how many cards per trial
# fill = whether to replace NA values (missing responses) with random responses
# OUTPUT
# vector marking each concept as 0/1 for lost/won, of length (cards*length(win.cards))
#
expandCBCwinners <- function(win.cards, cards=3, fill=TRUE) {
if (fill) {
win.cards <- fillCBCwinners(win.cards)
}
win.exp <- rep(0, length(win.cards)*cards) # vector to hold card winners. by default all choices are "no"
win.ind <- seq(from=0, to=length(win.exp)-1, by=3) + win.cards # the 1/0 location of just the winning cards
win.exp[win.ind] <- 1 # set the winners
return(win.exp)
}