forked from hadley/plyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjoin.r
158 lines (136 loc) · 5.17 KB
/
join.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#' Join two data frames together.
#'
#' Join, like merge, is designed for the types of problems
#' where you would use a sql join.
#'
#' The four join types return:
#'
#' \itemize{
#' \item \code{inner}: only rows with matching keys in both x and y
#' \item \code{left}: all rows in x, adding matching columns from y
#' \item \code{right}: all rows in y, adding matching columns from x
#' \item \code{full}: all rows in x with matching columns in y, then the
#' rows of y that don't match x.
#' }
#'
#' Note that from plyr 1.5, \code{join} will (by default) return all matches,
#' not just the first match, as it did previously.
#'
#' Unlike merge, preserves the order of x no matter what join type is used.
#' If needed, rows from y will be added to the bottom. Join is often faster
#' than merge, although it is somewhat less featureful - it currently offers
#' no way to rename output or merge on different variables in the x and y
#' data frames.
#'
#' @param x data frame
#' @param y data frame
#' @param by character vector of variable names to join by
#' @param type type of join: left (default), right, inner or full. See
#' details for more information.
#' @param match how should duplicate ids be matched? Either match just the
#' \code{"first"} matching row, or match \code{"all"} matching rows.
#' @keywords manip
#' @export
#' @examples
#' first <- ddply(baseball, "id", summarise, first = min(year))
#' system.time(b2 <- merge(baseball, first, by = "id", all.x = TRUE))
#' system.time(b3 <- join(baseball, first, by = "id"))
#'
#' b2 <- arrange(b2, id, year, stint)
#' b3 <- arrange(b3, id, year, stint)
#' stopifnot(all.equal(b2, b3))
join <- function(x, y, by = intersect(names(x), names(y)), type = "left", match = "all") {
type <- match.arg(type, c("left", "right", "inner", "full"))
match <- match.arg(match, c("first", "all"))
if (missing(by)) {
message("Joining by: ", paste(by, collapse = ", "))
}
switch(match,
"first" = join_first(x, y, by, type),
"all" = join_all(x, y, by, type))
}
join_first <- function(x, y, by, type) {
keys <- join.keys(x, y, by = by)
new.cols <- setdiff(names(y), by)
if (type == "inner") {
x.match <- match(keys$y, keys$x, 0)
y.match <- match(keys$x, keys$y, 0)
cbind(x[x.match, , drop = FALSE], y[y.match, new.cols, drop = FALSE])
} else if (type == "left") {
y.match <- match(keys$x, keys$y)
y.matched <- unrowname(y[y.match, new.cols, drop = FALSE])
cbind(x, y.matched)
} else if (type == "right") {
if (any(duplicated(keys$y))) {
stop("Duplicated key in y", call. = FALSE)
}
new.cols <- setdiff(names(x), by)
x.match <- match(keys$y, keys$x)
x.matched <- unrowname(x[x.match, , drop = FALSE])
cbind(y, x.matched[, new.cols, drop = FALSE])
} else if (type == "full") {
# x with matching y's then any unmatched ys
y.match <- match(keys$x, keys$y)
y.matched <- unrowname(y[y.match, new.cols, drop = FALSE])
y.unmatch <- is.na(match(keys$y, keys$x))
rbind.fill(cbind(x, y.matched), y[y.unmatch, , drop = FALSE])
}
}
# Basic idea to perform a full cartesian product of the two data frames
# and then evaluate which rows meet the merging criteria. But that is
# horrendously inefficient, so we do various types of hashing, implemented
# in R as split_indices
join_all <- function(x, y, by, type) {
new.cols <- setdiff(names(y), by)
if (type == "inner") {
ids <- join_ids(x, y, by)
out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, new.cols, drop = FALSE])
} else if (type == "left") {
ids <- join_ids(x, y, by, all = TRUE)
out <- cbind(x[ids$x, , drop = FALSE], y[ids$y, new.cols, drop = FALSE])
} else if (type == "right") {
# Flip x and y, but make sure to put new columns in the right place
new.cols <- setdiff(names(x), by)
ids <- join_ids(y, x, by, all = TRUE)
out <- cbind(y[ids$x, , drop = FALSE], x[ids$y, new.cols, drop = FALSE])
} else if (type == "full") {
# x's with all matching y's, then non-matching y's - just the same as
# join.first
ids <- join_ids(x, y, by, all = TRUE)
matched <- cbind(x[ids$x, , drop = FALSE],
y[ids$y, new.cols, drop = FALSE])
unmatched <- y[setdiff(seq_len(nrow(y)), ids$y), , drop = FALSE]
out <- rbind.fill(matched, unmatched)
}
unrowname(out)
}
join_ids <- function(x, y, by, all = FALSE) {
keys <- join.keys(x, y, by = by)
ys <- split_indices(seq_along(keys$y), keys$y, keys$n)
length(ys) <- keys$n
if (all) {
# replace NULL with NA to preserve those x's without matching y's
nulls <- vapply(ys, function(x) length(x) == 0, logical(1))
ys[nulls] <- list(NA)
}
ys <- ys[keys$x]
xs <- rep(seq_along(keys$x), vapply(ys, length, numeric(1)))
list(x = xs, y = unlist(ys))
}
#' Join keys.
#' Given two data frames, create a unique key for each row.
#'
#' @param x data frame
#' @param y data frame
#' @param by character vector of variable names to join by
#' @keywords internal
#' @export
join.keys <- function(x, y, by) {
joint <- rbind.fill(x[by], y[by])
keys <- id(joint, drop = TRUE)
list(
x = keys[1:nrow(x)],
y = keys[-(1:nrow(x))],
n = attr(keys, "n")
)
}