Skip to content

Commit

Permalink
Support for Dropbox's content hash (#158)
Browse files Browse the repository at this point in the history
* Support for Dropbox's content hash

As outlined in #157 - this implements dropbox's content hash
algorithm (https://www.dropbox.com/developers/reference/content-hash)

Most of the tests do not require dropbox access.
  • Loading branch information
richfitz authored and ClaytonJY committed Apr 6, 2018
1 parent 32bab93 commit b6ac801
Show file tree
Hide file tree
Showing 4 changed files with 197 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export("%>%")
export(drop_acc)
export(drop_auth)
export(drop_content_hash)
export(drop_copy)
export(drop_create)
export(drop_delete)
Expand Down
61 changes: 61 additions & 0 deletions R/drop_content_hash.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#' Compute a "content hash" using the same algorithm as dropbox. This
#' can be used to verify the content against the \code{content_hash}
#' field returned in \code{\link{drop_dir}}.
#'
#' Dropbox returns a hash of file contents in \code{\link{drop_dir}}.
#' However, this is not a straightforward file hash. Instead the file
#' is divided into 4MB chunks, each of those is hashed and then the
#' concatenation of the hashes is itself hashed (see
#' \href{https://www.dropbox.com/developers/reference/content-hash}{this
#' page} in the dropbox developer documentation for the details).
#' It's entirely unclear why it does not compute a hash of the file
#' itself, but here we are.
#'
#' An empty file has no chunks, so its content hash is the SHA-256 of
#' zero bytes.
#'
#' @title Compute Dropbox's content hash for one or more files
#'
#' @param file A vector of filenames
#'
#' @return A character vector the same length as \code{file}. Each
#'   element is a 64-character string which is the unique hash. Two
#'   files that have the same hash have the same contents. Compare
#'   this hash of a local file with the \code{content_hash} field from
#'   \code{\link{drop_dir}} to see if you have the same file as
#'   dropbox.
#'
#' @export
#' @examples
#' \dontrun{
#' write.csv(mtcars, file = "mtt.csv")
#' drop_upload("mtt.csv")
#' files <- drop_dir()
#' # Dropbox's reported hash
#' files$content_hash[files$name == "mtt.csv"]
#' # Our computed hash:
#' drop_content_hash("mtt.csv")
#' }
drop_content_hash <- function(file) {
  if (!is.character(file)) {
    stop("Expected 'file' to be a character vector")
  }
  # Vectorise by recursing over each element; this also makes a
  # zero-length input return character(0).
  if (length(file) != 1L) {
    return(vapply(file, drop_content_hash, character(1), USE.NAMES = FALSE))
  }

  con <- file(file, "rb")
  on.exit(close(con), add = TRUE)

  block_size <- 4L * 1024L * 1024L

  # Read until the connection is exhausted rather than trusting a block
  # count derived from file.size(): the hash then always reflects exactly
  # the bytes that were read, and an empty file simply yields no blocks.
  block_hashes <- list()
  repeat {
    bytes <- readBin(con, raw(1), block_size)
    if (length(bytes) == 0L) {
      break
    }
    block_hashes[[length(block_hashes) + 1L]] <- sha256(bytes, raw = TRUE)
  }

  # unlist() of an empty list is NULL, which cannot be hashed without
  # serialisation, so fall back to an empty raw vector explicitly.
  combined <- if (length(block_hashes) > 0L) unlist(block_hashes) else raw(0)

  sha256(combined)
}

# SHA-256 of `bytes` as given (no R serialisation is applied first).
# `raw` is forwarded to digest::digest() and selects the form of the
# returned hash; callers in this package pass raw = TRUE for per-block
# hashes that are concatenated and hashed again.
sha256 <- function(bytes, raw = FALSE) {
  digest::digest(
    object = bytes,
    algo = "sha256",
    serialize = FALSE,
    raw = raw
  )
}
45 changes: 45 additions & 0 deletions man/drop_content_hash.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

90 changes: 90 additions & 0 deletions tests/testthat/test-08-rdrop2-content-hash.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
context("testing drop_content_hash")

test_that("known good values", {
  # Regression fixtures: each payload below is paired with a hash that was
  # generated by hand and checked.  Nothing here needs dropbox access.
  block <- 4L * 1024L * 1024L
  full_block <- as.raw(rep(0:255, length.out = block))

  payloads <- list(
    # one short string
    charToRaw("hello rdrop2"),
    # one complete block
    full_block,
    # one block plus one byte
    c(full_block, as.raw(255L)),
    # several blocks
    as.raw(rep(0:255, length.out = 3.5 * block))
  )

  expected <- c(
    "1aa2b7623dfff520c4abdc1227d10bc4e229b74005b66b3458345830e556f4de",
    "894bbb52d1212d6bcbe9967f1a2169138c4d4af0c8dfbaeae86cd1d3f0c03faf",
    "9149387a91f71c7c2149b8427d15526b71c1a38d6c6f999ad71486a2ce788d57",
    "f61d3ae93fa4f7646d37949fc0885141bf682faa9a130896b93459c650335d0f"
  )

  # Write every payload to its own temporary file.
  paths <- vapply(
    payloads,
    function(bytes) {
      path <- tempfile()
      writeBin(bytes, path)
      path
    },
    character(1)
  )

  on.exit(unlink(paths))

  for (k in seq_along(paths)) {
    expect_equal(drop_content_hash(paths[[k]]), expected[[k]])
  }

  ## This is what is actually going on: hash each chunk to raw bytes,
  ## concatenate those hashes, then hash the concatenation.
  hash_of_chunk_hashes <- function(chunks) {
    sha256(unlist(lapply(chunks, sha256, raw = TRUE)), FALSE)
  }

  first_block <- seq_len(block)
  plus_one <- payloads[[3]]

  expect_equal(hash_of_chunk_hashes(list(payloads[[1]])), expected[[1]])
  expect_equal(hash_of_chunk_hashes(list(payloads[[2]])), expected[[2]])
  expect_equal(
    hash_of_chunk_hashes(list(plus_one[first_block], plus_one[-first_block])),
    expected[[3]]
  )
})

test_that("vectorisation", {
  # Write `bytes` to a fresh temporary file and return its path.
  write_bytes <- function(bytes) {
    path <- tempfile()
    writeBin(bytes, path)
    path
  }

  # Five small files, each a random permutation of the bytes 0..255.
  paths <- vapply(1:5, function(.) write_bytes(sample(as.raw(0:255))),
                  character(1))
  # Remove the fixtures when the test finishes, pass or fail.
  on.exit(unlink(paths), add = TRUE)

  # Reference values: hash each file on its own.
  hash <- vapply(paths, drop_content_hash, character(1))

  # A vector of paths gives the per-file hashes, without names.
  expect_equal(drop_content_hash(paths), unname(hash))
  # Zero-length input gives zero-length output.
  expect_equal(drop_content_hash(character(0)), character(0))
  # Repeated paths are hashed position by position.
  i <- c(1, 1, 2, 3)
  expect_equal(drop_content_hash(paths[i]), unname(hash)[i])
})

test_that("content_hash agrees with dropbox", {
  skip_on_cran()

  # Remote folder that will receive the upload.  traceless() is a helper
  # defined outside this file.
  remote_dir <- traceless("test-drop_dir")
  drop_create(remote_dir)

  # Local CSV to upload and later hash.
  local_csv <- traceless("test-drop_dir.csv")
  write.csv(mtcars, local_csv)
  drop_upload(local_csv, path = remote_dir)

  # Clean up the remote folder first, then the local file.
  on.exit({
    drop_delete(remote_dir)
    unlink(local_csv)
  })

  # The locally computed hash must equal the one in the folder listing.
  listing <- drop_dir(remote_dir)
  expect_equal(drop_content_hash(local_csv), listing$content_hash)
})


test_that("input validation", {
  # Anything that is not a character vector must be rejected up front.
  not_paths <- 1:10
  expect_error(
    drop_content_hash(not_paths),
    "Expected 'file' to be a character vector"
  )
})

0 comments on commit b6ac801

Please sign in to comment.