-
Notifications
You must be signed in to change notification settings - Fork 59
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support for Dropbox's content hash (#158)
* Support for Dropbox's content hash. As outlined in #157, this implements Dropbox's content hash algorithm (https://www.dropbox.com/developers/reference/content-hash). Most of the tests do not require Dropbox access.
- Loading branch information
Showing
4 changed files
with
197 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#' Compute a "content hash" using the same algorithm as Dropbox. This
#' can be used to verify the content against the \code{content_hash}
#' field returned in \code{\link{drop_dir}}.
#'
#' Dropbox returns a hash of file contents in \code{\link{drop_dir}}.
#' However, this is not a straightforward file hash. Instead the file
#' is divided into 4MB chunks, each of those is hashed and then the
#' concatenation of the hashes is itself hashed (see
#' \href{https://www.dropbox.com/developers/reference/content-hash}{this
#' page} in the Dropbox developer documentation for the details).
#' It's entirely unclear why it does not compute a hash of the file
#' itself, but here we are.
#'
#' A zero-length file has no chunks, so its content hash is the SHA-256
#' of an empty input.
#'
#' @title Compute Dropbox's content hash for one or more files
#'
#' @param file A character vector of filenames. A non-character value
#'   raises an error.
#'
#' @return A character vector the same length as \code{file}. Each
#'   element is a 64-character string which is the hash of that file.
#'   Two files that have the same hash have the same contents. Compare
#'   this hash of a local file with the \code{content_hash} field from
#'   \code{\link{drop_dir}} to see if you have the same file as
#'   Dropbox.
#'
#' @export
#' @examples
#' \dontrun{
#' write.csv(mtcars, file = "mtt.csv")
#' drop_upload("mtt.csv")
#' files <- drop_dir()
#' # Dropbox's reported hash
#' files$content_hash[files$name == "mtt.csv"]
#' # Our computed hash:
#' drop_content_hash("mtt.csv")
#' }
drop_content_hash <- function(file) {
  if (!is.character(file)) {
    stop("Expected 'file' to be a character vector")
  }
  # Vectorise by recursing over each element; this also makes
  # character(0) return character(0).
  if (length(file) != 1L) {
    return(vapply(file, drop_content_hash, character(1), USE.NAMES = FALSE))
  }

  con <- file(file, "rb")
  # add = TRUE so this handler never replaces one registered elsewhere
  on.exit(close(con), add = TRUE)

  block_size <- 4L * 1024L * 1024L

  # Hash each 4MB block separately, keeping the raw (binary) digests
  n <- ceiling(file.size(file) / block_size)
  h <- vector("list", n)
  for (i in seq_len(n)) {
    bytes <- readBin(con, raw(1), block_size)
    h[[i]] <- sha256(bytes, raw = TRUE)
  }

  # unlist() of an empty list is NULL, which cannot be hashed with
  # serialize = FALSE; an empty file must hash zero bytes instead.
  block_digests <- if (n > 0) unlist(h) else raw(0)
  sha256(block_digests)
}
|
||
# Internal helper: SHA-256 of `bytes` taken as-is (no R serialization).
# With `raw = FALSE` the digest is returned as a hex string; with
# `raw = TRUE` it is returned as a raw vector.
sha256 <- function(bytes, raw = FALSE) {
  digest::digest(
    object = bytes,
    algo = "sha256",
    serialize = FALSE,
    raw = raw
  )
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
context("testing drop_content_hash")

test_that("known good values", {
  # Regression fixtures: the expected digests were generated by hand and
  # tested, so nothing here needs dropbox access.
  block_bytes <- 4L * 1024L * 1024L

  # a short string, exactly one block, one block plus a byte, several blocks
  short_bytes <- charToRaw("hello rdrop2")
  one_block <- as.raw(rep(0:255, length.out = block_bytes))
  block_plus_one <- c(one_block, as.raw(255L))
  many_blocks <- as.raw(rep(0:255, length.out = 3.5 * block_bytes))

  expected <- c(
    "1aa2b7623dfff520c4abdc1227d10bc4e229b74005b66b3458345830e556f4de",
    "894bbb52d1212d6bcbe9967f1a2169138c4d4af0c8dfbaeae86cd1d3f0c03faf",
    "9149387a91f71c7c2149b8427d15526b71c1a38d6c6f999ad71486a2ce788d57",
    "f61d3ae93fa4f7646d37949fc0885141bf682faa9a130896b93459c650335d0f"
  )

  # Write a raw vector to a fresh temporary file and return its path
  to_tempfile <- function(bytes) {
    target <- tempfile()
    writeBin(bytes, target)
    target
  }

  paths <- vapply(
    list(short_bytes, one_block, block_plus_one, many_blocks),
    to_tempfile,
    character(1)
  )

  on.exit(unlink(paths))

  for (k in seq_along(paths)) {
    expect_equal(drop_content_hash(paths[[k]]), expected[[k]])
  }

  ## This is what is actually going on: hash each block to raw bytes,
  ## concatenate the block digests, then hash the concatenation.
  expect_equal(sha256(sha256(short_bytes, raw = TRUE)), expected[[1]])
  expect_equal(sha256(sha256(one_block, raw = TRUE)), expected[[2]])

  first_block <- seq_len(block_bytes)
  head_digest <- sha256(block_plus_one[first_block], raw = TRUE)
  tail_digest <- sha256(block_plus_one[-first_block], raw = TRUE)
  expect_equal(sha256(c(head_digest, tail_digest)), expected[[3]])
})
|
||
test_that("vectorisation", {
  # Write a raw vector to a fresh temporary file and return its path
  write_bytes <- function(bytes) {
    path <- tempfile()
    writeBin(bytes, path)
    path
  }

  paths <- vapply(1:5, function(.) write_bytes(sample(as.raw(0:255))),
                  character(1))
  # Remove the temporary fixtures when the test finishes
  on.exit(unlink(paths), add = TRUE)

  # Reference values: hash each file individually (names dropped so they
  # compare cleanly against the unnamed vectorised result)
  hash <- vapply(paths, drop_content_hash, character(1), USE.NAMES = FALSE)

  expect_equal(drop_content_hash(paths), hash)
  # Zero-length input gives zero-length output
  expect_equal(drop_content_hash(character(0)), character(0))
  # Repeated paths are hashed element-by-element, preserving order
  i <- c(1, 1, 2, 3)
  expect_equal(drop_content_hash(paths[i]), hash[i])
})
|
||
test_that("content_hash agrees with dropbox", {
  skip_on_cran()

  # Each cleanup is registered as soon as its resource exists, so a
  # failure in a later step (e.g. the upload) still removes what was
  # already created.
  # NOTE(review): traceless() is defined elsewhere; presumably it builds
  # a unique name for test artefacts -- confirm against its definition.
  folder_name <- traceless("test-drop_dir")
  drop_create(folder_name)
  on.exit(drop_delete(folder_name), add = TRUE)

  file_name <- traceless("test-drop_dir.csv")
  write.csv(mtcars, file_name)
  on.exit(unlink(file_name), add = TRUE)

  drop_upload(file_name, path = folder_name)

  # The locally computed hash must match the one Dropbox reports
  info <- drop_dir(folder_name)
  expect_equal(drop_content_hash(file_name), info$content_hash)
})
|
||
|
||
test_that("input validation", {
  # Every non-character input must be rejected with the same message,
  # whatever its type or length.
  msg <- "Expected 'file' to be a character vector"
  expect_error(drop_content_hash(1:10), msg)
  expect_error(drop_content_hash(NULL), msg)
  expect_error(drop_content_hash(TRUE), msg)
  expect_error(drop_content_hash(list("a")), msg)
})