latest

eleonore-schneeg · Apr 11, 2023 · f94ad3b · f94ad3b
1 parent 49b0b3d
commit f94ad3b
Show file tree

Hide file tree

Showing 29 changed files with 460 additions and 9 deletions.
diff --git a/.github/workflows/r_package_dev_ele_new.yaml b/.github/workflows/r_package_dev_ele_new.yaml
@@ -46,6 +46,6 @@ jobs:
 
       - name: push service
         run: |
-          docker push eleonoreschneeg/omix:${{ env.packageVersion }}
-          docker push eleonoreschneeg/omix:latest
-          docker push eleonoreschneeg/omix:dev
+          docker push eleonore-schneeg/omix:${{ env.packageVersion }}
+          docker push eleonore-schneeg/omix:latest
+          docker push eleonore-schneeg/omix:dev
diff --git a/Dockerfile b/Dockerfile
@@ -171,6 +171,8 @@ statmod \
 systemfonts \
 viridis \
 visNetwork \
+intNMF \
+ActivePathways \
 && rm -rf /tmp/downloaded_packages
 
 ## Install Bioconductor packages

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(DE_multiomics)
 export(GSEA)
+export(ask_chatgpt)
 export(batch_correction_protein)
 export(cell_type_enrichment)
 export(clustering_DE_analysis)
@@ -16,10 +17,12 @@ export(filter_protein)
 export(format_res_deseq)
 export(format_res_limma)
 export(generate_multiassay)
+export(getClustNum)
 export(integrate_with_DIABLO)
 export(integrate_with_MBPLS)
 export(integrate_with_MEIFESTO)
 export(integrate_with_MOFA)
+export(integrate_with_iCluster)
 export(integrate_with_sMBPLS)
 export(integrative_results_supervised)
 export(integrative_results_unsupervised)
@@ -31,6 +34,7 @@ export(multiomics_network_cluster)
 export(multiomics_network_matrix)
 export(pathway_analysis_enrichr)
 export(plot_OpenTarget)
+export(plot_optimal_cluster)
 export(process_protein)
 export(process_rna)
 export(processed_proteomics)
@@ -45,6 +49,7 @@ export(volcano_interactive)
 export(volcano_interactive_comparison)
 export(volcano_plot_deseq)
 export(volcano_plot_limma)
+import(SNFtool)
 import(ggplot2)
 import(igraph)
 importFrom(AnnotationDbi,mapIds)
@@ -55,6 +60,7 @@ importFrom(DESeq2,results)
 importFrom(EWCE,bootstrap_enrichment_test)
 importFrom(EWCE,ewce_plot)
 importFrom(EnsDb.Hsapiens.v86,EnsDb.Hsapiens.v86)
+importFrom(IntNMF,nmf.opt.k)
 importFrom(MOFA2,correlate_factors_with_covariates)
 importFrom(MOFA2,create_mofa)
 importFrom(MOFA2,get_default_data_options)
@@ -112,11 +118,14 @@ importFrom(enrichR,enrichr)
 importFrom(enrichplot,pairwise_termsim)
 importFrom(fgsea,collapsePathways)
 importFrom(fgsea,fgsea)
+importFrom(ggplot2,alpha)
 importFrom(ggplot2,ggsave)
 importFrom(ggplot2,ggtitle)
 importFrom(ggpubr,ggscatter)
 importFrom(ggrepel,geom_text_repel)
 importFrom(httr,POST)
+importFrom(httr,add_headers)
+importFrom(iClusterPlus,iClusterBayes)
 importFrom(jsonlite,fromJSON)
 importFrom(limma,contrasts.fit)
 importFrom(limma,eBayes)
@@ -132,6 +141,8 @@ importFrom(mixOmics,block.spls)
 importFrom(mixOmics,block.splsda)
 importFrom(mixOmics,selectVar)
 importFrom(mixOmics,tune.block.splsda)
+importFrom(mogsa,mbpca)
+importFrom(mogsa,moGap)
 importFrom(msigdbr,msigdbr)
 importFrom(org.Hs.eg.db,org.Hs.eg.db)
 importFrom(org.Mm.eg.db,org.Mm.eg.db)
@@ -154,6 +165,7 @@ importFrom(stats,as.formula)
 importFrom(stats,model.matrix)
 importFrom(stats,prcomp)
 importFrom(stats,reorder)
+importFrom(stringr,str_trim)
 importFrom(stringr,str_wrap)
 importFrom(tibble,enframe)
 importFrom(tidyr,pivot_wider)

diff --git a/R/chatgpt.R b/R/chatgpt.R
@@ -0,0 +1,27 @@
+#' Calls chatGPT with a specific prompt
+#'
+#' @param prompt Character vector
+#' @param api_key Personal API key
+#'
+#' @return ChatGPT answer
+#' @export
+#' @importFrom httr add_headers
+#' @importFrom stringr str_trim
+
+ask_chatgpt <- function(prompt,
+                        api_key="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") {
+  response <- POST(
+    url = "https://api.openai.com/v1/chat/completions",
+    httr::add_headers(Authorization = paste("Bearer", api_key)),
+    content_type_json(),
+    encode = "json",
+    body = list(
+      model = "gpt-3.5-turbo",
+      messages = list(list(
+        role = "user",
+        content = prompt
+      ))
+    )
+  )
+  stringr::str_trim(content(response)$choices[[1]]$message$content)
+}
diff --git a/R/integration_visualisations.R b/R/integration_visualisations.R
@@ -439,3 +439,42 @@ community_graph <- function(igraph,
 
   return(res)
 }
+
+
+#' Plot optimal number of clusters from `getClustNum()`
+#'
+#' @param optk1 Cluster Prediction Index
+#' @param optk2 Gap statistics
+#'
+#' @return plot
+#' @export
+#' @family Plotting
+#' @importFrom ggplot2 alpha
+
+plot_optimal_cluster <- function(optk1,
+                                 optk2,
+                                 try.N.clust) {
+  par(bty = "o", mgp = c(1.9, .33, 0), mar = c(3.1, 3.1, 2.1, 3.1) + .1, las = 1, tcl = -.25)
+  plot(NULL, NULL,
+       xlim = c(min(try.N.clust), max(try.N.clust)),
+       ylim = c(0, 1),
+       xlab = "Number of Multi-Omics Clusters", ylab = ""
+  )
+  rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col = "#EAE9E9", border = FALSE)
+  grid(col = "white", lty = 1, lwd = 1.5)
+  points(try.N.clust, apply(optk1, 1, mean), pch = 19, col = ggplot2::alpha("#224A8D"), cex = 1.5)
+  lines(try.N.clust, apply(optk1, 1, mean), col = "#224A8D", lwd = 2, lty = 4)
+  mtext("Cluster Prediction Index", side = 2, line = 2, cex = 1.5, col = "#224A8D", las = 3)
+
+  par(new = TRUE, xpd = FALSE)
+  plot(NULL, NULL,
+       xlim = c(min(try.N.clust), max(try.N.clust)),
+       ylim = c(0, 1),
+       xlab = "", ylab = "", xaxt = "n", yaxt = "n"
+  )
+  points(try.N.clust, optk2$gap, pch = 19, col = ggplot2::alpha("#E51718", 0.8), cex = 1.5)
+  lines(try.N.clust, optk2$gap, col = "#E51718", lwd = 2, lty = 4)
+  axis(side = 4, at = seq(0, 1, 0.2), labels = c("0.0", "0.2", "0.4", "0.6", "0.8", "1.0"))
+  mtext("Gap-statistics", side = 4, line = 2, las = 3, cex = 1.5, col = "#E51718")
+}
+
diff --git a/R/integrative_models.R b/R/integrative_models.R
@@ -117,9 +117,10 @@ integrate_with_sMBPLS <- function(multimodal_omics,
                                   Y,
                                   design,
                                   ncomp,
-                                  list.keepX = list(mRNA = c(50),
-                                                    proteins = c(50))
-                                  ) {
+                                  list.keepX = list(
+                                    mRNA = c(50),
+                                    proteins = c(50)
+                                  )) {
   multimodal_omics <- lapply(multimodal_omics, t)
   X <- list(
     mRNA = multimodal_omics[[1]],
@@ -324,8 +325,7 @@ integrate_with_MEIFESTO <- function(multimodal_omics,
                                     num_factors = 5,
                                     scale_views = TRUE,
                                     metadata,
-                                    time = "pseudotime"
-                                    ) {
+                                    time = "pseudotime") {
   # python_path <- Sys.which("python")
   # reticulate::use_python(python_path, required = NULL)
 
@@ -373,3 +373,178 @@ integrate_with_MEIFESTO <- function(multimodal_omics,
     model = model
   ))
 }
+
+#' @name getClustNum
+#' @title Get estimation of optimal clustering number
+#' @description This function provides two measurements (i.e., clustering prediction index [CPI] and Gap-statistics) and aims to search the optimal number for multi-omics integrative clustering. In short, the peaks reach by the red (CPI) and blue (Gap-statistics) lines should be referred to determine `N.clust`.
+#' @param data List of matrices.
+#' @param is.binary A logicial vector to indicate if the subdata is binary matrix of 0 and 1 such as mutation.
+#' @param try.N.clust A integer vector to indicate possible choices of number of clusters.
+#' @param center A logical value to indicate if the variables should be centered. TRUE by default.
+#' @param scale A logical value to indicate if the variables should be scaled. FALSE by default.
+#' @export
+#' @return A figure that helps to choose the optimal clustering number (argument of `N.clust`) for `get%algorithm_name%()` or `getMOIC()`, and a list contains the following components:
+#'
+#'         \code{CPI}   possible cluster number identified by clustering prediction index
+#'
+#'         \code{Gapk}  possible cluster number identified by Gap-statistics
+#'         
+#' @family Multi-omic integration
+#' @importFrom IntNMF nmf.opt.k
+#' @importFrom  mogsa mbpca moGap
+#' @import SNFtool
+#' @importFrom ggplot2 alpha
+#' @importFrom dplyr %>%
+#' @examples # There is no example and please refer to vignette.
+#' @references Chalise P, Fridley BL (2017). Integrative clustering of multi-level omic data based on non-negative matrix factorization algorithm. PLoS One, 12(5):e0176278.
+#'
+#' Tibshirani, R., Walther, G., Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. J R Stat Soc Series B Stat Methodol, 63(2):411-423.
+#' 
+getClustNum <- function(data = NULL,
+                        is.binary = rep(FALSE, length(data)),
+                        try.N.clust = 2:8,
+                        center = TRUE,
+                        scale = TRUE) {
+  # check data
+  n_dat <- length(data)
+  if (n_dat > 2) {
+    stop("current version of Omix can support up to 2 datasets.")
+  }
+  if (n_dat < 2) {
+    stop("current version of Omix  needs at least 2 omics data.")
+  }
+
+  data.backup <- data # save a backup
+
+  #--------------------------------------------#
+  # Cluster Prediction Index (CPI) from IntNMF #
+  # remove features that made of categories not equal to 2 otherwise Error in svd(X) : a dimension is zero
+  if (!all(!is.binary)) {
+    bindex <- which(is.binary == TRUE)
+    for (i in bindex) {
+      a <- which(rowSums(data[[i]]) == 0)
+      b <- which(rowSums(data[[i]]) == ncol(data[[i]]))
+      if (length(a) > 0) {
+        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != 0), ] # remove all zero
+      }
+
+      if (length(b) > 0) {
+        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != ncol(data[[i]])), ] # remove all one
+      }
+
+      if (length(a) + length(b) > 0) {
+        message(paste0("--", names(data)[i], ": a total of ", length(a) + length(b), " features were removed due to the categories were not equal to 2!"))
+      }
+    }
+  }
+
+  # In order to make the input data fit non-negativity constraint of intNMF,
+  # the values of the data were shifted to positive direction by adding absolute value of the smallest negative number.
+  # Further, each data was rescaled by dividing by maximum value of the data to make the magnitudes comparable (between 0 and 1) across the several datasets.
+  dat <- lapply(data, function(dd) {
+    if (!all(dd >= 0)) dd <- pmax(dd + abs(min(dd)), 0) + .Machine$double.eps # .Machine$double.eps as The smallest positive floating-point number x
+    dd <- dd / max(dd)
+    return(dd %>% as.matrix())
+  })
+
+  # dat <- lapply(dat, t)
+  dat <- lapply(dat, function(x) t(x) + .Machine$double.eps)
+
+  message("calculating Cluster Prediction Index...")
+  optk1 <- IntNMF::nmf.opt.k(
+    dat = dat,
+    n.runs = 5,
+    n.fold = 5,
+    k.range = try.N.clust,
+    st.count = 10,
+    maxiter = 100,
+    make.plot = FALSE
+  )
+  optk1 <- as.data.frame(optk1)
+
+  #-------------------------------#
+  # Gap-statistics from MoCluster #
+  message("calculating Gap-statistics...")
+  moas <- data.backup %>% mogsa::mbpca(
+    ncomp = 2,
+    k = "all",
+    method = "globalScore",
+    center = center,
+    scale = scale,
+    moa = TRUE,
+    svd.solver = "fast",
+    maxiter = 1000,
+    verbose = FALSE
+  )
+  gap <- mogsa::moGap(moas, K.max = max(try.N.clust), cluster = "hclust", plot = FALSE)
+  optk2 <- as.data.frame(gap$Tab)[-1, ] # remove k=1
+
+
+  N.clust <- as.numeric(which.max(apply(optk1, 1, mean) + optk2$gap)) + 1
+  if (length(N.clust) == 0) {
+    message("--fail to define the optimal cluster number!")
+    N.clust <- "null"
+  }
+
+
+  if (N.clust > 1) {
+    message(paste0("--the imputed optimal cluster number is ", N.clust, " arbitrarily, but it would be better referring to other priori knowledge."))
+  }
+  # return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2, Eigen = optk3))
+  return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2))
+}
+
+#'  Vertical integration with iCluster
+#'
+#' @param multimodal_omics multimodal_object generated by `.get_multimodal_object`
+#' @param try.N.clust Range of number of potential clusters in the optimisation step
+#' @export
+#' 
+#' @return a list object with `multimodal_object` and `model` slots
+#' 
+#' @family Multi-omic integration
+#' @importFrom iClusterPlus iClusterBayes
+
+integrate_with_iCluster <- function(multimodal_omics,
+                                    try.N.clust = 2:6) {
+  cli::cli_alert_success("Optimising the number of cluster (this make take a while")
+
+  args <- list(
+    data = multimodal_omics,
+    is.binary = c(F, F),
+    try.N.clust = try.N.clust,
+    center = TRUE,
+    scale = TRUE)
+
+  optk.i <- do.call(getClustNum, args)
+
+  optimal_n <- optk.i$N.clust
+
+  m1 <- t(multimodal_omics[[1]])
+  m2 <- t(multimodal_omics[[2]])
+
+  cli::cli_alert_success("Integration in progress (this make take a while)")
+  model <- iClusterPlus::iClusterBayes(
+    dt1 = m1,
+    dt2 = m2,
+    dt3 = NULL, dt4 = NULL, dt5 = NULL, dt6 = NULL,
+    type = c("gaussian", "gaussian"),
+    K = optimal_n - 1,
+    n.burnin = 1000,
+    n.draw = 1200,
+    prior.gamma = rep(0.1, 6),
+    sdev = 0.5,
+    beta.var.scale = 1,
+    thin = 1,
+    pp.cutoff = 0.5
+  )
+  return(list(
+    multimodal_object = multimodal_omics,
+    model = model,
+    tuning = list(
+      optk.i = optk.i,
+      try.N.clust = try.N.clust
+    )
+  ))
+}
+
diff --git a/man/ask_chatgpt.Rd b/man/ask_chatgpt.Rd