Skip to content

Commit

Permalink
Update evaluations
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeff1995 committed Aug 31, 2021
1 parent 5d75109 commit 91c178e
Show file tree
Hide file tree
Showing 34 changed files with 801 additions and 217 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# GLUE (Graph-Linked Unified Embedding)

[![license-badge](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
![version-badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/Jeff1995/e704b2f886ff6a37477311b90fdf7efa/raw/version.json)
[![pypi-badge](https://img.shields.io/pypi/v/scglue)](https://pypi.org/project/scglue)
[![docs-badge](https://readthedocs.org/projects/scglue/badge/?version=latest)](https://scglue.readthedocs.io/en/latest/?badge=latest)
[![build-badge](https://github.com/gao-lab/GLUE/actions/workflows/build.yml/badge.svg)](https://github.com/gao-lab/GLUE/actions/workflows/build.yml)
[![coverage-badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/Jeff1995/e704b2f886ff6a37477311b90fdf7efa/raw/coverage.json)](https://github.com/gao-lab/GLUE/actions/workflows/build.yml)
Expand All @@ -26,7 +26,6 @@ For more details, please check out our [preprint](https://www.biorxiv.org/conten
├── packrat # Reproducible R environment via packrat
├── env.yaml # Reproducible Python environment via conda
├── setup.py # Setup script for the Python package
├── release.sh # Script for releasing a new version
├── LICENSE
└── README.md
```
Expand Down
60 changes: 57 additions & 3 deletions data/collect/10x-Multiome-Pbmc10k.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
"metadata": {},
"outputs": [],
"source": [
"import anndata\n",
"import numpy as np\n",
"import pandas as pd\n",
"import networkx as nx\n",
"import scanpy as sc\n",
"import scipy.sparse\n",
"from networkx.algorithms.bipartite import biadjacency_matrix\n",
"\n",
"import scglue"
Expand Down Expand Up @@ -54,6 +56,17 @@
"atac"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frags2rna = scglue.data.bedmap2anndata(\"../download/10x-Multiome-Pbmc10k/pbmc_granulocyte_sorted_10k_atac_fragments.bedmap.gz\")\n",
"frags2rna.obs.index.name, frags2rna.var.index.name = \"cells\", \"genes\"\n",
"frags2rna"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -111,6 +124,17 @@
"atac.var.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frags2rna.obs[\"domain\"] = \"scATAC-seq\"\n",
"frags2rna.obs[\"protocol\"] = \"10x Multiome\"\n",
"frags2rna.obs[\"dataset\"] = \"10x-Multiome-Pbmc10k-FRAGS2RNA\""
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -135,7 +159,8 @@
"outputs": [],
"source": [
"rna.obs = rna.obs.join(meta)\n",
"atac.obs = atac.obs.join(meta)"
"atac.obs = atac.obs.join(meta)\n",
"frags2rna.obs = frags2rna.obs.join(meta)"
]
},
{
Expand All @@ -145,7 +170,8 @@
"outputs": [],
"source": [
"rna = rna[meta.index, :]\n",
"atac = atac[meta.index, :]"
"atac = atac[meta.index, :]\n",
"frags2rna = frags2rna[meta.index, :]"
]
},
{
Expand Down Expand Up @@ -193,6 +219,16 @@
"atac"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frags2rna = frags2rna[mask, :]\n",
"frags2rna"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -259,6 +295,23 @@
"atac"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"missing_vars = list(set(rna.var_names).difference(frags2rna.var_names))\n",
"frags2rna = anndata.concat([\n",
" frags2rna, anndata.AnnData(\n",
" X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),\n",
" obs=pd.DataFrame(index=frags2rna.obs_names), var=pd.DataFrame(index=missing_vars)\n",
" )\n",
"], axis=1, merge=\"first\")\n",
"frags2rna = frags2rna[:, rna.var_names].copy() # Keep the same features as RNA\n",
"frags2rna"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -290,7 +343,8 @@
"outputs": [],
"source": [
"rna.write_h5ad(\"../dataset/10x-Multiome-Pbmc10k-RNA.h5ad\", compression=\"gzip\")\n",
"atac.write_h5ad(\"../dataset/10x-Multiome-Pbmc10k-ATAC.h5ad\", compression=\"gzip\")"
"atac.write_h5ad(\"../dataset/10x-Multiome-Pbmc10k-ATAC.h5ad\", compression=\"gzip\")\n",
"frags2rna.write_h5ad(\"../dataset/10x-Multiome-Pbmc10k-FRAGS2RNA.h5ad\", compression=\"gzip\")"
]
}
],
Expand Down
3 changes: 2 additions & 1 deletion data/collect/Chen-2019.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@
"outputs": [],
"source": [
"rna.write(\"../dataset/Chen-2019-RNA.h5ad\", compression=\"gzip\")\n",
"atac.write(\"../dataset/Chen-2019-ATAC.h5ad\", compression=\"gzip\")"
"atac.write(\"../dataset/Chen-2019-ATAC.h5ad\", compression=\"gzip\")\n",
"!touch ../dataset/Chen-2019-FRAGS2RNA.h5ad # Sham file"
]
}
],
Expand Down
56 changes: 52 additions & 4 deletions data/collect/Ma-2020.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"rna_counts = pd.read_table(\"../../data/download/Ma-2020/GSM4156608_skin.late.anagen.rna.counts.txt.gz\", index_col=0)\n",
"rna_obs = pd.DataFrame(index=rna_counts.columns)\n",
"pd.DataFrame(index=rna_counts.index)\n",
"rna_obs.index = np.vectorize(lambda x: x.replace(\",\", \".\"))(rna_obs.index)\n",
"rna_obs.index = rna_obs.index.str.replace(\",\", \".\")\n",
"rna_var = pd.DataFrame(index=rna_counts.index)\n",
"rna_obs.index.name, rna_var.index.name = \"cells\", \"genes\"\n",
"rna = anndata.AnnData(\n",
Expand Down Expand Up @@ -152,6 +152,25 @@
"atac.var[\"genome\"] = \"mm10\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# FRAGS2RNA"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frags2rna = scglue.data.bedmap2anndata(\"../../data/download/Ma-2020/GSM4156597_skin.late.anagen.atac.fragments.bedmap.gz\")\n",
"frags2rna.obs.index = frags2rna.obs.index.str.replace(\",\", \".\")\n",
"frags2rna.obs.index.name, frags2rna.var.index.name = \"cells\", \"genes\"\n",
"frags2rna"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -213,7 +232,7 @@
" xs[-1] = atac_bc_map[xs[-1]]\n",
" return \".\".join(xs)\n",
"\n",
"cell_type[\"atac.bc\"] = map_atac_bc(cell_type[\"atac.bc\"])"
"cell_type[\"atac.bc.mapped\"] = map_atac_bc(cell_type[\"atac.bc\"])"
]
},
{
Expand All @@ -232,10 +251,21 @@
"metadata": {},
"outputs": [],
"source": [
"atac = atac[cell_type[\"atac.bc\"].to_numpy(), :]\n",
"atac = atac[cell_type[\"atac.bc.mapped\"].to_numpy(), :]\n",
"atac.obs[\"cell_type\"] = cell_type[\"celltype\"].to_numpy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"frags2rna = frags2rna[cell_type[\"atac.bc\"].to_numpy(), :]\n",
"frags2rna.obs[\"cell_type\"] = cell_type[\"celltype\"].to_numpy()\n",
"frags2rna.obs.index = atac.obs.index"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -302,6 +332,23 @@
"atac"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"missing_vars = list(set(rna.var_names).difference(frags2rna.var_names))\n",
"frags2rna = anndata.concat([\n",
" frags2rna, anndata.AnnData(\n",
" X=scipy.sparse.csr_matrix((frags2rna.shape[0], len(missing_vars))),\n",
" obs=pd.DataFrame(index=frags2rna.obs_names), var=pd.DataFrame(index=missing_vars)\n",
" )\n",
"], axis=1, merge=\"first\")\n",
"frags2rna = frags2rna[:, rna.var_names].copy() # Keep the same features as RNA\n",
"frags2rna"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -333,7 +380,8 @@
"outputs": [],
"source": [
"rna.write(\"../../data/dataset/Ma-2020-RNA.h5ad\", compression=\"gzip\")\n",
"atac.write(\"../../data/dataset/Ma-2020-ATAC.h5ad\", compression=\"gzip\")"
"atac.write(\"../../data/dataset/Ma-2020-ATAC.h5ad\", compression=\"gzip\")\n",
"frags2rna.write(\"../../data/dataset/Ma-2020-FRAGS2RNA.h5ad\", compression=\"gzip\")"
]
}
],
Expand Down
2 changes: 2 additions & 0 deletions data/download/10x-Multiome-Pbmc10k/preprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ set -e

Rscript wnn.r # Produces: wnn_meta_data.csv
Rscript doubletfinder.r # Produces: doubletfinder_inference.csv
zcat pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz | LC_ALL=C sort -k1,1 -k2,2n -k3,3n > pbmc_granulocyte_sorted_10k_atac_fragments.sorted.bed
bedmap --ec --delim "\t" --echo --echo-map-id ../../genome/gencode.v35.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed pbmc_granulocyte_sorted_10k_atac_fragments.sorted.bed | gzip > pbmc_granulocyte_sorted_10k_atac_fragments.bedmap.gz
2 changes: 1 addition & 1 deletion data/download/10x-Multiome-Pbmc10k/wnn.r
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
source("../../../.Rprofile", chdir = TRUE)
suppressPackageStartupMessages({
source("../../../.Rprofile", chdir = TRUE)
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
Expand Down
2 changes: 2 additions & 0 deletions data/download/Ma-2020/preprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@
set -e

tar xf GSE140203_RAW.tar
zcat GSM4156597_skin.late.anagen.atac.fragments.bed.gz | LC_ALL=C sort -k1,1 -k2,2n -k3,3n > GSM4156597_skin.late.anagen.atac.fragments.sorted.bed
bedmap --ec --delim "\t" --echo --echo-map-id ../../genome/gencode.vM25.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed GSM4156597_skin.late.anagen.atac.fragments.sorted.bed | gzip > GSM4156597_skin.late.anagen.atac.fragments.bedmap.gz
44 changes: 44 additions & 0 deletions data/genome/extract_genes_promoters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python

r"""
Extract genes with promoters BED files from GTF file
"""

import argparse
import pathlib

import scglue


def parse_args() -> argparse.Namespace:
r"""
Parse command line arguments
"""
parser = argparse.ArgumentParser()
parser.add_argument(
"--input-gtf", dest="input_gtf", type=pathlib.Path, required=True,
help="Path to input GTF file"
)
parser.add_argument(
"--promoter-len", dest="promoter_len", type=int, default=2000,
help="Promoter length"
)
parser.add_argument(
"--output-bed", dest="output_bed", type=pathlib.Path, required=True,
help="Path to output BED file"
)
return parser.parse_args()


def main(args: argparse.Namespace):
r"""
Main function
"""
gtf = scglue.genomics.read_gtf(args.input_gtf).query("feature == 'gene'").split_attribute()
bed = gtf.to_bed(name="gene_name").expand(args.promoter_len, 0)
bed = bed.drop_duplicates(subset=["chrom", "chromStart", "chromEnd"])
bed.write_bed(args.output_bed, ncols=6)


if __name__ == "__main__":
main(parse_args())
12 changes: 12 additions & 0 deletions data/genome/preprocess.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

set -e

python extract_genes_promoters.py --input-gtf gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz --output-bed gencode.v19.chr_patch_hapl_scaff.genes_with_promoters.bed
python extract_genes_promoters.py --input-gtf gencode.v35.chr_patch_hapl_scaff.annotation.gtf.gz --output-bed gencode.v35.chr_patch_hapl_scaff.genes_with_promoters.bed
python extract_genes_promoters.py --input-gtf gencode.vM10.chr_patch_hapl_scaff.annotation.gtf.gz --output-bed gencode.vM10.chr_patch_hapl_scaff.genes_with_promoters.bed
python extract_genes_promoters.py --input-gtf gencode.vM25.chr_patch_hapl_scaff.annotation.gtf.gz --output-bed gencode.vM25.chr_patch_hapl_scaff.genes_with_promoters.bed
LC_ALL=C sort -k1,1 -k2,2n -k3,3n gencode.v19.chr_patch_hapl_scaff.genes_with_promoters.bed > gencode.v19.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed
LC_ALL=C sort -k1,1 -k2,2n -k3,3n gencode.v35.chr_patch_hapl_scaff.genes_with_promoters.bed > gencode.v35.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed
LC_ALL=C sort -k1,1 -k2,2n -k3,3n gencode.vM10.chr_patch_hapl_scaff.genes_with_promoters.bed > gencode.vM10.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed
LC_ALL=C sort -k1,1 -k2,2n -k3,3n gencode.vM25.chr_patch_hapl_scaff.genes_with_promoters.bed > gencode.vM25.chr_patch_hapl_scaff.genes_with_promoters.sorted.bed
Loading

0 comments on commit 91c178e

Please sign in to comment.