Commit

Merge pull request opentargets#19 from opentargets/ds_2585_finngen_study_fix

Slight update on creating Finngen studies.
ireneisdoomed authored May 9, 2022
2 parents c558141 + 6c227fd commit 5f9c614
Showing 5 changed files with 25 additions and 22 deletions.
3 changes: 1 addition & 2 deletions README.md
@@ -48,8 +48,7 @@ gcloud auth application-default login
 sudo apt update
 sudo apt install -yf \
   openjdk-13-jre-headless \
-  python3-pip \
-  jq
+  python3-pip
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
 chmod +x Miniconda3-latest-Linux-x86_64.sh
 ./Miniconda3-latest-Linux-x86_64.sh
20 changes: 14 additions & 6 deletions scripts/make_FINNGEN_study_table.py
@@ -16,19 +16,27 @@ def main(input_path: str, output_path: str) -> None:
     logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
 
     # Read manifest
-    manifest = pd.read_json(input_path, lines=True).rename(
-        columns={
+    manifest = (
+        pd.read_json(input_path, orient='records')
+        .filter(items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls'])
+
+        # When phenostring is not provided, the phenotype is taken from the phenocode
+        .assign(phenostring=lambda df: df.apply(
+            lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
+            axis=1)
+        )
+
+        # Renaming columns to accommodate the OTG schema:
+        .rename(columns={
             'phenocode': 'study_id',
             'phenostring': 'trait',
             'category': 'trait_category',
             'num_cases': 'n_cases',
             'num_controls': 'n_controls',
-        }
+        })
     )
+    logging.info(f"{input_path} has been loaded. Formatting...")
 
-    keep_columns = ['study_id', 'trait', 'trait_category', 'n_cases', 'n_controls']
-    manifest = manifest[keep_columns]
-    logging.info(f"{input_path} has been loaded. Formatting...")
 
     # Format table:
     manifest['study_id'] = 'FINNGEN_R6_' + manifest['study_id']
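For context, a minimal sketch of the new fallback logic (illustrative only; the toy records below are made up, not taken from the FinnGen manifest): when a record's phenostring is empty, the study's trait falls back to its phenocode.

```python
import pandas as pd

# Two made-up manifest records; the second lacks a phenostring
toy = pd.DataFrame([
    {'phenocode': 'AB1_ARTHROPOD', 'phenostring': 'Arthropod-borne viral diseases'},
    {'phenocode': 'AB1_OTHER', 'phenostring': ''},
])

out = (
    toy
    # Fall back to the phenocode when phenostring is empty
    .assign(phenostring=lambda df: df.apply(
        lambda row: row['phenostring'] if row['phenostring'] else row['phenocode'],
        axis=1)
    )
    .rename(columns={'phenocode': 'study_id', 'phenostring': 'trait'})
)
print(out)  # the second row's trait is 'AB1_OTHER'
```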
13 changes: 5 additions & 8 deletions scripts/merge_study_tables.py
@@ -1,11 +1,7 @@
+#!/usr/bin/env python
 """
 Merges studies from different sources into one
 """
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Ed Mountjoy
-#
 
 import argparse
 import logging
@@ -25,7 +21,6 @@ def main(in_gwascat: str, in_ukb: str, in_finngen: str, output_path: str) -> None:
     logging.info(f"{len(gwas)} studies from GWAS Catalog have been loaded. Formatting...")
     logging.info(f"{len(ukb)} studies from UK Biobank have been loaded. Formatting...")
     logging.info(f"{len(finngen)} studies from Finngen have been loaded. Formatting...")
-
 
     # Merge
     merged = pd.concat([gwas, ukb, finngen], sort=False)
@@ -37,7 +32,6 @@ def main(in_gwascat: str, in_ukb: str, in_finngen: str, output_path: str) -> None:
     logging.info(f"{len(merged)} studies have been saved in {output_path}. Exiting.")
 
 
-
 def parse_args():
     """
     Load command line args.
@@ -46,7 +40,10 @@ def parse_args():
     parser.add_argument('--in_gwascat', metavar="<str>", type=str, required=True)
     parser.add_argument('--in_ukb', metavar="<str>", type=str, required=True)
     parser.add_argument('--in_finngen', metavar="<str>", type=str, required=True)
-    parser.add_argument('--output', metavar="<str>", help=("Output merged file in parquet format"), type=str, required=True)
+    parser.add_argument(
+        '--output', metavar="<str>", help=("Output merged file in parquet format"),
+        type=str, required=True
+    )
     args = parser.parse_args()
 
     return args
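As a usage note, the merge itself is a plain concatenation; a minimal sketch (with made-up one-row tables standing in for the real inputs) of what `pd.concat(..., sort=False)` does here:

```python
import pandas as pd

# Made-up one-row study tables standing in for the three sources
gwas = pd.DataFrame({'study_id': ['GCST000001'], 'trait': ['Height']})
ukb = pd.DataFrame({'study_id': ['NEALE2_50_raw'], 'trait': ['Standing height']})
finngen = pd.DataFrame({'study_id': ['FINNGEN_R6_HEIGHT'], 'trait': ['Height'],
                        'n_cases': [100000]})

# Rows are stacked; sort=False keeps the existing column order rather than
# alphabetising it, and columns missing from a source are filled with NaN
merged = pd.concat([gwas, ukb, finngen], sort=False)
print(merged)
```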
8 changes: 4 additions & 4 deletions scripts/process_ld.py
@@ -19,8 +19,8 @@
 import numpy as np
 import argparse
 import pyspark.sql
-from pyspark.sql.types import *
-from pyspark.sql.functions import *
+from pyspark.sql.types import DoubleType, StructType, StringType, IntegerType
+from pyspark.sql.functions import col, pow, when, regexp_replace, split, lag, udf, log10, sqrt, desc
 from pyspark.sql.window import Window
 from scipy.stats import norm
 
@@ -92,7 +92,7 @@ def main():
             coln.replace('R_', 'Z_'),
             arctanh(col(coln))
         )
-    
+
     # Compute weighted average across populations
     data = data.withColumn('Z_overall',
         (
@@ -122,7 +122,7 @@ def main():
     data = data.filter(col('R2_overall') >= args.min_r2)
 
     # Drop unneeded columns
-    data = data.drop(*['Z_overall','R_overall', 'R_AFR',
+    data = data.drop(*['Z_overall', 'R_overall', 'R_AFR',
                        'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS', 'Z_AFR',
                        'Z_AMR', 'Z_EAS', 'Z_EUR', 'Z_SAS', 'index_variant_id'])
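The `arctanh` call above is the Fisher z-transform: correlations are not averaged directly but mapped to z-space, combined, and mapped back. A minimal numpy sketch of that idea (the correlations and weights below are invented, and the real script's population weighting may differ):

```python
import numpy as np

r = np.array([0.8, 0.6, 0.7])   # per-population LD correlations (invented)
w = np.array([0.2, 0.3, 0.5])   # per-population weights (invented)

z = np.arctanh(r)                        # Fisher z-transform
z_overall = np.sum(w * z) / np.sum(w)    # weighted average in z-space
r_overall = np.tanh(z_overall)           # back to a correlation
r2_overall = r_overall ** 2              # quantity compared against min_r2

print(f"R_overall={r_overall:.3f}, R2_overall={r2_overall:.3f}")
```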
3 changes: 1 addition & 2 deletions snakefiles/study_and_top_loci_tables.Snakefile
@@ -163,9 +163,8 @@ rule make_FINNGEN_studies_table:
         study_table = tmpdir + '/{version}/FINNGEN_study_table.json'
     shell:
         """
-        curl {params.finn_manifest} | jq -r '.[]| @json' > {tmpdir}/r6_finngen.json
         python scripts/make_FINNGEN_study_table.py \
-        --input {tmpdir}/r6_finngen.json \
+        --input {params} \
         --output {output}
         """

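This removal works together with the `orient='records'` change above: the FinnGen manifest is a top-level JSON array, which the old code could only ingest after jq reshaped it into JSON Lines, whereas `pd.read_json(..., orient='records')` parses the array (from a path, URL, or buffer) directly, so both the curl-into-jq step and the jq package in the README become unnecessary. A small sketch of the two read styles, on a made-up two-record manifest:

```python
import io
import pandas as pd

# A made-up manifest in the raw format: a top-level JSON array
raw = '[{"phenocode": "AB1", "num_cases": 91}, {"phenocode": "AB2", "num_cases": 50}]'
direct = pd.read_json(io.StringIO(raw), orient='records')

# The same data as JSON Lines, the shape the old jq step produced for lines=True
jsonl = '{"phenocode": "AB1", "num_cases": 91}\n{"phenocode": "AB2", "num_cases": 50}'
via_jq = pd.read_json(io.StringIO(jsonl), lines=True)

assert direct.equals(via_jq)  # identical tables either way
```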
