Skip to content

Commit

Permalink
Merge pull request opentargets#22 from opentargets/il-ukb-fix
Browse files Browse the repository at this point in the history
V2D Traits handling bug fix
  • Loading branch information
bruno-ariano authored Jul 8, 2022
2 parents 5f9c614 + 4284ee2 commit 8b9180e
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 24 deletions.
72 changes: 50 additions & 22 deletions scripts/make_UKB_study_table.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
"""
Processes UK Biobank's manifest to extract all studies and their metadata in the OTG format.
"""
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Ed Mountjoy
#

import argparse
import logging
from collections import OrderedDict
import re

import pandas as pd

Expand All @@ -19,7 +15,9 @@ def main(input_path: str, output_path: str) -> None:
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)

# Only keep required cols
to_keep = OrderedDict([("code", "study_id"), ("n_total", "n_total"), ("n_cases", "n_cases")])
to_keep = OrderedDict(
[("code", "study_id"), ('trait', 'trait_raw'), ("n_total", "n_total"), ("n_cases", "n_cases")]
)

# Load manifest
manifest = pd.read_csv(input_path, sep="\t", header=0, dtype=object).filter(items=to_keep).rename(columns=to_keep)
Expand All @@ -30,6 +28,9 @@ def main(input_path: str, output_path: str) -> None:
# Add other columns -------------------------------------------------------
#

# Process traits to a nicer format
manifest["trait_reported"] = manifest['trait_raw'].apply(make_trait_reported_string)

# Vector to get Neale or SAIGE studies
is_neale = manifest["study_id"].str.startswith("NEALE2_")
is_saige = manifest["study_id"].str.startswith("SAIGE_")
Expand All @@ -50,22 +51,25 @@ def main(input_path: str, output_path: str) -> None:
manifest.loc[:, "ancestry_replication"] = ""

# Output required columns
cols = OrderedDict(
[
("study_id", "study_id"),
("pmid", "pmid"),
("pub_date", "pub_date"),
("pub_journal", "pub_journal"),
("pub_title", "pub_title"),
("pub_author", "pub_author"),
("ancestry_initial", "ancestry_initial"),
("ancestry_replication", "ancestry_replication"),
("n_initial", "n_initial"),
("n_cases", "n_cases"),
("n_replication", "n_replication"),
]
)
manifest = manifest.loc[:, list(cols.keys())].rename(columns=cols)
cols = [
"study_id",
"pmid",
"pub_date",
"pub_journal",
"pub_title",
"pub_author",
"trait_reported",
"ancestry_initial",
"ancestry_replication",
"n_initial",
"n_cases",
"n_replication",
]

manifest = manifest.filter(items=cols).drop_duplicates()

# Assert trait_reported is not empty
assert manifest["trait_reported"].notna().all(), "There are studies where the trait is not defined."

# Write
manifest.to_json(args.output, orient="records", lines=True)
Expand All @@ -79,6 +83,30 @@ def to_int_safe(i):
return None


def make_trait_reported_string(s_raw: str) -> str:
    """Convert a raw trait name into the reported-trait format.

    The transformation is applied in this order:

    1. Every run of spaces is collapsed into a single space.
    2. Names containing "|" are rejected (presumably because " | " is the
       separator inserted in step 3 -- confirm with downstream consumers).
    3. A name of the form "<prefix>: <trait>" is split on the first ": "
       and reordered to "<trait> | <prefix>".
    4. ``str.capitalize`` is applied to the result.

    Args:
        s_raw: Raw trait name.

    Returns:
        The transformed trait name.

    Raises:
        AssertionError: If the name contains "|".
    """
    # Replace any repeated spaces with a single one
    collapsed = re.sub(r' +', r' ', s_raw)

    # Raised explicitly instead of via an ``assert`` statement so the check
    # is not stripped when Python runs with -O; the exception type and
    # message are the same as before.
    if "|" in collapsed:
        raise AssertionError(f"Reported trait ({collapsed}), contains invalid character.")

    # Split off the prefix at the first ": " and, if present, move it to the end
    prefix, separator, remainder = collapsed.partition(': ')
    trait = f"{remainder} | {prefix}" if separator else collapsed

    # NOTE: str.capitalize upper-cases the first character AND lower-cases
    # all the others (e.g. "ICD10" becomes "icd10"). Kept as is so the
    # generated trait names do not change.
    return trait.capitalize()


def parse_args():
"""Load command line args"""
parser = argparse.ArgumentParser()
Expand Down
8 changes: 6 additions & 2 deletions scripts/make_disease_mapping_lut.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def main(
.drop('proposed_efos', axis=1)
.explode('trait_efos')
)

assert len(genetics_mappings) == (
len(gwas_catalog_mappings.explode('trait_efos'))
+ len(valid_ukb.explode('proposed_efos'))
Expand All @@ -55,7 +54,7 @@ def main(
# A study/trait can be mapped to multiple EFOs, each with a different set of therapeutic areas.
# All the therapeutic areas and EFOs are collected into the same column. The most significant TA
# per study is extracted. The result of collecting these is a multidimensional array that must be flattened.
.groupby(['study_id', 'trait_reported'])
.groupby(['study_id', 'trait_reported'], dropna=False)
.agg({'therapeutic_areas': list, 'trait_efos': list})
.reset_index()
)
Expand All @@ -77,6 +76,11 @@ def main(
genetics_mappings_w_ta['study_id'].unique()
), 'WARNING! There are duplicated studies.'

# Assert no studies are lost in the process of adding the TA
assert len(genetics_mappings_w_ta.study_id.unique()) == len(
genetics_mappings.study_id.unique()
), 'WARNING! Some studies were lost in the process of adding the TA.'

# 4. Format and write output
genetics_mappings_w_ta.to_parquet(output_path)
logging.info(f'{output_path} successfully generated. Exiting.')
Expand Down

0 comments on commit 8b9180e

Please sign in to comment.