Merge pull request opentargets#22 from opentargets/il-ukb-fix

V2D Traits handling bug fix
thehyve · Jul 8, 2022 · 8b9180e · 8b9180e
2 parents 5f9c614 + 4284ee2
commit 8b9180e
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 24 deletions.
diff --git a/scripts/make_UKB_study_table.py b/scripts/make_UKB_study_table.py
@@ -1,15 +1,11 @@
 """
 Processes UK Biobank's manifest to extract all studies and their metadata in the OTG format.
 """
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Ed Mountjoy
-#
 
 import argparse
 import logging
 from collections import OrderedDict
+import re
 
 import pandas as pd
 
@@ -19,7 +15,9 @@ def main(input_path: str, output_path: str) -> None:
  logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
 
  # Only keep required cols
- to_keep = OrderedDict([("code", "study_id"), ("n_total", "n_total"), ("n_cases", "n_cases")])
+ to_keep = OrderedDict(
+ [("code", "study_id"), ('trait', 'trait_raw'), ("n_total", "n_total"), ("n_cases", "n_cases")]
+ )
 
  # Load manifest
  manifest = pd.read_csv(input_path, sep="\t", header=0, dtype=object).filter(items=to_keep).rename(columns=to_keep)
@@ -30,6 +28,9 @@ def main(input_path: str, output_path: str) -> None:
  # Add other columns -------------------------------------------------------
  #
 
+ # Process traits to a nicer format
+ manifest["trait_reported"] = manifest['trait_raw'].apply(make_trait_reported_string)
+
  # Vector to get Neale or SAIGE studies
  is_neale = manifest["study_id"].str.startswith("NEALE2_")
  is_saige = manifest["study_id"].str.startswith("SAIGE_")
@@ -50,22 +51,25 @@ def main(input_path: str, output_path: str) -> None:
  manifest.loc[:, "ancestry_replication"] = ""
 
  # Output required columns
- cols = OrderedDict(
- [
- ("study_id", "study_id"),
- ("pmid", "pmid"),
- ("pub_date", "pub_date"),
- ("pub_journal", "pub_journal"),
- ("pub_title", "pub_title"),
- ("pub_author", "pub_author"),
- ("ancestry_initial", "ancestry_initial"),
- ("ancestry_replication", "ancestry_replication"),
- ("n_initial", "n_initial"),
- ("n_cases", "n_cases"),
- ("n_replication", "n_replication"),
- ]
- )
- manifest = manifest.loc[:, list(cols.keys())].rename(columns=cols)
+ cols = [
+ "study_id",
+ "pmid",
+ "pub_date",
+ "pub_journal",
+ "pub_title",
+ "pub_author",
+ "trait_reported",
+ "ancestry_initial",
+ "ancestry_replication",
+ "n_initial",
+ "n_cases",
+ "n_replication",
+ ]
+
+ manifest = manifest.filter(items=cols).drop_duplicates()
+
+ # Assert trait_reported is not empty
+ assert manifest["trait_reported"].notna().all(), "There are studies where the trait is not defined."
 
  # Write
  manifest.to_json(args.output, orient="records", lines=True)
@@ -79,6 +83,30 @@ def to_int_safe(i):
  return None
 
 
def make_trait_reported_string(s_raw):
    """Convert a raw manifest trait name into the reported-trait format.

    The transformation is:

    1. Runs of consecutive spaces are collapsed to a single space.
    2. If the name contains a ``": "`` separator, the text before the first
       separator (the prefix) is moved to the end, joined with ``" | "``.
    3. ``str.capitalize`` is applied, which upper-cases the first character
       AND lower-cases every other character of the result.

    Args:
        s_raw: Raw trait name. Missing values (``None`` / NaN) are returned
            unchanged so the caller can detect and report them instead of
            failing here with an unrelated ``TypeError``.

    Returns:
        The transformed trait name, or ``s_raw`` itself when it is missing.

    Raises:
        AssertionError: If the raw name already contains ``"|"``, which would
            be ambiguous with the separator inserted in step 2.
    """
    # Missing values pass straight through; only strings can be transformed.
    if not isinstance(s_raw, str) and pd.isna(s_raw):
        return s_raw

    # Collapse any run of two or more spaces into a single space
    s_raw = re.sub(r" {2,}", " ", s_raw)

    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if "|" in s_raw:
        raise AssertionError(f"Reported trait ({s_raw}), contains invalid character.")

    # Split off the prefix at the first ": " only; later ones stay in the name
    prefix, separator, remainder = s_raw.partition(": ")

    # Move prefix to end if exists
    trait = f"{remainder} | {prefix}" if separator else s_raw

    # NOTE: capitalize() lower-cases everything after the first character,
    # so acronyms such as "ICD10" become "icd10".
    return trait.capitalize()
+
+
 def parse_args():
  """Load command line args"""
  parser = argparse.ArgumentParser()

diff --git a/scripts/make_disease_mapping_lut.py b/scripts/make_disease_mapping_lut.py
@@ -36,7 +36,6 @@ def main(
  .drop('proposed_efos', axis=1)
  .explode('trait_efos')
  )
-
  assert len(genetics_mappings) == (
  len(gwas_catalog_mappings.explode('trait_efos'))
  + len(valid_ukb.explode('proposed_efos'))
@@ -55,7 +54,7 @@ def main(
  # A study/trait can be mapped to multiple EFOs, each with a different set of therapeutic areas.
  # All the therapeutic areas and EFOs are collected into the same column. The most significant TA
  # per study is extracted. The result of collecting these is a multidimensional array that must be flattened.
- .groupby(['study_id', 'trait_reported'])
+ .groupby(['study_id', 'trait_reported'], dropna=False)
  .agg({'therapeutic_areas': list, 'trait_efos': list})
  .reset_index()
  )
@@ -77,6 +76,11 @@ def main(
  genetics_mappings_w_ta['study_id'].unique()
  ), 'WARNING! There are duplicated studies.'
 
+ # Assert no studies are lost in the process of adding the TA
+ assert len(genetics_mappings_w_ta.study_id.unique()) == len(
+ genetics_mappings.study_id.unique()
+ ), 'WARNING! Some studies were lost in the process of adding the TA.'
+
  # 4. Format and write output
  genetics_mappings_w_ta.to_parquet(output_path)
  logging.info(f'{output_path} successfully generated. Exiting.')