Skip to content

Commit

Permalink
Added validation test for alphanumeric generic names with suffixed La…
Browse files Browse the repository at this point in the history
…tin specific names.
  • Loading branch information
donovan-h-parks committed Nov 8, 2021
1 parent 57019bf commit eed48f6
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 11 deletions.
59 changes: 51 additions & 8 deletions gtdb_species_clusters/pmc_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
is_placeholder_sp_epithet,
is_latin_sp_epithet,
is_suffixed_taxon,
is_alphanumeric_taxon,
is_suffixed_sp_epithet,
taxon_type,
specific_epithet_type,
test_same_epithet,
Expand Down Expand Up @@ -1511,7 +1513,7 @@ def validate_parent_child_placeholder_names(self, final_taxonomy):
This test primarily aims to catch simply typos where the parent and child
should in fact have identical names.
Example: f__JDFR-13 conflicts with g__JdFR-13
"""

Expand Down Expand Up @@ -1539,7 +1541,7 @@ def validate_placeholder_names_not_latin(self, final_taxonomy, lpsn):
is challenging even though they are often obvious to humans (e.g. g__Notlatin).
Right now, this test simply looks for names under 5 characters that appear to be
Latin under the assumption this is too short for valid Latin names.
Example: g__Gsub
"""

Expand All @@ -1552,14 +1554,45 @@ def validate_placeholder_names_not_latin(self, final_taxonomy, lpsn):

if not is_placeholder_taxon(taxon) and len(taxon[3:]) <= 5:
# Latin-looking name that is under 6 characters
invalid_name[taxon] = (taxon,)
invalid_name[taxon] = (gid, taxon)
else:
validated_count += 1

self.report_validation(
invalid_name,
validated_count,
" - identified {:,} GTDB Latin-looking names that are short and may in fact be placehold names (GTDB Taxon):".format(
len(invalid_name)))

def validate_sp_with_alphanumeric_generic_names(self, final_taxonomy):
    """Validate species with alphanumeric generic names do not have suffixed Latin specific names.

    A genome is reported as invalid when its genus satisfies
    `is_alphanumeric_taxon` and its specific name satisfies
    `is_suffixed_sp_epithet`; every other genome is counted as validated.
    The results are passed to `self.report_validation`.

    Examples of species names reported as invalid:
      s__DTPE01 lauensis_A
      s__DRZC01 fontis_A

    :param final_taxonomy: Mapping from genome ID to a sequence of taxa that is
                           indexed with Taxonomy.GENUS_INDEX and Taxonomy.SPECIES_INDEX.
    :raises AssertionError: If a species name does not begin with its generic name.
    """

    invalid_name = {}
    validated_count = 0
    for gid, taxa in final_taxonomy.items():
        genus = taxa[Taxonomy.GENUS_INDEX]
        species = taxa[Taxonomy.SPECIES_INDEX]
        specific = specific_epithet(species)

        # the generic name of the species must agree with the genus assignment;
        # identify the offending genome so a failure can be traced
        assert species.startswith(genus.replace('g__', 's__')), (
            f'Species name does not start with generic name for {gid}: '
            f'{genus}, {species}')

        if is_alphanumeric_taxon(genus) and is_suffixed_sp_epithet(specific):
            # alphanumeric (placeholder) generic name combined with a
            # suffixed Latin specific name
            invalid_name[gid] = (gid, species)
        else:
            validated_count += 1

    self.report_validation(
        invalid_name,
        validated_count,
        " - identified {:,} GTDB species names with an alphanumeric generic name and suffixed Latin specific name:".format(
            len(invalid_name)))

def validate_suffix_of_specific_names(self, final_taxonomy, cur_genomes, lpsn):
"""Validate suffix of specific names by comparison with LPSN names.
Expand Down Expand Up @@ -2438,15 +2471,25 @@ def run(self,
report_errors=True)

# validate parent-child placeholder names (e.g. f__JDFR-13 conflicts with g__JdFR-13)
self.logger.info('Validating parent-child placeholder names for simple typos.')
self.validate_parent_child_placeholder_names(final_taxonomy)

# Note: It was decided that this capitalization inconsistency is acceptable in order
# to avoid replacing long-standing placeholder names with slight modifications.
#self.logger.info('Validating parent-child placeholder names for simple typos.')
# self.validate_parent_child_placeholder_names(final_taxonomy)

# validate placeholder names that might be mistaken as Latin names (e.g., g__Gsub)
self.logger.info('Validating placeholder names that might be mistaken as Latin names.')
self.logger.info(
'Validating placeholder names that might be mistaken as Latin names.')
self.validate_placeholder_names_not_latin(final_taxonomy, lpsn)

# validate alphanumeric generic names do not have suffixed Latin specific names
# (e.g., s__DTPE01 lauensis_A is considered invalid)
self.logger.info(
'Validating species with alphanumeric generic names do not have suffixed Latin specific names.')
self.validate_sp_with_alphanumeric_generic_names(final_taxonomy)

# validate suffix of specific names by looking for small deviations relative to LPSN names
self.logger.info("Validating suffix of specific names.")
self.logger.info(
"Validating suffix of specific names by identifying small deviations from LPSN names.")
self.validate_suffix_of_specific_names(
final_taxonomy, cur_genomes, lpsn)

Expand Down
5 changes: 2 additions & 3 deletions gtdb_species_clusters/species_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,7 @@ def create_expanded_clusters(self,
assert(not self.new_gids and not self.updated_gids)

# read GTDB-Tk classifications for new and updated genomes
gtdbtk_classifications = read_gtdbtk_classifications(
gtdbtk_classify_file)
gtdbtk_classifications = read_gtdbtk_classifications(gtdbtk_classify_file)
self.logger.info(
f' - identified {len(gtdbtk_classifications):,} classifications')

Expand Down Expand Up @@ -199,7 +198,7 @@ def create_expanded_clusters(self,
if sp not in orig_sp_rid_map:
self.logger.error(
f'GTDB-Tk results indicated a new species for {gid}: {sp}')
sys.exit(-1)
#sys.exit(-1)

orig_rid = orig_sp_rid_map[sp]
if gid in self.new_gids:
Expand Down

0 comments on commit eed48f6

Please sign in to comment.