From 85521c41801aa628bf7bdda28e6459af61559ad2 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 22 Feb 2022 12:28:27 -0800 Subject: [PATCH 1/4] attach estimate biomass to api --- q2_katharoseq/_format.py | 22 +++++++++++++++ q2_katharoseq/_transformer.py | 16 +++++++++++ q2_katharoseq/_type.py | 3 ++ q2_katharoseq/plugin_setup.py | 52 ++++++++++++++++++++++++++++++++++- 4 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 q2_katharoseq/_format.py create mode 100644 q2_katharoseq/_transformer.py create mode 100644 q2_katharoseq/_type.py diff --git a/q2_katharoseq/_format.py b/q2_katharoseq/_format.py new file mode 100644 index 0000000..77e1cb9 --- /dev/null +++ b/q2_katharoseq/_format.py @@ -0,0 +1,22 @@ +import qiime2.plugin.model as model + + +STATS_HEADER = ['sample-id', + 'log_total_reads', + 'estimated_biomass_per_pcrrxn', + 'estimated_biomass_per_dnarxn', + 'extraction_mass_g', + 'estimated_cells_per_g', + 'log_estimated_cells_per_g'] + + +class EstimatedBiomassFmt(model.TextFileFormat): + def sniff(self): + line = open(str(self)).readline() + hdr = line.strip().split(',') + + return hdr == STATS_HEADER + + +EstimatedBiomassDirFmt = model.SingleFileDirectoryFormat( + 'EstimatedBiomassDirFmt', 'est_biomass.csv', EstimatedBiomassFmt) diff --git a/q2_katharoseq/_transformer.py b/q2_katharoseq/_transformer.py new file mode 100644 index 0000000..9c4c198 --- /dev/null +++ b/q2_katharoseq/_transformer.py @@ -0,0 +1,16 @@ +import pandas as pd + +from .plugin_setup import plugin +from ._format import EstimatedBiomassFmt + + +@plugin.register_transformer +def _1(data: pd.DataFrame) -> EstimatedBiomassFmt: + ff = EstimatedBiomassFmt() + data.to_csv(str(ff)) + return ff + + +@plugin.register_transformer +def _2(ff: EstimatedBiomassFmt) -> pd.DataFrame: + return pd.read_csv(str(ff), index_col='sample-id') diff --git a/q2_katharoseq/_type.py b/q2_katharoseq/_type.py new file mode 100644 index 0000000..8d97947 --- /dev/null +++ b/q2_katharoseq/_type.py @@ -0,0 +1,3 @@ +from qiime2.plugin import SemanticType + +EstimatedBiomass = SemanticType('EstimatedBiomass') diff --git a/q2_katharoseq/plugin_setup.py b/q2_katharoseq/plugin_setup.py index c40da7b..1ce8663 100644 --- a/q2_katharoseq/plugin_setup.py +++ b/q2_katharoseq/plugin_setup.py @@ -1,8 +1,11 @@ +import importlib from qiime2.plugin import (Plugin, Citations, Str, Int, MetadataColumn, Categorical, Numeric, Choices) from q2_types.feature_table import (FeatureTable, Frequency) -from . import read_count_threshold +from . import read_count_threshold, estimating_biomass import q2_katharoseq +from q2_katharoseq._type import EstimatedBiomass +from q2_katharoseq._format import EstimatedBiomassFmt, EstimatedBiomassDirFmt citations = Citations.load('citations.bib', package='q2_katharoseq') @@ -19,6 +22,12 @@ ) +plugin.register_formats(EstimatedBiomassFmt, EstimatedBiomassDirFmt) +plugin.register_semantic_types(EstimatedBiomass) +plugin.register_semantic_type_to_format(EstimatedBiomass, + artifact_format=EstimatedBiomassDirFmt) + + plugin.visualizers.register_function( function=read_count_threshold, inputs={ @@ -60,3 +69,44 @@ 'positive signal in samples with as few as 50 to 500 cells.', citations=[citations['minich2018']] ) + + +plugin.methods.register_function( + function=estimating_biomass, + inputs={}, + parameters={'total_reads': MetadataColumn[Numeric], + 'control_cell_extraction': MetadataColumn[Numeric], + 'positive_control_column': MetadataColumn[Categorical], + 'positive_control_value': Str, + 'extraction_mass_g': MetadataColumn[Categorical], + 'min_total_reads': Int, + 'pcr_template_vol': Int, + 'dna_extract_vol': Int}, + outputs=[('estimated_biomass', EstimatedBiomass)], + input_descriptions={}, + parameter_descriptions={ + 'total_reads': 'The total reads present in each sample.', + 'control_cell_extraction': 'The number of cells in the controls.', + 'positive_control_column': ( + 'The column in the sample metadata that describes which samples ' + 'are and are not controls.'), + 'positive_control_value': ( + 'The value in the control column that demarks which samples are ' + 'the positive controls.'), + 'extraction_mass_g': ( + 'The column in the sample metadata that describes the extraction ' + 'mass for the controls'), + 'min_total_reads': 'The minimum threshold to apply.', + 'pcr_template_vol': 'The PCR template volume.', + 'dna_extract_vol': 'The DNA extraction volume.'}, + output_descriptions={ + 'estimated_biomass': ( + 'A dataframe containing the details on estimated biomass') + }, + name='Estimate the biomass of samples using KatharoSeq controls.', + description='Estimate the biomass of samples using KatharoSeq controls.', + citations=[] +) + + +importlib.import_module('q2_katharoseq._transformer') From ab9e6160129231ca926403e8da64af28c2e567d7 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 22 Feb 2022 12:42:04 -0800 Subject: [PATCH 2/4] update test to use qiime2.Metadata --- .../input_estimating_biomass.tsv | 1 + q2_katharoseq/tests/test_method.py | 19 +++++-------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv index a3002fc..b58cd50 100644 --- a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv +++ b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv @@ -1,4 +1,5 @@ sample_name total_reads control_cell_into_extraction extraction_mass_g positive_control +#q2:types numeric numeric categorical categorical 13414.plate1.h9 4 False 13414.plate1.b10 6 False 13414.plate1.e12 6 False diff --git a/q2_katharoseq/tests/test_method.py b/q2_katharoseq/tests/test_method.py index 0db6069..4137dee 100644 --- a/q2_katharoseq/tests/test_method.py +++ b/q2_katharoseq/tests/test_method.py @@ -166,26 +166,17 @@ def test_threshold(self): def test_estimating_biomass(self): fp = join(dirname(abspath(getfile(currentframe()))), 'support_files') - data = pd.read_csv( - f'{fp}/input_estimating_biomass.tsv', sep='\t', dtype={ - 'sample_name': str, 'total_reads': float, - 'control_cell_into_extraction': float, - 'extraction_mass_g': float, - 'positive_control': str}) - data.set_index('sample_name', inplace=True) + data = qiime2.Metadata.load(f'{fp}/input_estimating_biomass.tsv') obs = estimating_biomass( - total_reads=qiime2.NumericMetadataColumn(data['total_reads']), - control_cell_extraction=qiime2.NumericMetadataColumn( - data['control_cell_into_extraction']), + total_reads=data.get_column('total_reads'), + control_cell_extraction=data.get_column('control_cell_into_extraction'), # noqa min_total_reads=1150, positive_control_value='True', - positive_control_column=qiime2.CategoricalMetadataColumn( - data['positive_control']), + positive_control_column=data.get_column('positive_control'), pcr_template_vol=5, dna_extract_vol=60, - extraction_mass_g=qiime2.NumericMetadataColumn( - data['extraction_mass_g']) + extraction_mass_g=data.get_column('extraction_mass_g') ) exp = pd.read_csv( f'{fp}/output_estimating_biomass.tsv', sep='\t', index_col=0) From af87eeb7be8576ca7291e73e3ea8a38b326608d8 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 22 Feb 2022 12:45:11 -0800 Subject: [PATCH 3/4] fix q2type --- q2_katharoseq/_methods.py | 2 +- q2_katharoseq/tests/support_files/input_estimating_biomass.tsv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_katharoseq/_methods.py b/q2_katharoseq/_methods.py index 5bd63ca..4fbefa0 100644 --- a/q2_katharoseq/_methods.py +++ b/q2_katharoseq/_methods.py @@ -156,7 +156,7 @@ def estimating_biomass( positive_control_column: qiime2.CategoricalMetadataColumn, pcr_template_vol: int, dna_extract_vol: int, - extraction_mass_g: qiime2.CategoricalMetadataColumn) -> pd.DataFrame: + extraction_mass_g: qiime2.NumericMetadataColumn) -> pd.DataFrame: total_reads = total_reads.to_series() filtered = pd.DataFrame(total_reads[total_reads > min_total_reads]) diff --git a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv index b58cd50..3453c7f 100644 --- a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv +++ b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv @@ -1,5 +1,5 @@ sample_name total_reads control_cell_into_extraction extraction_mass_g positive_control -#q2:types numeric numeric categorical categorical +#q2:types numeric numeric numeric categorical 13414.plate1.h9 4 False 13414.plate1.b10 6 False 13414.plate1.e12 6 False From 6f6434b6bce68ebc4bc907338a39654218bdfe69 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 22 Feb 2022 17:16:08 -0800 Subject: [PATCH 4/4] use strings from @jminich444 --- q2_katharoseq/plugin_setup.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/q2_katharoseq/plugin_setup.py b/q2_katharoseq/plugin_setup.py index 1ce8663..6c7d98f 100644 --- a/q2_katharoseq/plugin_setup.py +++ b/q2_katharoseq/plugin_setup.py @@ -78,15 +78,22 @@ 'control_cell_extraction': MetadataColumn[Numeric], 'positive_control_column': MetadataColumn[Categorical], 'positive_control_value': Str, - 'extraction_mass_g': MetadataColumn[Categorical], + 'extraction_mass_g': MetadataColumn[Numeric], 'min_total_reads': Int, 'pcr_template_vol': Int, 'dna_extract_vol': Int}, outputs=[('estimated_biomass', EstimatedBiomass)], input_descriptions={}, parameter_descriptions={ - 'total_reads': 'The total reads present in each sample.', - 'control_cell_extraction': 'The number of cells in the controls.', + 'total_reads': 'The total sum of the reads or ASVs for each sample.', + 'control_cell_extraction': ( + 'The estimated number of cells or genomes used as input to your ' + 'library prep. One may typically estimate this by determining the ' + 'total number of cells from a stock solution used to make ' + 'standard titrations. Each titration will have an estimated ' + 'number of microbial cells put into the extraction. The final ' + 'estimate will depend on the elution volume and the final volume ' + 'used into the library prep (e.g. 16S PCR).'), 'positive_control_column': ( 'The column in the sample metadata that describes which samples ' 'are and are not controls.'), @@ -94,11 +101,15 @@ 'The value in the control column that demarks which samples are ' 'the positive controls.'), 'extraction_mass_g': ( - 'The column in the sample metadata that describes the extraction ' - 'mass for the controls'), + 'The column in the sample metadata that describes the sample ' + '(e.g. stool, tissue, soil, etc) mass (in grams - typically ' + 'converted from mg)'), 'min_total_reads': 'The minimum threshold to apply.', - 'pcr_template_vol': 'The PCR template volume.', - 'dna_extract_vol': 'The DNA extraction volume.'}, + 'pcr_template_vol': ( + 'The volume of DNA used as template in the ' + 'library prep (PCR reaction)'), + 'dna_extract_vol': ( + 'The final elution volume used during DNA extraction')}, output_descriptions={ 'estimated_biomass': ( 'A dataframe containing the details on estimated biomass')