From 85521c41801aa628bf7bdda28e6459af61559ad2 Mon Sep 17 00:00:00 2001
From: Daniel McDonald <danielmcdonald@ucsd.edu>
Date: Tue, 22 Feb 2022 12:28:27 -0800
Subject: [PATCH 1/4] attach estimate biomass to api

---
 q2_katharoseq/_format.py      | 22 +++++++++++++++
 q2_katharoseq/_transformer.py | 16 +++++++++++
 q2_katharoseq/_type.py        |  3 ++
 q2_katharoseq/plugin_setup.py | 52 ++++++++++++++++++++++++++++++++++-
 4 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 q2_katharoseq/_format.py
 create mode 100644 q2_katharoseq/_transformer.py
 create mode 100644 q2_katharoseq/_type.py

diff --git a/q2_katharoseq/_format.py b/q2_katharoseq/_format.py
new file mode 100644
index 0000000..77e1cb9
--- /dev/null
+++ b/q2_katharoseq/_format.py
@@ -0,0 +1,22 @@
+import qiime2.plugin.model as model
+
+# Column names, in order, that the first line of the CSV must contain.
+STATS_HEADER = ['sample-id',
+                'log_total_reads',
+                'estimated_biomass_per_pcrrxn',
+                'estimated_biomass_per_dnarxn',
+                'extraction_mass_g',
+                'estimated_cells_per_g',
+                'log_estimated_cells_per_g']
+
+
+class EstimatedBiomassFmt(model.TextFileFormat):
+    def sniff(self):
+        # Valid only if the first line is exactly the expected CSV header.
+        with open(str(self)) as fh:
+            hdr = fh.readline().strip().split(',')
+        return hdr == STATS_HEADER
+
+
+EstimatedBiomassDirFmt = model.SingleFileDirectoryFormat(
+    'EstimatedBiomassDirFmt', 'est_biomass.csv', EstimatedBiomassFmt)
diff --git a/q2_katharoseq/_transformer.py b/q2_katharoseq/_transformer.py
new file mode 100644
index 0000000..9c4c198
--- /dev/null
+++ b/q2_katharoseq/_transformer.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+from .plugin_setup import plugin
+from ._format import EstimatedBiomassFmt
+
+
+@plugin.register_transformer
+def _1(data: pd.DataFrame) -> EstimatedBiomassFmt:
+    ff = EstimatedBiomassFmt()
+    data.to_csv(str(ff), index_label='sample-id')  # matches STATS_HEADER[0]
+    return ff
+
+
+@plugin.register_transformer
+def _2(ff: EstimatedBiomassFmt) -> pd.DataFrame:
+    return pd.read_csv(str(ff), index_col='sample-id')  # index by sample-id
diff --git a/q2_katharoseq/_type.py b/q2_katharoseq/_type.py
new file mode 100644
index 0000000..8d97947
--- /dev/null
+++ b/q2_katharoseq/_type.py
@@ -0,0 +1,3 @@
+from qiime2.plugin import SemanticType
+
+EstimatedBiomass = SemanticType('EstimatedBiomass')
diff --git a/q2_katharoseq/plugin_setup.py b/q2_katharoseq/plugin_setup.py
index c40da7b..1ce8663 100644
--- a/q2_katharoseq/plugin_setup.py
+++ b/q2_katharoseq/plugin_setup.py
@@ -1,8 +1,11 @@
+import importlib
 from qiime2.plugin import (Plugin, Citations, Str, Int,
                            MetadataColumn, Categorical, Numeric, Choices)
 from q2_types.feature_table import (FeatureTable, Frequency)
-from . import read_count_threshold
+from . import read_count_threshold, estimating_biomass
 import q2_katharoseq
+from q2_katharoseq._type import EstimatedBiomass
+from q2_katharoseq._format import EstimatedBiomassFmt, EstimatedBiomassDirFmt
 
 
 citations = Citations.load('citations.bib', package='q2_katharoseq')
@@ -19,6 +22,12 @@
 )
 
 
+plugin.register_formats(EstimatedBiomassFmt, EstimatedBiomassDirFmt)
+plugin.register_semantic_types(EstimatedBiomass)
+plugin.register_semantic_type_to_format(EstimatedBiomass,
+                                        artifact_format=EstimatedBiomassDirFmt)
+
+
 plugin.visualizers.register_function(
     function=read_count_threshold,
     inputs={
@@ -60,3 +69,44 @@
                 'positive signal in samples with as few as 50 to 500 cells.',
     citations=[citations['minich2018']]
 )
+
+
+plugin.methods.register_function(
+    function=estimating_biomass,
+    inputs={},
+    parameters={'total_reads': MetadataColumn[Numeric],
+                'control_cell_extraction': MetadataColumn[Numeric],
+                'positive_control_column': MetadataColumn[Categorical],
+                'positive_control_value': Str,
+                'extraction_mass_g': MetadataColumn[Categorical],
+                'min_total_reads': Int,
+                'pcr_template_vol': Int,
+                'dna_extract_vol': Int},
+    outputs=[('estimated_biomass', EstimatedBiomass)],
+    input_descriptions={},
+    parameter_descriptions={
+        'total_reads': 'The total reads present in each sample.',
+        'control_cell_extraction': 'The number of cells in the controls.',
+        'positive_control_column': (
+            'The column in the sample metadata that describes which samples '
+            'are and are not controls.'),
+        'positive_control_value': (
+            'The value in the control column that demarks which samples are '
+            'the positive controls.'),
+        'extraction_mass_g': (
+            'The column in the sample metadata that describes the extraction '
+            'mass for the controls'),
+        'min_total_reads': 'The minimum threshold to apply.',
+        'pcr_template_vol': 'The PCR template volume.',
+        'dna_extract_vol': 'The DNA extraction volume.'},
+    output_descriptions={
+        'estimated_biomass': (
+            'A dataframe containing the details on estimated biomass')
+        },
+    name='Estimate the biomass of samples using KatharoSeq controls.',
+    description='Estimate the biomass of samples using KatharoSeq controls.',
+    citations=[]
+)
+
+
+importlib.import_module('q2_katharoseq._transformer')

From ab9e6160129231ca926403e8da64af28c2e567d7 Mon Sep 17 00:00:00 2001
From: Daniel McDonald <danielmcdonald@ucsd.edu>
Date: Tue, 22 Feb 2022 12:42:04 -0800
Subject: [PATCH 2/4] update test to use qiime2.Metadata

---
 .../input_estimating_biomass.tsv              |  1 +
 q2_katharoseq/tests/test_method.py            | 19 +++++--------------
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
index a3002fc..b58cd50 100644
--- a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
+++ b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
@@ -1,4 +1,5 @@
 sample_name	total_reads	control_cell_into_extraction	extraction_mass_g	positive_control
+#q2:types	numeric	numeric	categorical	categorical
 13414.plate1.h9	4			False
 13414.plate1.b10	6			False
 13414.plate1.e12	6			False
diff --git a/q2_katharoseq/tests/test_method.py b/q2_katharoseq/tests/test_method.py
index 0db6069..4137dee 100644
--- a/q2_katharoseq/tests/test_method.py
+++ b/q2_katharoseq/tests/test_method.py
@@ -166,26 +166,17 @@ def test_threshold(self):
 
     def test_estimating_biomass(self):
         fp = join(dirname(abspath(getfile(currentframe()))), 'support_files')
-        data = pd.read_csv(
-            f'{fp}/input_estimating_biomass.tsv', sep='\t', dtype={
-                'sample_name': str, 'total_reads': float,
-                'control_cell_into_extraction': float,
-                'extraction_mass_g': float,
-                'positive_control': str})
-        data.set_index('sample_name', inplace=True)
+        data = qiime2.Metadata.load(f'{fp}/input_estimating_biomass.tsv')
 
         obs = estimating_biomass(
-            total_reads=qiime2.NumericMetadataColumn(data['total_reads']),
-            control_cell_extraction=qiime2.NumericMetadataColumn(
-                data['control_cell_into_extraction']),
+            total_reads=data.get_column('total_reads'),
+            control_cell_extraction=data.get_column('control_cell_into_extraction'),  # noqa
             min_total_reads=1150,
             positive_control_value='True',
-            positive_control_column=qiime2.CategoricalMetadataColumn(
-                data['positive_control']),
+            positive_control_column=data.get_column('positive_control'),
             pcr_template_vol=5,
             dna_extract_vol=60,
-            extraction_mass_g=qiime2.NumericMetadataColumn(
-                data['extraction_mass_g'])
+            extraction_mass_g=data.get_column('extraction_mass_g')
         )
         exp = pd.read_csv(
             f'{fp}/output_estimating_biomass.tsv', sep='\t', index_col=0)

From af87eeb7be8576ca7291e73e3ea8a38b326608d8 Mon Sep 17 00:00:00 2001
From: Daniel McDonald <danielmcdonald@ucsd.edu>
Date: Tue, 22 Feb 2022 12:45:11 -0800
Subject: [PATCH 3/4] fix q2type

---
 q2_katharoseq/_methods.py                                      | 2 +-
 q2_katharoseq/tests/support_files/input_estimating_biomass.tsv | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/q2_katharoseq/_methods.py b/q2_katharoseq/_methods.py
index 5bd63ca..4fbefa0 100644
--- a/q2_katharoseq/_methods.py
+++ b/q2_katharoseq/_methods.py
@@ -156,7 +156,7 @@ def estimating_biomass(
         positive_control_column: qiime2.CategoricalMetadataColumn,
         pcr_template_vol: int,
         dna_extract_vol: int,
-        extraction_mass_g: qiime2.CategoricalMetadataColumn) -> pd.DataFrame:
+        extraction_mass_g: qiime2.NumericMetadataColumn) -> pd.DataFrame:
 
     total_reads = total_reads.to_series()
     filtered = pd.DataFrame(total_reads[total_reads > min_total_reads])
diff --git a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
index b58cd50..3453c7f 100644
--- a/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
+++ b/q2_katharoseq/tests/support_files/input_estimating_biomass.tsv
@@ -1,5 +1,5 @@
 sample_name	total_reads	control_cell_into_extraction	extraction_mass_g	positive_control
-#q2:types	numeric	numeric	categorical	categorical
+#q2:types	numeric	numeric	numeric	categorical
 13414.plate1.h9	4			False
 13414.plate1.b10	6			False
 13414.plate1.e12	6			False

From 6f6434b6bce68ebc4bc907338a39654218bdfe69 Mon Sep 17 00:00:00 2001
From: Daniel McDonald <danielmcdonald@ucsd.edu>
Date: Tue, 22 Feb 2022 17:16:08 -0800
Subject: [PATCH 4/4] use strings from @jminich444

---
 q2_katharoseq/plugin_setup.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/q2_katharoseq/plugin_setup.py b/q2_katharoseq/plugin_setup.py
index 1ce8663..6c7d98f 100644
--- a/q2_katharoseq/plugin_setup.py
+++ b/q2_katharoseq/plugin_setup.py
@@ -78,15 +78,22 @@
                 'control_cell_extraction': MetadataColumn[Numeric],
                 'positive_control_column': MetadataColumn[Categorical],
                 'positive_control_value': Str,
-                'extraction_mass_g': MetadataColumn[Categorical],
+                'extraction_mass_g': MetadataColumn[Numeric],
                 'min_total_reads': Int,
                 'pcr_template_vol': Int,
                 'dna_extract_vol': Int},
     outputs=[('estimated_biomass', EstimatedBiomass)],
     input_descriptions={},
     parameter_descriptions={
-        'total_reads': 'The total reads present in each sample.',
-        'control_cell_extraction': 'The number of cells in the controls.',
+        'total_reads': 'The total sum of the reads or ASVs for each sample.',
+        'control_cell_extraction': (
+            'The estimated number of cells or genomes used as input to your '
+            'library prep. One may typically estimate this by determining the '
+            'total number of cells from a stock solution used to make '
+            'standard titrations. Each titration will have an estimated '
+            'number of microbial cells put into the extraction. The final '
+            'estimate will depend on the elution volume and the final volume '
+            'used into the library prep (e.g. 16S PCR).'),
         'positive_control_column': (
             'The column in the sample metadata that describes which samples '
             'are and are not controls.'),
@@ -94,11 +101,15 @@
             'The value in the control column that demarks which samples are '
             'the positive controls.'),
         'extraction_mass_g': (
-            'The column in the sample metadata that describes the extraction '
-            'mass for the controls'),
+            'The column in the sample metadata that describes the sample '
+            '(e.g. stool, tissue, soil, etc) mass (in grams - typically '
+            'converted from mg)'),
         'min_total_reads': 'The minimum threshold to apply.',
-        'pcr_template_vol': 'The PCR template volume.',
-        'dna_extract_vol': 'The DNA extraction volume.'},
+        'pcr_template_vol': (
+            'The volume of DNA used as template in the '
+            'library prep (PCR reaction)'),
+        'dna_extract_vol': (
+            'The final elution volume used during DNA extraction')},
     output_descriptions={
         'estimated_biomass': (
             'A dataframe containing the details on estimated biomass')