diff --git a/README.md b/README.md index 9dc141b..eb2ddbe 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # KatharoSeq -An implementation of the KatharoSeq protocol, originally defined in [Minich et al 2018 mSystems](https://journals.asm.org/doi/10.1128/mSystems.00218-17). +![](https://github.com/qiime2/q2templates/workflows/ci/badge.svg) [![DOI](https://zenodo.org/badge/459343122.svg)](https://zenodo.org/badge/latestdoi/459343122) + + +An implementation of the KatharoSeq protocol, originally defined in [Minich et al 2018 mSystems](https://journals.asm.org/doi/10.1128/mSystems.00218-17) and in [Minich et al 2022](https://www.biorxiv.org/content/10.1101/2022.03.07.483203v1) ## Installation @@ -16,6 +19,8 @@ pip install -e . Computation assumes that the user has classified their 16S features against SILVA, and that the `FeatureTable[Frequency]` has been collapsed to the genus level. Please see the [`q2-feature-classifier`](https://docs.qiime2.org/2022.2/plugins/available/feature-classifier/classify-sklearn/) for detail on how to perform taxonomy classification, and the [`q2-taxa`](https://docs.qiime2.org/2022.2/plugins/available/taxa/collapse/) plugin for information on collapsing to a taxonomic level. If you need more information on how to process your data, please refer to one of the relevant tutorials that can be found [here](https://docs.qiime2.org/2022.2/tutorials/). For these examples, data from the Fish Microbiome Project (FMP): [Fish microbiomes 101: disentangling the rules governing marine fish mucosal microbiomes across 101 species](https://www.biorxiv.org/content/10.1101/2022.03.07.483203v1) paper will be used, and can be found in the `example` folder. +For a less stringent filter, an appropriate value for `--p-threshold` would be around 50. For a more strict filter, use a value like 90, as shown below in the example. + ## Read Count Threshold @@ -62,11 +67,11 @@ Finally in order to visualize the results from `estimating-biomass`, run `biomas ``` qiime katharoseq biomass-plot \ --i-table example/fmp_collapsed_table.qza \ - --m-control-cell-extraction-file example/fmp_metadata_mod.tsv \ + --m-control-cell-extraction-file example/fmp_metadata.tsv \ --m-control-cell-extraction-column control_cell_into_extraction \ --p-min-total-reads 1315 \ --p-positive-control-value control \ - --m-positive-control-column-file example/fmp_metadata_mod.tsv \ + --m-positive-control-column-file example/fmp_metadata.tsv \ --m-positive-control-column-column control_rct \ --o-visualization biomass_plot_fmp ``` diff --git a/q2_katharoseq/_methods.py b/q2_katharoseq/_methods.py index 5b45155..c641c11 100644 --- a/q2_katharoseq/_methods.py +++ b/q2_katharoseq/_methods.py @@ -14,7 +14,7 @@ 'd__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;' 'f__Clostridiaceae;g__Clostridium', 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;' - 'o__Enterobacterales;f__Enterobacteriaceae;__', + 'o__Enterobacterales;f__Enterobacteriaceae;g__', 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;' 'o__Enterobacterales;f__Enterobacteriaceae;' 'g__Escherichia-Shigella', @@ -66,6 +66,36 @@ def get_threshold(r1, r2, thresh): return min_freq +def fit_lm(table, + min_total_reads, + positive_control_column, + positive_control_value, + control_cell_extraction): + + total_reads = table.sum(axis=1) + filtered = pd.DataFrame(total_reads[total_reads > min_total_reads]) + filtered = filtered.rename(columns={0: 'total_reads'}) + filtered['log_total_reads'] = filtered.total_reads.apply(math.log10) + + positive_control_column = positive_control_column.to_series().loc[ + filtered.index] + positive_controls = positive_control_column[ + positive_control_column == positive_control_value] + positive_controls = filtered.loc[positive_controls.index] + + positive_controls['control_cell_extraction'] = \ + control_cell_extraction.to_series().loc[positive_controls.index] + positive_controls['log_control_cell_extraction'] = \ + positive_controls.control_cell_extraction.apply(math.log10) + + lm = LinearRegression() + lm.fit( + positive_controls.log_total_reads.values.reshape(-1, 1), + positive_controls.log_control_cell_extraction) + + return lm, filtered, positive_controls + + def read_count_threshold( output_dir: str, threshold: int, @@ -169,26 +199,11 @@ def estimating_biomass( dna_extract_vol: int, extraction_mass_g: qiime2.NumericMetadataColumn) -> pd.DataFrame: - total_reads = table.sum(axis=1) - filtered = pd.DataFrame(total_reads[total_reads > min_total_reads]) - filtered = filtered.rename(columns={0: 'total_reads'}) - filtered['log_total_reads'] = filtered.total_reads.apply(math.log10) - - positive_control_column = positive_control_column.to_series().loc[ - filtered.index] - positive_controls = positive_control_column[ - positive_control_column == positive_control_value] - positive_controls = filtered.loc[positive_controls.index] - - positive_controls['control_cell_extraction'] = \ - control_cell_extraction.to_series().loc[positive_controls.index] - positive_controls['log_control_cell_extraction'] = \ - positive_controls.control_cell_extraction.apply(math.log10) - - lm = LinearRegression() - lm.fit( - positive_controls.log_total_reads.values.reshape(-1, 1), - positive_controls.log_control_cell_extraction) + lm, filtered, positive_controls = fit_lm(table, + min_total_reads, + positive_control_column, + positive_control_value, + control_cell_extraction) filtered['estimated_biomass_per_pcrrxn'] = \ 10**((filtered.log_total_reads*lm.coef_[0])+lm.intercept_) @@ -216,26 +231,11 @@ def biomass_plot( positive_control_value: str, positive_control_column: qiime2.CategoricalMetadataColumn) -> None: - total_reads = table.sum(axis=1) - filtered = pd.DataFrame(total_reads[total_reads > min_total_reads]) - filtered = filtered.rename(columns={0: 'total_reads'}) - filtered['log_total_reads'] = filtered.total_reads.apply(math.log10) - - positive_control_column = positive_control_column.to_series().loc[ - filtered.index] - positive_controls = positive_control_column[ - positive_control_column == positive_control_value] - positive_controls = filtered.loc[positive_controls.index] - - positive_controls['control_cell_extraction'] = \ - control_cell_extraction.to_series().loc[positive_controls.index] - positive_controls['log_control_cell_extraction'] = \ - positive_controls.control_cell_extraction.apply(math.log10) - - lm = LinearRegression() - lm.fit( - positive_controls.log_total_reads.values.reshape(-1, 1), - positive_controls.log_control_cell_extraction) + lm, filtered, positive_controls = fit_lm(table, + min_total_reads, + positive_control_column, + positive_control_value, + control_cell_extraction) # MAKE PLOT y = positive_controls['log_control_cell_extraction'] diff --git a/q2_katharoseq/estimating_biomass_assets/index.html b/q2_katharoseq/estimating_biomass_assets/index.html index c717117..7dea9e1 100644 --- a/q2_katharoseq/estimating_biomass_assets/index.html +++ b/q2_katharoseq/estimating_biomass_assets/index.html @@ -10,7 +10,7 @@