Skip to content

Commit

Permalink
fixed pandas future warnings, added vignettes
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandkuster committed Jan 8, 2024
1 parent d21b188 commit 97ca245
Show file tree
Hide file tree
Showing 12 changed files with 1,290 additions and 36 deletions.
55 changes: 22 additions & 33 deletions readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,35 +304,30 @@ def process_genomes(args, genomes_df):
genomes to be processed
'''
digest_ls, prob_ls = [], []
total_freqs = pd.DataFrame(columns=['length',
'sum_prob',
'name',
'counts_file'])

sys.stdout.write("%s" % ("□" * genomes_df.shape[0]))
sys.stdout.flush()
sys.stdout.write("\b" * (genomes_df.shape[0]+1)) # return to start of line, after '['
total_freqs = pd.DataFrame({'length': pd.Series(dtype='int64'),
'sum_prob': pd.Series(dtype='float64'),
'name': pd.Series(dtype='object'),
'counts_file': pd.Series(dtype='object')})

for idx in range(genomes_df.shape[0]):
args.g = genomes_df.iloc[idx]['genome']
print(args.g)
args.comp = genomes_df.iloc[idx]['abundance']
digest_file = os.path.join(args.o, 'raw_digest_' +
digest_file = os.path.join(args.o, 'raw_digests', 'raw_digest_' +
os.path.basename(args.g) + '.csv')

df = digest_genomes.main(args)

if df.shape[0] == 0:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

digest_file = process_df(df, digest_file, args)

if digest_file is None:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

prob_file, len_freqs = prob_n_copies.main(digest_file, args)
Expand All @@ -342,12 +337,8 @@ def process_genomes(args, genomes_df):
tmp_df = pd.DataFrame(len_freqs.items(), columns=['length', 'sum_prob'])
tmp_df['name'] = os.path.basename(args.g)
tmp_df['counts_file'] = prob_file
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

sys.stdout.write('■')
sys.stdout.flush()

sys.stdout.write("\n")
if tmp_df.empty is False:
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

total_freqs = total_freqs.reset_index(drop=True)
genomes_df['digest_file'] = digest_ls
Expand All @@ -366,24 +357,23 @@ def process_genomes_iso(args, genomes_df):
genomes to be processed
'''
digest_ls, prob_ls = [], []
total_freqs = pd.DataFrame(columns=['length', 'sum_prob', 'name', 'counts_file'])

sys.stdout.write("%s" % ("□" * genomes_df.shape[0]))
sys.stdout.flush()
sys.stdout.write("\b" * (genomes_df.shape[0]+1)) # return to start of line, after '['
total_freqs = pd.DataFrame({'length': pd.Series(dtype='int64'),
'sum_prob': pd.Series(dtype='float64'),
'name': pd.Series(dtype='object'),
'counts_file': pd.Series(dtype='object')})

for idx in range(genomes_df.shape[0]):
args.g = genomes_df.iloc[idx]['genome']
print(args.g)
args.comp = genomes_df.iloc[idx]['abundance']
digest_file = os.path.join(args.o, 'raw_digest_' +
digest_file = os.path.join(args.o, 'raw_digests', 'raw_digest_' +
os.path.basename(args.g) + '.csv')

df = digest_genomes_iso.main(args)

if df.shape[0] == 0:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

digest_file = process_df_iso(df, digest_file, args)
Expand All @@ -393,12 +383,8 @@ def process_genomes_iso(args, genomes_df):
tmp_df = pd.DataFrame(len_freqs.items(), columns=['length', 'sum_prob'])
tmp_df['name'] = os.path.basename(args.g)
tmp_df['counts_file'] = prob_file
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

sys.stdout.write('■')
sys.stdout.flush()

sys.stdout.write("\n")
if tmp_df.empty is False:
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

total_freqs = total_freqs.reset_index(drop=True)
genomes_df['digest_file'] = digest_ls
Expand Down Expand Up @@ -558,7 +544,7 @@ def save_individual_hist(prob_file, args):
binwidth=6,
alpha=0.75,
color='blue')
plt.savefig(os.path.join(args.o, 'hist_' +
plt.savefig(os.path.join(args.o, 'individual_histograms', 'hist_' +
os.path.basename(prob_file)[:-4] + '.png'),
facecolor='white', transparent=False)
plt.close()
Expand All @@ -578,11 +564,11 @@ def save_combined_hist(total_freqs, image_name, weights, args):
return

old_legend = ax.legend_
handles = old_legend.legendHandles
handles = old_legend.legend_handles
labels = [t.get_text() for t in old_legend.get_texts()]
ax.legend(handles, labels, bbox_to_anchor=(1.02, 1), loc='upper left',
borderaxespad=0)
plt.savefig(os.path.join(args.o, f'_{image_name}.pdf'),
plt.savefig(os.path.join(args.o, f'{image_name}.pdf'),
bbox_inches='tight')
plt.close()

Expand Down Expand Up @@ -665,6 +651,9 @@ def simulate_error(command, sim_in, error_out):
args.o = os.path.dirname(os.path.abspath(__file__))
elif os.path.exists(args.o) is True:
args.o = os.path.abspath(args.o)
os.mkdir(os.path.join(args.o, "raw_digests"))
os.mkdir(os.path.join(args.o, "individual_histograms"))
os.mkdir(os.path.join(args.o, "individual_counts"))
else:
sys.exit('directory not found at ' + os.path.abspath(args.o))

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions scripts/prob_n_copies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
import os
import pandas as pd


def main(digest_file, args):

df = pd.read_csv(digest_file)
internal_max = df['internal'].max()
copies_dt = copies_dict(internal_max, args)

df = apply_approach(df, copies_dt)
prob_file = os.path.join(args.o, 'counts_' +
prob_file = os.path.join(args.o, 'individual_counts', 'counts_' +
os.path.basename(args.g) + '.csv')
df.drop(df[df['probability'] == 0].index, inplace=True)
df['probability'] = df['probability'].astype(float)

if df.duplicated(subset=['start','end']).any():
df = bidirectional_weights(df)
Expand Down
2 changes: 1 addition & 1 deletion scripts/prob_n_copies_iso.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def main(digest_file, args):
df = find_overlaps(df)
df = calculate_prob(df)

prob_file = os.path.join(args.o, 'counts_' +
prob_file = os.path.join(args.o, 'individual_counts', 'counts_' +
os.path.basename(args.g) + '.csv')
df['adj_prob'] = df['probability'] * args.comp
df = df.reset_index(drop=True)
Expand Down
17 changes: 17 additions & 0 deletions vignettes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Vignettes to test readsynth

## Step 1: download sample metagenomes

From within one of the vignette subdirectories, run the following shell script to download genomes into the "genomes" subdirectory:

```
bash download_genomes.sh
```

## Step 2: run readsynth

Review the "run_readsynth.sh" file to see the settings used, then run it as a script:

```
bash run_readsynth.sh
```
Loading

0 comments on commit 97ca245

Please sign in to comment.