Skip to content

Commit

Permalink
fixed pandas future warnings, added vignettes
Browse files Browse the repository at this point in the history
  • Loading branch information
ryandkuster committed Jan 8, 2024
1 parent d21b188 commit 97ca245
Show file tree
Hide file tree
Showing 12 changed files with 1,290 additions and 36 deletions.
55 changes: 22 additions & 33 deletions readsynth.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,35 +304,30 @@ def process_genomes(args, genomes_df):
genomes to be processed
'''
digest_ls, prob_ls = [], []
total_freqs = pd.DataFrame(columns=['length',
'sum_prob',
'name',
'counts_file'])

sys.stdout.write("%s" % ("□" * genomes_df.shape[0]))
sys.stdout.flush()
sys.stdout.write("\b" * (genomes_df.shape[0]+1)) # return to start of line, after '['
total_freqs = pd.DataFrame({'length': pd.Series(dtype='int64'),
'sum_prob': pd.Series(dtype='float64'),
'name': pd.Series(dtype='object'),
'counts_file': pd.Series(dtype='object')})

for idx in range(genomes_df.shape[0]):
args.g = genomes_df.iloc[idx]['genome']
print(args.g)
args.comp = genomes_df.iloc[idx]['abundance']
digest_file = os.path.join(args.o, 'raw_digest_' +
digest_file = os.path.join(args.o, 'raw_digests', 'raw_digest_' +
os.path.basename(args.g) + '.csv')

df = digest_genomes.main(args)

if df.shape[0] == 0:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

digest_file = process_df(df, digest_file, args)

if digest_file is None:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

prob_file, len_freqs = prob_n_copies.main(digest_file, args)
Expand All @@ -342,12 +337,8 @@ def process_genomes(args, genomes_df):
tmp_df = pd.DataFrame(len_freqs.items(), columns=['length', 'sum_prob'])
tmp_df['name'] = os.path.basename(args.g)
tmp_df['counts_file'] = prob_file
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

sys.stdout.write('■')
sys.stdout.flush()

sys.stdout.write("\n")
if tmp_df.empty is False:
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

total_freqs = total_freqs.reset_index(drop=True)
genomes_df['digest_file'] = digest_ls
Expand All @@ -366,24 +357,23 @@ def process_genomes_iso(args, genomes_df):
genomes to be processed
'''
digest_ls, prob_ls = [], []
total_freqs = pd.DataFrame(columns=['length', 'sum_prob', 'name', 'counts_file'])

sys.stdout.write("%s" % ("□" * genomes_df.shape[0]))
sys.stdout.flush()
sys.stdout.write("\b" * (genomes_df.shape[0]+1)) # return to start of line, after '['
total_freqs = pd.DataFrame({'length': pd.Series(dtype='int64'),
'sum_prob': pd.Series(dtype='float64'),
'name': pd.Series(dtype='object'),
'counts_file': pd.Series(dtype='object')})

for idx in range(genomes_df.shape[0]):
args.g = genomes_df.iloc[idx]['genome']
print(args.g)
args.comp = genomes_df.iloc[idx]['abundance']
digest_file = os.path.join(args.o, 'raw_digest_' +
digest_file = os.path.join(args.o, 'raw_digests', 'raw_digest_' +
os.path.basename(args.g) + '.csv')

df = digest_genomes_iso.main(args)

if df.shape[0] == 0:
digest_ls.append(None)
prob_ls.append(None)
sys.stdout.write('□')
continue

digest_file = process_df_iso(df, digest_file, args)
Expand All @@ -393,12 +383,8 @@ def process_genomes_iso(args, genomes_df):
tmp_df = pd.DataFrame(len_freqs.items(), columns=['length', 'sum_prob'])
tmp_df['name'] = os.path.basename(args.g)
tmp_df['counts_file'] = prob_file
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

sys.stdout.write('■')
sys.stdout.flush()

sys.stdout.write("\n")
if tmp_df.empty is False:
total_freqs = pd.concat([total_freqs, tmp_df], axis=0)

total_freqs = total_freqs.reset_index(drop=True)
genomes_df['digest_file'] = digest_ls
Expand Down Expand Up @@ -558,7 +544,7 @@ def save_individual_hist(prob_file, args):
binwidth=6,
alpha=0.75,
color='blue')
plt.savefig(os.path.join(args.o, 'hist_' +
plt.savefig(os.path.join(args.o, 'individual_histograms', 'hist_' +
os.path.basename(prob_file)[:-4] + '.png'),
facecolor='white', transparent=False)
plt.close()
Expand All @@ -578,11 +564,11 @@ def save_combined_hist(total_freqs, image_name, weights, args):
return

old_legend = ax.legend_
handles = old_legend.legendHandles
handles = old_legend.legend_handles
labels = [t.get_text() for t in old_legend.get_texts()]
ax.legend(handles, labels, bbox_to_anchor=(1.02, 1), loc='upper left',
borderaxespad=0)
plt.savefig(os.path.join(args.o, f'_{image_name}.pdf'),
plt.savefig(os.path.join(args.o, f'{image_name}.pdf'),
bbox_inches='tight')
plt.close()

Expand Down Expand Up @@ -665,6 +651,9 @@ def simulate_error(command, sim_in, error_out):
args.o = os.path.dirname(os.path.abspath(__file__))
elif os.path.exists(args.o) is True:
args.o = os.path.abspath(args.o)
os.mkdir(os.path.join(args.o, "raw_digests"))
os.mkdir(os.path.join(args.o, "individual_histograms"))
os.mkdir(os.path.join(args.o, "individual_counts"))
else:
sys.exit('directory not found at ' + os.path.abspath(args.o))

Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions scripts/prob_n_copies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,17 @@
import os
import pandas as pd


def main(digest_file, args):

df = pd.read_csv(digest_file)
internal_max = df['internal'].max()
copies_dt = copies_dict(internal_max, args)

df = apply_approach(df, copies_dt)
prob_file = os.path.join(args.o, 'counts_' +
prob_file = os.path.join(args.o, 'individual_counts', 'counts_' +
os.path.basename(args.g) + '.csv')
df.drop(df[df['probability'] == 0].index, inplace=True)
df['probability'] = df['probability'].astype(float)

if df.duplicated(subset=['start','end']).any():
df = bidirectional_weights(df)
Expand Down
2 changes: 1 addition & 1 deletion scripts/prob_n_copies_iso.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def main(digest_file, args):
df = find_overlaps(df)
df = calculate_prob(df)

prob_file = os.path.join(args.o, 'counts_' +
prob_file = os.path.join(args.o, 'individual_counts', 'counts_' +
os.path.basename(args.g) + '.csv')
df['adj_prob'] = df['probability'] * args.comp
df = df.reset_index(drop=True)
Expand Down
17 changes: 17 additions & 0 deletions vignettes/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Vignettes to test readsynth

## Step 1: download sample metagenomes

From within one of the vignette subdirectories, run the following shell script to download genomes into the "genomes" subdirectory:

```
bash download_genomes.sh
```

## Step 2: run readsynth

Review the "run_readsynth.sh" file to see the settings used, then run it as a script:

```
bash run_readsynth.sh
```
Loading

0 comments on commit 97ca245

Please sign in to comment.