Skip to content

Commit

Permalink
Added new test utilities, increased default BED test file length.
Browse files Browse the repository at this point in the history
 - New `copy_tempfile_for_inspection()` function for inspecting
   tempfiles that fail tests.
 - Added `scripts/seed_grinder.sh` which tries random seeds looking
   for failures.
  • Loading branch information
vsbuffalo committed Mar 5, 2024
1 parent 3b6cd34 commit 5567cc6
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 38 deletions.
16 changes: 16 additions & 0 deletions scripts/seed_grinder.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash

START_SEED=1
END_SEED=10000

for SEED in $(seq $START_SEED $END_SEED); do
echo "Running tests with TEST_SEED=$SEED"
TEST_SEED=$SEED cargo test --test bedtools_validation
if [ $? -ne 0 ]; then
echo "Tests failed with TEST_SEED=$SEED"
exit 1
else
echo "Tests passed with TEST_SEED=$SEED"
fi
done

16 changes: 8 additions & 8 deletions src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -751,8 +751,8 @@ pub struct HistFeatures {
#[arg(short, long)]
chop: bool,

/// Assign each basepair exclusively to a single "feature set"
/// based on what features overlap it, and count the number of
/// Assign each basepair exclusively to a single "feature set"
/// based on what features overlap it, and count the number of
/// these per window. E.g. a region that overlaps "CDS" and "exon"
/// will be called a "CDS,exon" feature. Columns are sorted
/// in descending order by column sums. Headers will always be
Expand Down Expand Up @@ -924,8 +924,10 @@ impl HistFeatures {
.collect();

// Re-order the headers too by the sorted indices.
let feature_sets: Vec<_> = sorted_indices.iter()
.map(|i| feature_sets[*i].clone()).collect();
let feature_sets: Vec<_> = sorted_indices
.iter()
.map(|i| feature_sets[*i].clone())
.collect();

let mut headers = vec!["chrom".to_string(), "start".to_string(), "end".to_string()];
headers.extend(feature_sets);
Expand All @@ -940,8 +942,7 @@ impl HistFeatures {
}
}


// get column totals
// get column totals
fn column_totals(matrix: &Vec<Vec<Position>>) -> Vec<Position> {
if matrix.is_empty() || matrix[0].is_empty() {
return Vec::new();
Expand All @@ -959,12 +960,11 @@ fn column_totals(matrix: &Vec<Vec<Position>>) -> Vec<Position> {
}

totals
}
}

// get descending indices order
fn sorted_indices_by_values(values: &[Position]) -> Vec<usize> {
let mut indices: Vec<usize> = (0..values.len()).collect();
indices.sort_by_key(|&i| std::cmp::Reverse(values[i]));
indices
}

81 changes: 54 additions & 27 deletions src/test_utilities.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
//! Test cases and test utility functions.
//!
use std::{io::BufRead, path::PathBuf};
use std::{
env,
fs::{self, copy},
io::BufRead,
path::PathBuf,
};
use tempfile::{Builder, NamedTempFile};

use crate::{
commands::granges_random_bed,
Expand All @@ -22,12 +28,16 @@ use genomap::GenomeMap;
use indexmap::IndexMap;
#[cfg(feature = "ndarray")]
use ndarray::{Array1, Array2};
use rand::{rngs::StdRng, distributions::Uniform, seq::SliceRandom, Rng, SeedableRng};

// Set the seed for reproducible tests.
const SEED: u64 = 4;

use tempfile::{Builder, NamedTempFile};
use rand::{distributions::Uniform, rngs::StdRng, seq::SliceRandom, Rng, SeedableRng};

// Use a default random seed, unless one is in the environment.
fn get_rng() -> StdRng {
let seed: u64 = env::var("TEST_SEED")
.unwrap_or_else(|_| "13".to_string())
.parse()
.expect("TEST_SEED must be a valid u64");
StdRng::seed_from_u64(seed)
}

/// Get the path to the `grange` command line tool after a build.
/// This is used for integration tests and benchmarks.
Expand Down Expand Up @@ -62,31 +72,29 @@ pub const MAX_CHROM_LEN: Position = 250_000_000;

/// Build a random range start/end on a sequence of `max_len`.
/// 0-indexed, right exclusive
pub fn random_range(chrom_len: Position) -> (Position, Position) {
let mut rng = StdRng::seed_from_u64(SEED);
pub fn random_range(chrom_len: Position, rng: &mut StdRng) -> (Position, Position) {
let len = rng.gen_range(MIN_LEN..MAX_LEN);
let start = rng.gen_range(0..chrom_len - len + 1);
(start, start + len)
}

/// Build random sequence lengths
pub fn random_seqlen() -> Position {
let mut rng = StdRng::seed_from_u64(SEED);
pub fn random_seqlen(rng: &mut StdRng) -> Position {
rng.gen_range(MIN_CHROM_LEN..=MAX_CHROM_LEN)
}

/// Sample a random chromosome
pub fn random_chrom() -> String {
let mut rng = StdRng::seed_from_u64(SEED);
pub fn random_chrom(rng: &mut StdRng) -> String {
format!("chr{}", rng.gen_range(1..NCHROM + 1))
}

/// Build random [`VecRanges`]
pub fn random_vecranges(n: usize) -> VecRanges<RangeEmpty> {
let seqlen = random_seqlen();
let mut rng = get_rng();
let seqlen = random_seqlen(&mut rng);
let mut vr = VecRanges::new(seqlen);
for _i in 0..n {
let (start, end) = random_range(seqlen);
let (start, end) = random_range(seqlen, &mut rng);
let range = RangeEmpty::new(start, end);
vr.push_range(range);
}
Expand All @@ -99,7 +107,7 @@ pub fn random_granges(
seqlens: &IndexMap<String, Position>,
num: usize,
) -> Result<GRangesEmpty<VecRangesEmpty>, GRangesError> {
let mut rng = StdRng::seed_from_u64(SEED);
let mut rng = get_rng();

let mut gr = GRangesEmpty::new_vec(seqlens);

Expand All @@ -109,24 +117,22 @@ pub fn random_granges(
let chrom_len = *seqlens
.get(seqname)
.ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?;
let (start, end) = random_range(chrom_len);
let (start, end) = random_range(chrom_len, &mut rng);
gr.push_range(seqname, start, end)?;
}
Ok(gr)
}

/// Generate random strings, e.g. for mock feature names.
fn generate_random_string(n: usize) -> String {
let mut rng = StdRng::seed_from_u64(SEED);
fn generate_random_string(n: usize, rng: &mut StdRng) -> String {
let letters: Vec<char> = ('a'..='z').collect();
let letters_dist = Uniform::from(0..letters.len());

(0..n).map(|_| letters[rng.sample(letters_dist)]).collect()
}

/// Generate a random float value, e.g. for a mock BED "score".
fn generate_random_uniform(start: f64, end: f64) -> f64 {
let mut rng = StdRng::seed_from_u64(SEED);
fn generate_random_uniform(start: f64, end: f64, rng: &mut StdRng) -> f64 {
let uniform = Uniform::new(start, end); // Specify the range
rng.sample(uniform)
}
Expand All @@ -137,7 +143,7 @@ pub fn random_granges_mock_bed5(
seqlens: &IndexMap<String, Position>,
num: usize,
) -> Result<GRanges<VecRangesIndexed, Vec<Bed5Addition>>, GRangesError> {
let mut rng = StdRng::seed_from_u64(SEED);
let mut rng = get_rng();

let mut gr = GRanges::new_vec(seqlens);

Expand All @@ -147,10 +153,10 @@ pub fn random_granges_mock_bed5(
let chrom_len = *seqlens
.get(seqname)
.ok_or_else(|| GRangesError::MissingSequence(seqname.clone()))?;
let (start, end) = random_range(chrom_len);
let (start, end) = random_range(chrom_len, &mut rng);
let bed5_cols = Bed5Addition {
name: generate_random_string(8),
score: Some(generate_random_uniform(0.0, 1.0)),
name: generate_random_string(8, &mut rng),
score: Some(generate_random_uniform(0.0, 1.0, &mut rng)),
};
gr.push_range(seqname, start, end, bed5_cols)?;
}
Expand All @@ -166,10 +172,31 @@ pub fn random_coitrees() -> COITrees<()> {

/// Get a temporary file with a .bed suffix.
pub fn temp_bedfile() -> NamedTempFile {
Builder::new()
let tempfile = Builder::new()
.suffix(".bed")
.tempfile()
.expect("Failed to create temp file")
.expect("Failed to create temp file");
tempfile
}

/// Copy a temp file for inspection, allowing the specification of a custom temporary directory name.
// AICODE -- co-written with AI
pub fn copy_tempfile_for_inspection(tempfile: impl Into<PathBuf>, new_file_name: &str) -> PathBuf {
let tempfile = tempfile.into();
// Check if the TEST_LOCAL environment variable is set
if env::var("TEST_LOCAL").is_ok() {
let dir_path = PathBuf::from("test_temp");
fs::create_dir_all(&dir_path).expect("could not create test_temp/");

let dest_path = dir_path.join(new_file_name);

copy(tempfile, &dest_path).expect("could not copy temp file");

dest_path
} else {
// If TEST_LOCAL is not set, simply return the path of the original temp file
tempfile
}
}

/// Create a random BED3 file based on the hg38 sequence lengths, and write to disk.
Expand Down
18 changes: 15 additions & 3 deletions tests/bedtools_validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ fn validate_bedfloats(
// adjustable test sizes -- useful also for making very small
// so string diffs are easier to compare.
#[cfg(not(feature = "bench-big"))]
const BED_LENGTH: usize = 10;
const BED_LENGTH: usize = 100_000;
#[cfg(feature = "bench-big")]
const BED_LENGTH: usize = 1_000_000;

Expand Down Expand Up @@ -322,6 +322,10 @@ fn test_against_bedtools_makewindows() {
let widths = vec![131123, 1_000_0013];
let steps = vec![10_001, 10_113];

// smaller test set:
// let widths = vec![1_000_0013];
// let steps = vec![10000];

for width in widths.iter() {
for step in steps.iter() {
let bedtools_output = Command::new("bedtools")
Expand Down Expand Up @@ -441,8 +445,8 @@ fn test_against_bedtools_map_multiple() {
// try multiple operations at once
let num_ranges = BED_LENGTH;
let width = 1_000_000;
#[allow(unused_variables)]
let step = 10_000; // can uncomment lines below to test this
// #[allow(unused_variables)]
// let step = 10_000; // can uncomment lines below to test this

// make windows
let windows_file = temp_bedfile();
Expand All @@ -464,8 +468,11 @@ fn test_against_bedtools_map_multiple() {
granges_windows_output
);

// copy_tempfile_for_inspection(&windows_file.path(), "windows.bed");

// create the random data BED5
let bedscores_file = random_bed5file(num_ranges);
// copy_tempfile_for_inspection(&bedscores_file.path(), "scores.bed");

let bedtools_path = temp_bedfile();
let bedtools_output_file = File::create(&bedtools_path).unwrap();
Expand All @@ -485,6 +492,8 @@ fn test_against_bedtools_map_multiple() {
.output()
.expect("bedtools map failed");

// copy_tempfile_for_inspection(&bedtools_path.path(), "bedtools.bed");

let granges_output_file = temp_bedfile();
let granges_output = Command::new(granges_binary_path())
.arg("map")
Expand All @@ -501,6 +510,8 @@ fn test_against_bedtools_map_multiple() {
.output()
.expect("granges map failed");

// copy_tempfile_for_inspection(&granges_output_file.path(), "granges.bed");

assert!(bedtools_output.status.success(), "{:?}", bedtools_output);
assert!(granges_output.status.success(), "{:?}", granges_output);

Expand Down Expand Up @@ -539,6 +550,7 @@ fn test_against_bedtools_map_multiple() {
.iter()
.zip(bedtools_data.iter())
.for_each(|(gr, bd)| {
// dbg!((gr.min, bd.min));
assert_option_float_tol!(gr.min, bd.min);
assert_option_float_tol!(gr.max, bd.max);

Expand Down

0 comments on commit 5567cc6

Please sign in to comment.