Skip to content

Commit

Permalink
Update Final_Output.singleReference.R
Browse files Browse the repository at this point in the history
Removed bug where ORF output did not report non-IS depth
  • Loading branch information
joshuakirsch authored Jun 6, 2024
1 parent c92f470 commit dd02323
Showing 1 changed file with 104 additions and 38 deletions.
142 changes: 104 additions & 38 deletions Final_Output.singleReference.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ options(dplyr.summarise.inform = FALSE)

args = commandArgs(trailingOnly=TRUE)

sampleList = read_tsv(args[1], col_names = "Sample",
show_col_types = FALSE)
if (args[4] == "contig"){

sampleList = read_tsv(args[1], col_names = "Sample",
show_col_types = FALSE)
analysis = read_tsv(args[2],
show_col_types = FALSE) %>%
rowwise() %>%
Expand All @@ -21,7 +23,6 @@ sampleList = read_tsv(args[1], col_names = "Sample",
filter (low_end >= 0.1*(high_end)) %>%
distinct()


nonIS_mapping = data.frame()
for (i in seq(nrow(sampleList))){
temp = read_tsv (paste("mapping_files/", sampleList$Sample[i], ".contig.reads_mapped.depth", sep=""),
Expand All @@ -38,45 +39,43 @@ sampleList = read_tsv(args[1], col_names = "Sample",
"sample"= "sample")) %>%
mutate (depth_percentage = (100*(sum(`5prime`+`3prime`)/(sum(`5prime`+`3prime`)+non_IS_depth))))

ins_itr = analysis_annot %>% select (contig, insertion_pos) %>% distinct () %>%
ungroup() %>%
arrange (contig, insertion_pos) %>%
mutate (lag_insertion_pos = lag(insertion_pos)) %>%
mutate (adj_insertion_pos = insertion_pos) %>%
ins_itr = analysis_annot %>% select (contig, insertion_pos) %>% distinct () %>%
ungroup() %>%
arrange (contig, insertion_pos) %>%
mutate (lag_insertion_pos = lag(insertion_pos)) %>%
mutate (adj_insertion_pos = insertion_pos) %>%
mutate (lag_is_nearby = (contig == lag(contig) &
between (insertion_pos, (lag_insertion_pos-15), (lag_insertion_pos+15)) &
!is.na (lag_insertion_pos) &
lag(adj_insertion_pos) != adj_insertion_pos))

while(any(ins_itr$lag_is_nearby)) {
ins_itr = ins_itr %>% mutate (adj_insertion_pos = ifelse (lag_is_nearby,
lag(adj_insertion_pos), adj_insertion_pos)) %>%
mutate (lag_is_nearby = (contig == lag(contig) &
between (insertion_pos, (lag_insertion_pos-15), (lag_insertion_pos+15)) &
!is.na (lag_insertion_pos) &
lag(adj_insertion_pos) != adj_insertion_pos))

while(any(ins_itr$lag_is_nearby)) {
ins_itr = ins_itr %>% mutate (adj_insertion_pos = ifelse (lag_is_nearby,
lag(adj_insertion_pos), adj_insertion_pos)) %>%
mutate (lag_is_nearby = (contig == lag(contig) &
between (insertion_pos, (lag_insertion_pos-15), (lag_insertion_pos+15)) &
!is.na (lag_insertion_pos) &
lag(adj_insertion_pos) != adj_insertion_pos))
}
temp = ins_itr %>% select (contig, adj_insertion_pos) %>% distinct ()
temp$unique_ins_itr = seq(nrow(temp))
ins_itr = ins_itr %>% left_join(temp)

analysis_annot = analysis_annot %>% left_join(ins_itr, by=c("contig" = "contig", "insertion_pos"="insertion_pos")) %>%
select (sample, IS,contig, adj_insertion_pos,`5prime`, `3prime`,
total_IS_depth, "max_depth_site_insertion_pos", max_depth_site_depth, non_IS_depth, depth_percentage) %>%
rename ("Insertion_Position" = adj_insertion_pos) %>%
rename ("5prime_IS_depth" = `5prime`) %>%
rename ("3prime_IS_depth" = `3prime`)

#this block removes IS elements false hits (likely) and adds a better naming strategy for the IS elements

IS_fam_df = read_tsv(paste (args[3],"/IS_fam_annot.txt", sep="")) %>%
rowwise () %>%
mutate (IS = gsub("_", ":", IS))

analysis_annot = analysis_annot %>%
left_join(IS_fam_df)

if (args[4] == "contig"){
}
temp = ins_itr %>% select (contig, adj_insertion_pos) %>% distinct ()
temp$unique_ins_itr = seq(nrow(temp))
ins_itr = ins_itr %>% left_join(temp)

analysis_annot = analysis_annot %>% left_join(ins_itr, by=c("contig" = "contig", "insertion_pos"="insertion_pos")) %>%
select (sample, IS,contig, adj_insertion_pos,`5prime`, `3prime`,
total_IS_depth, "max_depth_site_insertion_pos", max_depth_site_depth, non_IS_depth, depth_percentage) %>%
rename ("Insertion_Position" = adj_insertion_pos) %>%
rename ("5prime_IS_depth" = `5prime`) %>%
rename ("3prime_IS_depth" = `3prime`)

#this block removes IS elements false hits (likely) and adds a better naming strategy for the IS elements

IS_fam_df = read_tsv(paste (args[3],"/IS_fam_annot.txt", sep="")) %>%
rowwise () %>%
mutate (IS = gsub("_", ":", IS))

analysis_annot = analysis_annot %>%
left_join(IS_fam_df)
inOrf = read_tsv ("final_results/IS_hits_in_orfs.txt",
col_names = c("contig","start_pos","max_depth_site_insertion_pos", "ORF"),
show_col_types = FALSE) %>%
Expand All @@ -90,6 +89,73 @@ if (args[4] == "contig"){
rename ("IS_Depth_at_Max_Depth_Site" = max_depth_site_depth)
write_tsv (analysis_annot, "final_results/pseudoR_output.contig.tsv")
} else if (args[4] == "ORF"){

sampleList = read_tsv(args[1], col_names = "Sample",
show_col_types = FALSE)
analysis = read_tsv(args[2],
show_col_types = FALSE) %>%
rowwise() %>%
mutate (total_IS_depth = sum(`5prime`+`3prime`)) %>%
filter (total_IS_depth >= 4) %>%
mutate (low_end = min (c(`5prime`,`3prime`))) %>%
mutate (high_end = max (c(`5prime`,`3prime`))) %>%
filter (low_end >= 0.1*(high_end)) %>%
distinct()


nonIS_mapping = data.frame()
for (i in seq(nrow(sampleList))){
temp = read_tsv (paste("mapping_files/", sampleList$Sample[i], ".orf.reads_mapped.depth", sep=""),
col_names = c("contig","start_pos","max_depth_site_insertion_pos","non_IS_depth"),
show_col_types = FALSE) %>%
mutate (sample = sampleList$Sample[i]) %>%
mutate (max_depth_site_insertion_pos = ifelse(start_pos==0, 0, max_depth_site_insertion_pos))
nonIS_mapping = rbind (nonIS_mapping, temp)
}
nonIS_mapping = nonIS_mapping %>% distinct()

analysis_annot = analysis %>% rowwise () %>%
left_join (nonIS_mapping, by=c("contig" = "contig", "max_depth_site_insertion_pos" = "max_depth_site_insertion_pos",
"sample"= "sample")) %>%
mutate (depth_percentage = (100*(sum(`5prime`+`3prime`)/(sum(`5prime`+`3prime`)+non_IS_depth))))

ins_itr = analysis_annot %>% select (contig, insertion_pos) %>% distinct () %>%
ungroup() %>%
arrange (contig, insertion_pos) %>%
mutate (lag_insertion_pos = lag(insertion_pos)) %>%
mutate (adj_insertion_pos = insertion_pos) %>%
mutate (lag_is_nearby = (contig == lag(contig) &
between (insertion_pos, (lag_insertion_pos-15), (lag_insertion_pos+15)) &
!is.na (lag_insertion_pos) &
lag(adj_insertion_pos) != adj_insertion_pos))

while(any(ins_itr$lag_is_nearby)) {
ins_itr = ins_itr %>% mutate (adj_insertion_pos = ifelse (lag_is_nearby,
lag(adj_insertion_pos), adj_insertion_pos)) %>%
mutate (lag_is_nearby = (contig == lag(contig) &
between (insertion_pos, (lag_insertion_pos-15), (lag_insertion_pos+15)) &
!is.na (lag_insertion_pos) &
lag(adj_insertion_pos) != adj_insertion_pos))
}
temp = ins_itr %>% select (contig, adj_insertion_pos) %>% distinct ()
temp$unique_ins_itr = seq(nrow(temp))
ins_itr = ins_itr %>% left_join(temp)

analysis_annot = analysis_annot %>% left_join(ins_itr, by=c("contig" = "contig", "insertion_pos"="insertion_pos")) %>%
select (sample, IS,contig, adj_insertion_pos,`5prime`, `3prime`,
total_IS_depth, "max_depth_site_insertion_pos", max_depth_site_depth, non_IS_depth, depth_percentage) %>%
rename ("Insertion_Position" = adj_insertion_pos) %>%
rename ("5prime_IS_depth" = `5prime`) %>%
rename ("3prime_IS_depth" = `3prime`)

#this block removes IS elements false hits (likely) and adds a better naming strategy for the IS elements

IS_fam_df = read_tsv(paste (args[3],"/IS_fam_annot.txt", sep="")) %>%
rowwise () %>%
mutate (IS = gsub("_", ":", IS))

analysis_annot = analysis_annot %>%
left_join(IS_fam_df)
analysis_annot = analysis_annot %>%
mutate ("ORF"=contig) %>%
rename ("Max_Depth_Site_Position" = max_depth_site_insertion_pos) %>%
Expand Down

0 comments on commit dd02323

Please sign in to comment.