Skip to content

Commit

Permalink
add deduplication
Browse files Browse the repository at this point in the history
  • Loading branch information
JappyPing committed Jun 30, 2024
1 parent 13e6c11 commit b1da6a1
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/noise2read/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
# @Last Modified by: Pengyao Ping
# @Last Modified time: 2023-09-07 14:03:54

__version__ = "0.3.0"
__version__ = "0.4.0"
5 changes: 5 additions & 0 deletions src/noise2read/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,10 @@ def __init__(self, config_file, logger):
else:
self.error_rate2 = 0.005 # default

# Toggle for writing a deduplicated FASTA once error correction finishes.
# Store it on `self.deduplication` (the attribute main() reads), not on
# `self.umi_in_read`, which is an unrelated setting.
if conf.has_option("Deduplication", "deduplication"):
    self.deduplication = conf.getboolean("Deduplication", "deduplication")
else:
    self.deduplication = True  # default

# # Evaluation
# if conf.has_option("Evaluation", "delta"):
Expand Down Expand Up @@ -430,6 +434,7 @@ def __init__(self, config_file, logger):
self.error_rate1 = 0.09
self.error_rate2 = 0.02

self.deduplication = True
# # Evaluation
# self.delta = 1

Expand Down
Empty file added src/noise2read/deduplication.py
Empty file.
19 changes: 19 additions & 0 deletions src/noise2read/error_orrection.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,25 @@ def correct_amplicon_err(self, original_file, genuine_df, negative_df, new_negat
else:
return original_file

def get_deduplication(self, corrected_data):
    """Collapse identical reads and write them to a FASTA file.

    Reads every record from ``corrected_data``, groups the record ids by
    their exact sequence string, and writes one FASTA record per unique
    sequence to ``<result_dir><base[0]>_deduplicated.fasta``.  Each output
    header carries a running read number, the number of collapsed reads,
    and the space-separated ids of the original reads.

    Args:
        corrected_data: input accepted by ``parse_data`` (the corrected
            reads produced by the preceding error-correction step).

    Returns:
        The path of the deduplicated FASTA file that was written.
    """
    # parse_data also returns the detected file type; it is not needed here.
    record_iterator, _ = parse_data(corrected_data)

    # Map each unique sequence to the ids of all reads carrying it.  Dict
    # insertion order keeps sequences in order of first appearance.
    seqs2id_dict = {}
    for item in record_iterator:
        seqs2id_dict.setdefault(str(item.seq), []).append(str(item.id))

    deduplicated_file = self.config.result_dir + self.base[0] + "_deduplicated.fasta"

    # Stream records to the writer instead of building a second full list.
    # NOTE(review): the record id contains spaces, so parsers that split the
    # FASTA header at the first whitespace will likely report every id as
    # "read" -- confirm whether downstream tools need unique ids.
    seq_records = (
        SeqRecord(
            Seq(read),
            id=f"read {record_num}, counts: {len(ids)}",
            description=" ".join(ids),
        )
        for record_num, (read, ids) in enumerate(seqs2id_dict.items(), start=1)
    )
    SeqIO.write(seq_records, deduplicated_file, "fasta")
    return deduplicated_file

'''
def correct_amplicon_errors(self, orginal_file, df_data):
"""
Expand Down
10 changes: 9 additions & 1 deletion src/noise2read/noise2read.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,9 @@ def main():
else:
config.correct_data = corrected_file
logger.info("Error Correction finished.")
if config.deduplication:
EC.get_deduplication(config.correct_data)

del DG, EC
#MM.measure()
#gc.collect()
Expand Down Expand Up @@ -243,6 +246,8 @@ def main():
else:
config.correct_data = corrected_file
logger.info("Error Correction finished.")
if config.deduplication:
EC.get_deduplication(config.correct_data)
del DG, EC
#MM.measure()
#gc.collect()
Expand Down Expand Up @@ -339,6 +344,9 @@ def main():
logger.warning("No genuine or negative samples for amplicon errors prediction!")
config.correct_data = mid_result
logger.info("Error Correction finished.")
if config.deduplication:
EC.get_deduplication(config.correct_data)

del DG, EC
#MM.measure()
#gc.collect()
Expand Down Expand Up @@ -563,7 +571,7 @@ def main():
if os.path.exists(bcool_dir):
os.system("rm -rf %s" % bcool_dir)
############################################################################################################################
# elif module_arg == "extract_isolates":
# elif module_arg == "deduplication":

else:
# logger.error("Invalid module name, please check.")
Expand Down

0 comments on commit b1da6a1

Please sign in to comment.