Source code for scReadSim.scATAC_GenerateBAM

import pandas as pd
import numpy as np
import csv
import collections
import os
pd.options.mode.chained_assignment = None  # default='warn'
import random
import subprocess
from tqdm import tqdm
import pysam

[docs] def flatten(x): """Flatten a nested list. """ if isinstance(x, collections.Iterable): return [a for i in x for a in flatten(i)] else: return [x]
[docs] def cellbarcode_generator(length, size=10): """Generate random cellbarcode. Parameters ---------- length: `int` Number of cells. size: `int` (default: '10') Size of cell barcode. Default value is 10 bp. Return ------ cb_list: `list` List of randomly generated cell barcodes. """ chars = 'ACGT' cb_list = [''.join(random.choice(chars) for _ in range(size)) for cell in range(length)] return cb_list
[docs] def find_leftnearest_nonpeak(non_peak_list, gray_peak): """Find the gray peak's left nearest non-peak from a non-peak list. """ dist1 = np.abs(non_peak_list[:,2].astype(int) - int(gray_peak[1])) id = dist1.argmin() return id
[docs] def scATAC_GenerateBAMCoord(bed_file, count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=50, random_noise_mode=False, GrayAreaModeling=False): """Generate Synthetic reads in BED format. Parameters ---------- bed_file: `str` Features' bed file to generate the synthetic reads (Generated by function `Utility.scATAC_CreateFeatureSets`). count_mat_file: `str` The path to the synthetic count matrix generated by `GenerateSyntheticCount.scATAC_GenerateSyntheticCount`. synthetic_cell_label_file: `str` Synthetic cell label file generated by `scATAC_GenerateSyntheticCount`. read_bedfile_prename: `str` Specify the base name of output bed file. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory for synthetic reads bed file. OUTPUT_cells_barcode_file: `str` Specify the file name storing the synthetic cell barcodes. jitter_size: `int` (default: '5') Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp. read_len: `int` (default: '50') Specify the length of synthetic reads. Default value is 50 bp. random_noise_mode: `bool` (default: 'False') Specify whether to use a uniform distribution of reads. GrayAreaModeling: `bool` (default: 'False') Specify whether to generate synthetic reads for Gray Areas using non-peak counts. Do not specify 'True' when generating reads for peaks. """ count_mat_df = pd.read_csv("%s" % (count_mat_file), header=None, delimiter="\t") # Remove count matrix first column: feature names count_mat = count_mat_df.iloc[: , 1:].to_numpy() count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten() n_cell = np.shape(count_mat)[1] samfile = pysam.AlignmentFile(INPUT_bamfile, "rb") with open(bed_file) as file: reader = csv.reader(file, delimiter="\t") open_peak = np.asarray(list(reader)) peak_nonzero_id = np.nonzero(count_mat.sum(axis=1))[0] random.seed(2022) random_cellbarcode_list = cellbarcode_generator(n_cell, size=16) with open(outdirectory + "/"+ OUTPUT_cells_barcode_file, 'w') as f: for item in random_cellbarcode_list: f.write(item + "\n") cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, count_mat_cluster)).transpose() with open(outdirectory + "/"+ OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f: for item in cellbarcode_list_withclusters: f.write("\t".join(item) + "\n") # Create read 1 and read 2 files with open("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1: pass with open("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2: pass print("[scReadSim] Generating Synthetic Reads for Feature Set: %s" % (bed_file)) for relative_peak_ind in tqdm(range(len(peak_nonzero_id))): peak_ind = peak_nonzero_id[relative_peak_ind] rec = open_peak[peak_ind] rec_name = '_'.join(rec) reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) # tic = time.time() # Extract read information reads_str = [] for read in reads: if read.is_reverse==1: strand = -1 else: strand = 1 if read.is_read1==1: read_order = 1 else: read_order = 2 start = read.reference_start mate_start = read.next_reference_start read_len_cur = read.query_alignment_length read_info = [start, mate_start, read_len_cur, read_order, strand] reads_str.append(read_info) if len(reads_str) > 0: # If no real reads exist in the peak, skip count_vec = count_mat[peak_ind,:] # Synthetic umi count count_frag_vec = np.ceil(count_vec/2).astype(int) npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur # npair_read_synthetic = np.ceil(np.sum(count_vec)/2).astype(int) # total number of synthetic reads read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:] # Sample starting position if random noise mode is on, or use real read starting position if random_noise_mode == True: read_synthetic_start = np.random.randint(int(rec[1]), int(rec[2]), size=npair_read_synthetic) else: read_synthetic_start = read_sampled_unsplit[:,0].astype(int) # Generate Read Name nonempty_cell_ind = np.where(count_frag_vec != 0)[0] target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2]) read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])] # Create dataframe for unspiltted sampled reads reads_cur = pd.DataFrame({ 'chr': rec[0], 'r_start': read_synthetic_start, 'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int), 'read_name': read_name_list, 'length': read_sampled_unsplit[:,2], 'read_order': read_sampled_unsplit[:,3].astype(int), 'strand': read_sampled_unsplit[:,4].astype(int), 'mate_strand': -read_sampled_unsplit[:,4].astype(int) }) contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1] reads_cur['read_length'] = read_len reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator]) # Add jitter size to read positions jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec # Split read 1 and read 2 read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True) read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True) read_1_df['read_name'] = read_name_list read_2_df['read_name'] = read_name_list # read_1_df['read_length'] = read_len # read_2_df['read_length'] = read_len read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']] read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']] read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']] read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']] if read_1_df_order.shape[0] != read_2_df_order.shape[0]: print("[Warning] Peak %s read 1 and read 2 not identical!", relative_peak_ind) if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0: print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (relative_peak_ind, rec_name)) ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve] read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve] else: read_1_df_order_removeNegRead = read_1_df_order read_2_df_order_removeNegRead = read_2_df_order read_1_df_order_removeNegRead.to_csv("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') read_2_df_order_removeNegRead.to_csv("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') if npair_read_synthetic != read_1_df_order_removeNegRead.shape[0]: print("Target read pair %s | Sample synthetic read pair %s" % (npair_read_synthetic, read_1_df_order_removeNegRead.shape[0])) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 bed file: %s/%s.read1.bed" % (outdirectory, read_bedfile_prename)) print("[scReadSim] Read 2 bed file: %s/%s.read2.bed" % (outdirectory, read_bedfile_prename)) # Modeling Gray Areas if GrayAreaModeling == True: print("\n[scReadSim] Generating reads for Gray Area...") # If gray area bed file not found, pring the error. try: with open(outdirectory + "/" + "scReadSim.grayareas.bed") as file: reader = csv.reader(file, delimiter="\t") GreyArea_set = np.asarray(list(reader)) except Exception as e: print("[Error] Gray Area Bed File not Found: %s" % (outdirectory, scReadSim.grayareas.bed)) with open("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, 'w') as f_2: pass # Create read 1 and read 2 files with open("%s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1: pass with open("%s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2: pass random.seed(2022) for peak_id in tqdm(range(len(GreyArea_set))): grey_area = GreyArea_set[peak_id] rec_name = '_'.join(grey_area) grey_length = int(grey_area[2]) - int(grey_area[1]) idx = find_leftnearest_nonpeak(open_peak, grey_area) nonpeak_cur_count = count_mat[idx,:] nonpeak_cur_length = int(open_peak[idx,2]) - int(open_peak[idx,1]) if np.sum(nonpeak_cur_count) == 0: grey_count_vec = pd.DataFrame(np.zeros(n_cell, dtype=int)).T grey_count_vec.to_csv("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, header=None, index=None, sep='\t', mode='a') continue # print zero counts for grey area if no reads in non-peak count mat # scaled_grey_count = np.round(nonpeak_cur_count * grey_length / nonpeak_cur_length).astype(int) scaled_grey_count = nonpeak_cur_count * np.random.binomial(1, min(grey_length / nonpeak_cur_length, 1), len(nonpeak_cur_count)) # Write out grey area synthetic count matrix (optional) grey_count_vec = pd.DataFrame(np.append(rec_name, scaled_grey_count)).T grey_count_vec.to_csv("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, header=None, index=None, sep='\t', mode='a') if np.sum(scaled_grey_count) == 0: continue # if no synthetic count for grey then skip the peak reads = samfile.fetch(grey_area[0], int(grey_area[1]), int(grey_area[2])) # Extract read information reads_str = [] for read in reads: if read.is_reverse==1: strand = -1 else: strand = 1 if read.is_read1==1: read_order = 1 else: read_order = 2 start = read.reference_start mate_start = read.next_reference_start read_len_cur = read.query_alignment_length read_info = [start, mate_start, read_len_cur, read_order, strand] reads_str.append(read_info) if len(reads_str) == 0: # If no real reads exist in the peak, skip continue count_frag_vec = np.ceil(scaled_grey_count/2).astype(int) npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:] # Sample starting position if random noise mode is on, or use real read starting position if random_noise_mode == True: read_synthetic_start = np.random.randint(int(grey_area[1]), int(grey_area[2]), size=npair_read_synthetic) else: read_synthetic_start = read_sampled_unsplit[:,0].astype(int) # Generate Read Name nonempty_cell_ind = np.where(count_frag_vec != 0)[0] target_peak_concat = grey_area[0] + ":" + str(grey_area[1]) + "-" + str(grey_area[2]) read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + ":" + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])] # Create dataframe for unspiltted sampled reads reads_cur = pd.DataFrame({ 'chr': grey_area[0], 'r_start': read_synthetic_start, 'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int), 'read_name': read_name_list, 'length': read_sampled_unsplit[:,2], 'read_order': read_sampled_unsplit[:,3].astype(int), 'strand': read_sampled_unsplit[:,4].astype(int), 'mate_strand': -read_sampled_unsplit[:,4].astype(int) }) contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1] reads_cur['read_length'] = read_len reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator]) # Add jitter size to read positions jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec # Split read 1 and read 2 read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True) read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True) read_1_df['read_name'] = read_name_list read_2_df['read_name'] = read_name_list # read_1_df['read_length'] = read_len # read_2_df['read_length'] = read_len read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']] read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']] read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']] read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']] if read_1_df_order.shape[0] != read_2_df_order.shape[0]: print("[Warning] Gray Area %s read 1 and read 2 not identical!", peak_id) if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0: print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (peak_id, rec_name)) ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve] read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve] else: read_1_df_order_removeNegRead = read_1_df_order read_2_df_order_removeNegRead = read_2_df_order read_1_df_order_removeNegRead.to_csv("%s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') read_2_df_order_removeNegRead.to_csv("%s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') if npair_read_synthetic != read_1_df_order_removeNegRead.shape[0]: print("Target read pair %s | Sample synthetic read pair %s" % (npair_read_synthetic, read_1_df_order_removeNegRead.shape[0])) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 Bed File: %s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename)) print("[scReadSim] Read 2 Bed File: %s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename)) print("[scReadSim] Done.")
[docs] def scATAC_GenerateBAMCoord_OutputPeak(target_peak_assignment_file, count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=50, random_noise_mode = False): """Generate Synthetic reads in BED format. Parameters ---------- target_peak_assignment_file: `str` Mapping file between input peaks and output peaks, output by 'FeatureMapping'. count_mat_file: `str` The path to the synthetic count matrix generated by `GenerateSyntheticCount.scATAC_GenerateSyntheticCount`. synthetic_cell_label_file: `str` Synthetic cell label file generated by `scATAC_GenerateSyntheticCount`. read_bedfile_prename: `str` Specify the base name of output bed file. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory for synthetic reads bed file. OUTPUT_cells_barcode_file: `str` Specify the file name storing the synthetic cell barcodes. jitter_size: `int` (default: '5') Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp. read_len: `int` (default: '50') Specify the length of synthetic reads. Default value is 50 bp. random_noise_mode: 'bool' (default: 'False') Specify whether to use a uniform distribution of reads. """ count_mat_df = pd.read_csv("%s" % (count_mat_file), header=None, delimiter="\t") # Remove count matrix first column: feature names count_mat = count_mat_df.iloc[: , 1:].to_numpy() count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten() n_cell = np.shape(count_mat)[1] samfile = pysam.AlignmentFile(INPUT_bamfile, "rb") with open(target_peak_assignment_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) peak_nonzero_id = np.nonzero(count_mat.sum(axis=1))[0] random.seed(2022) random_cellbarcode_list = cellbarcode_generator(n_cell, size=16) with open(outdirectory + "/"+ OUTPUT_cells_barcode_file, 'w') as f: for item in random_cellbarcode_list: f.write(item + "\n") cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, count_mat_cluster)).transpose() with open(outdirectory + "/"+ OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f: for item in cellbarcode_list_withclusters: f.write("\t".join(item) + "\n") # Create read 1 and read 2 files with open("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1: pass with open("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2: pass # w/ Target Peak print("[scReadSim] Generating Synthetic Reads for Feature Set: %s" % (target_peak_assignment_file)) for relative_peak_ind in tqdm(range(len(peak_nonzero_id))): peak_ind = peak_nonzero_id[relative_peak_ind] rec = open_peak[peak_ind] rec_name = '_'.join(rec) true_peak_concat = rec[3] + ":" + str(rec[4]) + "-" + str(rec[5]) target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2]) if int(rec[2]) - int(rec[1]) == 0: print("Peak %s has identical start and end position. Skip.") continue shift_number = int(rec[1]) - int(rec[4]) reads = samfile.fetch(rec[3], int(rec[4]), int(rec[5])) # Extract reads from true peaks # tic = time.time() # Extract read information reads_str = [] for read in reads: if read.is_reverse==1: strand = -1 else: strand = 1 if read.is_read1==1: read_order = 1 else: read_order = 2 start = read.reference_start mate_start = read.next_reference_start read_len_cur = read.query_alignment_length read_info = [start, mate_start, read_len_cur, read_order, strand] reads_str.append(read_info) # Sample npair_read_synthetic read 1 from reads and reserve fragment length if len(reads_str) > 0: # If no real reads exist in the peak, skip count_vec = count_mat[peak_ind,:] # Synthetic umi count count_frag_vec = np.ceil(count_vec/2).astype(int) npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur # npair_read_synthetic = np.ceil(np.sum(count_vec)/2).astype(int) # total number of synthetic reads read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:] # Sample starting position if random noise mode is on, or use real read starting position if random_noise_mode == True: read_synthetic_start = np.random.randint(int(rec[1]), int(rec[2]), size=npair_read_synthetic) else: read_synthetic_start = read_sampled_unsplit[:,0].astype(int) + shift_number # Generate Read Name nonempty_cell_ind = np.where(count_frag_vec != 0)[0] target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2]) read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + ":" + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])] # Create dataframe for unspiltted sampled reads reads_cur = pd.DataFrame({ 'chr': rec[0], 'r_start': read_synthetic_start, 'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int), 'read_name': read_name_list, 'length': read_sampled_unsplit[:,2], 'read_order': read_sampled_unsplit[:,3].astype(int), 'strand': read_sampled_unsplit[:,4].astype(int), 'mate_strand': -read_sampled_unsplit[:,4].astype(int) }) contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1] reads_cur['read_length'] = read_len reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator]) # Add jitter size to read positions jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec # Split read 1 and read 2 read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True) read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True) read_1_df['read_name'] = read_name_list read_2_df['read_name'] = read_name_list read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']] read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']] read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']] read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']] if read_2_df_order.shape[0] != read_2_df_order.shape[0]: print("Peak %s read 1 and read 2 not identical!", relative_peak_ind) if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0: print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (relative_peak_ind, rec_name)) ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve] read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve] else: read_1_df_order_removeNegRead = read_1_df_order read_2_df_order_removeNegRead = read_2_df_order read_1_df_order_removeNegRead.to_csv("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') read_2_df_order_removeNegRead.to_csv("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') print("\n[scReadSim] Created:") print("[scReadSim] Read 1 bed file: %s/%s.read1.bed" % (outdirectory, read_bedfile_prename)) print("[scReadSim] Read 2 bed file: %s/%s.read2.bed" % (outdirectory, read_bedfile_prename)) print("[scReadSim] Done.")
[docs] def scATAC_CombineBED(outdirectory, peak_read_bedfile_prename, nonpeak_read_bedfile_prename, BED_filename_combined_pre, GrayAreaModeling=True): """Combine the bed files of foreground and background feature sets into one bed file. Parameters ---------- outdirectory: `str` Directory of `peak_read_bedfile_prename`.txt and `nonpeak_read_bedfile_prename`.txt. peak_read_bedfile_prename: `str` Base name of the bed file containig synthetic reads for peaks (generated by function `scATAC_GenerateBAM.scATAC_GenerateBAMCoord`). nonpeak_read_bedfile_prename: 'str' Base name of the bed file containig synthetic reads for non-peaks (generated by function `scATAC_GenerateBAM.scATAC_GenerateBAMCoord`). BED_filename_combined_pre: 'str' Specify the combined syntehtic reads bed file prename. The combined bed file will be output to `outdirectory`. GrayAreaModeling: `bool` (default: 'True') Specify whether to combine gray area's reads. """ if GrayAreaModeling: print("[scReadSim] Combining synthetic read 1 BED files from peaks, non-peaks and gray areas.") combine_read1_cmd = "cat %s/%s.read1.bed %s/%s.read1.bed %s/%s.GrayArea.read1.bed > %s/%s.read1.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre) output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create combine synthetic read1 bed files:\n', error.decode()) print("[scReadSim] Combining synthetic read 2 BED files from peaks, non-peaks and gray areas.") combine_read2_cmd = "cat %s/%s.read2.bed %s/%s.read2.bed %s/%s.GrayArea.read2.bed > %s/%s.read2.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre) output, error = subprocess.Popen(combine_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode()) # sys.exit('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode()) else: print("[scReadSim] Combining synthetic read 1 BED files from peaks, non-peaks.") combine_read1_cmd = "cat %s/%s.read1.bed %s/%s.read1.bed > %s/%s.read1.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre) output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create combine synthetic read1 bed files:\n', error.decode()) print("[scReadSim] Combining synthetic read 2 BED files from peaks, non-peaks.") combine_read2_cmd = "cat %s/%s.read2.bed %s/%s.read2.bed > %s/%s.read2.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre) output, error = subprocess.Popen(combine_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode()) # sys.exit('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Combined read 1 BED file: %s/%s.read1.bed" % (outdirectory, BED_filename_combined_pre)) print("[scReadSim] Combined read 2 BED file: %s/%s.read2.bed" % (outdirectory, BED_filename_combined_pre)) print("[scReadSim] Done.")
[docs] def scATAC_BED2FASTQ(bedtools_directory, seqtk_directory, referenceGenome_file, outdirectory, BED_filename_combined, synthetic_fastq_prename): """Convert Synthetic reads from BED to FASTQ. Parameters ---------- bedtools_directory: `str` Directory of software bedtools. seqtk_directory: `str` Directory of software seqtk. referenceGenome_file: `str` Directory of the reference genome FASTA file that the synthteic reads should align. outdirectory: `str` Output directory of the synthteic bed file and its corresponding cell barcodes file. BED_filename_combined: `str` Specify the base name of output bed file of function 'scATAC_CombineBED'. synthetic_fastq_prename: `str` Specify the base name of the output FASTQ files. """ # Create FASTA print('[scReadSim] Generating synthetic read FASTA files...') fasta_read1_cmd = "%s/bedtools getfasta -s -fi %s -bed %s/%s.read1.bed -fo %s/%s.read1.bed2fa.strand.fa -nameOnly" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print(error.decode()) fasta_read2_cmd = "%s/bedtools getfasta -s -fi %s -bed %s/%s.read2.bed -fo %s/%s.read2.bed2fa.strand.fa -nameOnly" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print(error.decode()) # remove (-) or (+) org_fasta_read1_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read1.bed2fa.strand.fa > %s/%s.read1.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(org_fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to remove strand infomormation from synthetic read1 fasta file:', error.decode()) org_fasta_read2_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read2.bed2fa.strand.fa > %s/%s.read2.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(org_fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to remove strand infomormation from synthetic read2 fasta file:', error.decode()) # FASTA to FASTQ print('[scReadSim] Generating synthetic read FASTQ files...') fastq_read1_cmd = "%s/seqtk seq -F 'F' %s/%s.read1.bed2fa.fa > %s/%s.read1.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to convert read1 synthetic fasta file to fastq file:', error.decode()) fastq_read2_cmd = "%s/seqtk seq -F 'F' %s/%s.read2.bed2fa.fa > %s/%s.read2.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to convert read2 synthetic fasta file to fastq file:', error.decode()) print('[scReadSim] Sorting FASTQ files...') sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode()) sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Done.")
[docs] def AlignSyntheticBam_Pair(bowtie2_directory, samtools_directory, outdirectory, referenceGenome_name, referenceGenome_dir, synthetic_fastq_prename, output_BAM_pre): """Convert Synthetic reads from FASTQ to BAM. Parameters ---------- bowtie2_directory: `str` Path to software bowtie2. samtools_directory: `str` Path to software samtools. outdirectory: `str` Specify the output directory of the synthteic BAM file. referenceGenome_name: `str` Base name of the eference genome FASTA file. For example, you should input "chr1" for file "chr1.fa". referenceGenome_dir: `str` Path to the reference genome FASTA file. synthetic_fastq_prename: `str` Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`. output_BAM_pre: `str` Specify the base name of the output BAM file. """ print('[scReadSim] Aligning FASTQ files onto Reference Genome Files with Bowtie2...') alignment_cmd = "%s/bowtie2 --minins 0 --maxins 1200 -x %s/%s -1 %s/%s.read1.bed2fa.sorted.fq -2 %s/%s.read2.bed2fa.sorted.fq | %s/samtools view -bS - > %s/%s.synthetic.noCB.bam" % (bowtie2_directory, referenceGenome_dir, referenceGenome_name, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename, samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(alignment_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() print('[Bowtie2] Aligning:\n', error.decode()) print('[scReadSim] Alignment done.') print('[scReadSim] Generating cell barcode tag...') addBC2BAM_header_cmd = "%s/samtools view %s/%s.synthetic.noCB.bam -H > %s/%s.synthetic.noCB.header.sam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre) output, error = subprocess.Popen(addBC2BAM_header_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() addBC2BAM_cmd = "cat <( cat %s/%s.synthetic.noCB.header.sam ) <( paste <(%s/samtools view %s/%s.synthetic.noCB.bam ) <(%s/samtools view %s/%s.synthetic.noCB.bam | cut -f1 | cut -d':' -f1 | sed -e 's/^/CB:Z:/')) | %s/samtools view -bS - > %s/%s.synthetic.bam" % (outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(addBC2BAM_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to add BC tag to synthetic BAM file:', error.decode()) print('[scReadSim] Sorting and indexing BAM file...') sortBAMcmd = "%s/samtools sort %s/%s.synthetic.bam > %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre) output, error = subprocess.Popen(sortBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort synthetic BAM file:', error.decode()) indexBAMcmd = "%s/samtools index %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(indexBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to index synthetic BAM file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Synthetic read BAM file: %s/%s.synthetic.sorted.bam" % (outdirectory, output_BAM_pre)) print("[scReadSim] Done.")
[docs] def ErrorBase(base, prop, base_call_ref): """Sample random errors. """ err_base_call_id = np.random.choice(a=[0, 1, 2], size=1, p=prop)[0] err_base_call = base_call_ref[base][err_base_call_id] return err_base_call
[docs] def ErroneousRead(real_error_rate_read, read_df, output_fq_file): """Generate random errors according to input real data error rates. """ n_read = int(np.shape(read_df)[0]/4) ## Prepare Error rate real_error_rate_read_A = real_error_rate_read[['a_to_c_error_rate', 'a_to_g_error_rate', 'a_to_t_error_rate']].to_numpy() real_error_rate_read_A_prop = real_error_rate_read_A/real_error_rate_read_A.sum(axis=1,keepdims=1) real_error_rate_read_A_prop[np.isnan(real_error_rate_read_A_prop).any(axis=1),:] = 1/3 real_error_rate_read_C = real_error_rate_read[['c_to_a_error_rate', 'c_to_g_error_rate', 'c_to_t_error_rate']].to_numpy() real_error_rate_read_C_prop = real_error_rate_read_C/real_error_rate_read_C.sum(axis=1,keepdims=1) real_error_rate_read_C_prop[np.isnan(real_error_rate_read_C_prop).any(axis=1),:] = 1/3 real_error_rate_read_G = real_error_rate_read[['g_to_a_error_rate', 'g_to_c_error_rate', 'g_to_t_error_rate']].to_numpy() real_error_rate_read_G_prop = real_error_rate_read_G/real_error_rate_read_G.sum(axis=1,keepdims=1) real_error_rate_read_G_prop[np.isnan(real_error_rate_read_G_prop).any(axis=1),:] = 1/3 real_error_rate_read_T = real_error_rate_read[['t_to_a_error_rate', 't_to_c_error_rate', 't_to_g_error_rate']].to_numpy() real_error_rate_read_T_prop = real_error_rate_read_T/real_error_rate_read_T.sum(axis=1,keepdims=1) real_error_rate_read_T_prop[np.isnan(real_error_rate_read_T_prop).any(axis=1),:] = 1/3 # Base decision matrix real_error_rate_read_prop_dict = {'A': real_error_rate_read_A_prop, 'C': real_error_rate_read_C_prop, 'G': real_error_rate_read_G_prop, 'T': real_error_rate_read_T_prop} # Error decision vector real_error_rate_read_perbase = real_error_rate_read['error_rate'].to_numpy() ## Decide whether error occurs for each read read_length = real_error_rate_read.shape[0] error_read_perbase_indicator = np.zeros((n_read, read_length), dtype=int) random.seed(1) for base_id in range(read_length): error_read_perbase_indicator[:,base_id] = np.random.binomial(n=1, p=real_error_rate_read_perbase[base_id], size=n_read) erroneous_read_id = np.where(np.sum(error_read_perbase_indicator, axis=1) > 0)[0] ## For erroneous reads, generate erroneous base based on the probability matrix base_call_ref = {'A': ['C', 'G', 'T'], 'C': ['A', 'G', 'T'], 'G': ['A', 'C', 'T'], 'T': ['A', 'C', 'G']} random.seed(2021) read_df_witherror = read_df for read_id_tqdm in tqdm(range(len(erroneous_read_id))): read_id = erroneous_read_id[read_id_tqdm] read_cur = read_df[(read_id*4) : (read_id*4 + 4)] bases = list(read_cur[1][0].upper()) Qscores = list(read_cur[3][0]) for errorneous_base_id in np.where(error_read_perbase_indicator[read_id,:] > 0)[0]: if errorneous_base_id < len(bases): base_cur = bases[errorneous_base_id] if base_cur in real_error_rate_read_prop_dict: prop = real_error_rate_read_prop_dict[base_cur][errorneous_base_id] # Decide error base err_base_call = ErrorBase(base_cur, prop, base_call_ref) bases[errorneous_base_id] = err_base_call # Decide Q score # Use 9 (Phred score 24) for erroneous base Qscores[errorneous_base_id] = '9' read_df_witherror[read_id*4+1] = ''.join(bases) read_df_witherror[read_id*4+3] = ''.join(Qscores) ## Write out np.savetxt(output_fq_file, read_df_witherror, fmt='%s')
[docs] def SubstiError_Pair(real_error_rate_file, outdirectory, synthetic_fastq_prename): """Generate random errors for paired-end sequencing reads according to input real data error rates. Parameters ---------- real_error_rate_file: `str` Path to software fgbio jar script. outdirectory: `str` Specify the output directory of the synthteic FASTQ file with random errors. synthetic_fastq_prename: `str` Specify the base name of the synthetic erroneous reads' FASTQ files. """ # Read in real error rates real_error_rate_dir = real_error_rate_file real_error_rate = pd.read_csv(real_error_rate_dir, header=0, delimiter="\t") real_error_rate_read1 = real_error_rate[real_error_rate['read_number'] == 1] real_error_rate_read2 = real_error_rate[real_error_rate['read_number'] == 2] # Read in perfect reads read1_fq = outdirectory + "/" + synthetic_fastq_prename + ".read1.bed2fa.fq" read2_fq = outdirectory + "/" + synthetic_fastq_prename + ".read2.bed2fa.fq" read1_df = pd.read_csv(read1_fq, header=None).to_numpy() read2_df = pd.read_csv(read2_fq, header=None).to_numpy() # Generate random error according to Real data ErroneousRead(real_error_rate_read1, read1_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read1.bed2fa.fq") ErroneousRead(real_error_rate_read2, read2_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read2.bed2fa.fq")
[docs] def scATAC_ErrorBase(fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory, synthetic_fastq_prename): """Introduce random substitution errors into synthetic reads according to real data error rates. Parameters ---------- fgbio_jarfile: `str` Path to software fgbio jar script. INPUT_bamfile: `str` Input BAM file for anlaysis. referenceGenome_file: 'str' Reference genome FASTA file that the synthteic reads should align. outdirectory: `str` Specify the output directory of the synthteic FASTQ file with random errors. synthetic_fastq_prename: `str` Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`. """ print('[scReadSim] Substitution error calculating...') combine_read1_cmd = "java -jar %s ErrorRateByReadPosition -i %s -r %s -o %s/Real --collapse false" % (fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory) output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[Messages] Running fgbio on real bam file:\n', error.decode()) # Generate Errors into fastq files print('[scReadSim] Generting synthetic read FASTQ files with substitution errors...') real_error_rate_file = outdirectory + "/" + "Real.error_rate_by_read_position.txt" SubstiError_Pair(real_error_rate_file, outdirectory, synthetic_fastq_prename) # Combine FASTQs print('[scReadSim] Sorting FASTQ files...') sort_fastq_read1_cmd = "cat %s/%s.ErrorIncluded.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode()) sort_fastq_read2_cmd = "cat %s/%s.ErrorIncluded.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 FASTQ file with substitutional error: %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Read 2 FASTQ file with substitutional error: %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Done.")
[docs] def scATAC_GenerateSyntheticRead_MultiSample(INPUT_bamfile, outdirectory, bedtools_directory, seqtk_directory, fgbio_jarfile, referenceGenome_file, read_len=50): """Multi-sample/replicate implement of scReadSim for simulating scATAC-seq synthetic reads. Parameters ---------- INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. bedtools_directory: `str` Directory of software bedtools. seqtk_directory: `str` Directory of software seqtk. fgbio_jarfile: `str` Path to software fgbio jar script. referenceGenome_file: 'str' Reference genome FASTA file that the synthteic reads should align. read_len: `int` (default: '50') Specify the length of synthetic reads. Default value is 50 bp. """ for rep_id in range(len(INPUT_bamfile)): print("\n[scReadSim] Generating synthetic reads for sample %s..." % str(rep_id+1)) sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) # Obtain bedfile peak_bedfile = sample_output_d + "/" + "scReadSim.superset.peak.bed" nonpeak_bedfile = sample_output_d + "/" + "scReadSim.superset.nonpeak.bed" # Specify the output count matrices' prenames count_mat_peak_filename = "Rep%s.peak.countmatrix" % str(rep_id+1) count_mat_nonpeak_filename = "Rep%s.nonpeak.countmatrix" % str(rep_id+1) # Specify the names of synthetic count matrices (generated by GenerateSyntheticCount.scATAC_GenerateSyntheticCount) synthetic_countmat_peak_file = count_mat_peak_filename + ".scDesign2Simulated.txt" synthetic_countmat_nonpeak_file = count_mat_nonpeak_filename + ".scDesign2Simulated.txt" # Specify the base name of bed files containing synthetic reads OUTPUT_cells_barcode_file = "synthetic_cell_barcode_rep%s.txt" % str(rep_id+1) peak_read_bedfile_prename = "Rep%s.syntheticBAM.peak" % str(rep_id+1) nonpeak_read_bedfile_prename = "Rep%s.syntheticBAM.nonpeak" % str(rep_id+1) BED_filename_combined_pre = "Rep%s.syntheticBAM.combined" % str(rep_id+1) synthetic_cell_label_file = count_mat_peak_filename + ".scDesign2Simulated.CellTypeLabel.txt" # Create synthetic read bed file for peaks print("\n[scReadSim] Generating synthetic read BED file for peaks...") scATAC_GenerateBAMCoord(bed_file=peak_bedfile, count_mat_file=sample_output_d + "/" + synthetic_countmat_peak_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=peak_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len) # Create synthetic read bed file for non-peaks print("\n[scReadSim] Generating synthetic read BED file for non-peaks...") scATAC_GenerateBAMCoord(bed_file=nonpeak_bedfile, count_mat_file=sample_output_d + "/" + synthetic_countmat_nonpeak_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=nonpeak_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len, GrayAreaModeling=True) # Combine bed fileS scATAC_CombineBED(outdirectory=sample_output_d, peak_read_bedfile_prename=peak_read_bedfile_prename, nonpeak_read_bedfile_prename=nonpeak_read_bedfile_prename, BED_filename_combined_pre=BED_filename_combined_pre) # Generate FASTQ files synthetic_fastq_prename = BED_filename_combined_pre # Convert combined bed file into FASTQ files print("\n[scReadSim] Generating read synthetic FASTQ files...") scATAC_BED2FASTQ(bedtools_directory=bedtools_directory, seqtk_directory=seqtk_directory, referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, BED_filename_combined=BED_filename_combined_pre, synthetic_fastq_prename=synthetic_fastq_prename) # Generate reads with errors in FASTQs print("\n[scReadSim] Generating read synthetic FASTQ files with substitutional error...") scATAC_ErrorBase(fgbio_jarfile=fgbio_jarfile, INPUT_bamfile=INPUT_bamfile[rep_id], referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, synthetic_fastq_prename=synthetic_fastq_prename)