import pandas as pd
import numpy as np
import csv
import collections
import os
pd.options.mode.chained_assignment = None # default='warn'
import random
import subprocess
from tqdm import tqdm
import pysam
[docs]
def flatten(x):
"""Flatten a nested list.
"""
if isinstance(x, collections.Iterable):
return [a for i in x for a in flatten(i)]
else:
return [x]
[docs]
def cellbarcode_generator(length, size=10):
"""Generate random cellbarcode.
Parameters
----------
length: `int`
Number of cells.
size: `int` (default: '10')
Size of cell barcode. Default value is 10 bp.
Return
------
cb_list: `list`
List of randomly generated cell barcodes.
"""
chars = 'ACGT'
cb_list = [''.join(random.choice(chars) for _ in range(size)) for cell in range(length)]
return cb_list
[docs]
def find_leftnearest_nonpeak(non_peak_list, gray_peak):
"""Find the gray peak's left nearest non-peak from a non-peak list.
"""
dist1 = np.abs(non_peak_list[:,2].astype(int) - int(gray_peak[1]))
id = dist1.argmin()
return id
[docs]
def scATAC_GenerateBAMCoord(bed_file, count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=50, random_noise_mode=False, GrayAreaModeling=False):
"""Generate Synthetic reads in BED format.
Parameters
----------
bed_file: `str`
Features' bed file to generate the synthetic reads (Generated by function `Utility.scATAC_CreateFeatureSets`).
count_mat_file: `str`
The path to the synthetic count matrix generated by `GenerateSyntheticCount.scATAC_GenerateSyntheticCount`.
synthetic_cell_label_file: `str`
Synthetic cell label file generated by `scATAC_GenerateSyntheticCount`.
read_bedfile_prename: `str`
Specify the base name of output bed file.
INPUT_bamfile: `str`
Input BAM file for anlaysis.
outdirectory: `str`
Specify the output directory for synthetic reads bed file.
OUTPUT_cells_barcode_file: `str`
Specify the file name storing the synthetic cell barcodes.
jitter_size: `int` (default: '5')
Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp.
read_len: `int` (default: '50')
Specify the length of synthetic reads. Default value is 50 bp.
random_noise_mode: `bool` (default: 'False')
Specify whether to use a uniform distribution of reads.
GrayAreaModeling: `bool` (default: 'False')
Specify whether to generate synthetic reads for Gray Areas using non-peak counts. Do not specify 'True' when generating reads for peaks.
"""
count_mat_df = pd.read_csv("%s" % (count_mat_file), header=None, delimiter="\t")
# Remove count matrix first column: feature names
count_mat = count_mat_df.iloc[: , 1:].to_numpy()
count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten()
n_cell = np.shape(count_mat)[1]
samfile = pysam.AlignmentFile(INPUT_bamfile, "rb")
with open(bed_file) as file:
reader = csv.reader(file, delimiter="\t")
open_peak = np.asarray(list(reader))
peak_nonzero_id = np.nonzero(count_mat.sum(axis=1))[0]
random.seed(2022)
random_cellbarcode_list = cellbarcode_generator(n_cell, size=16)
with open(outdirectory + "/"+ OUTPUT_cells_barcode_file, 'w') as f:
for item in random_cellbarcode_list:
f.write(item + "\n")
cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, count_mat_cluster)).transpose()
with open(outdirectory + "/"+ OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f:
for item in cellbarcode_list_withclusters:
f.write("\t".join(item) + "\n")
# Create read 1 and read 2 files
with open("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1:
pass
with open("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2:
pass
print("[scReadSim] Generating Synthetic Reads for Feature Set: %s" % (bed_file))
for relative_peak_ind in tqdm(range(len(peak_nonzero_id))):
peak_ind = peak_nonzero_id[relative_peak_ind]
rec = open_peak[peak_ind]
rec_name = '_'.join(rec)
reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2]))
# tic = time.time()
# Extract read information
reads_str = []
for read in reads:
if read.is_reverse==1:
strand = -1
else:
strand = 1
if read.is_read1==1:
read_order = 1
else:
read_order = 2
start = read.reference_start
mate_start = read.next_reference_start
read_len_cur = read.query_alignment_length
read_info = [start, mate_start, read_len_cur, read_order, strand]
reads_str.append(read_info)
if len(reads_str) > 0: # If no real reads exist in the peak, skip
count_vec = count_mat[peak_ind,:] # Synthetic umi count
count_frag_vec = np.ceil(count_vec/2).astype(int)
npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur
# npair_read_synthetic = np.ceil(np.sum(count_vec)/2).astype(int) # total number of synthetic reads
read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:]
# Sample starting position if random noise mode is on, or use real read starting position
if random_noise_mode == True:
read_synthetic_start = np.random.randint(int(rec[1]), int(rec[2]), size=npair_read_synthetic)
else:
read_synthetic_start = read_sampled_unsplit[:,0].astype(int)
# Generate Read Name
nonempty_cell_ind = np.where(count_frag_vec != 0)[0]
target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2])
read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])]
# Create dataframe for unspiltted sampled reads
reads_cur = pd.DataFrame({
'chr': rec[0],
'r_start': read_synthetic_start,
'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int),
'read_name': read_name_list,
'length': read_sampled_unsplit[:,2],
'read_order': read_sampled_unsplit[:,3].astype(int),
'strand': read_sampled_unsplit[:,4].astype(int),
'mate_strand': -read_sampled_unsplit[:,4].astype(int)
})
contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1]
reads_cur['read_length'] = read_len
reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator])
# Add jitter size to read positions
jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur
reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec
reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec
reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
# Split read 1 and read 2
read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True)
read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True)
read_1_df['read_name'] = read_name_list
read_2_df['read_name'] = read_name_list
# read_1_df['read_length'] = read_len
# read_2_df['read_length'] = read_len
read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']]
read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']]
read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']]
read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']]
if read_1_df_order.shape[0] != read_2_df_order.shape[0]:
print("[Warning] Peak %s read 1 and read 2 not identical!", relative_peak_ind)
if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0:
print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (relative_peak_ind, rec_name))
ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative
read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve]
read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve]
else:
read_1_df_order_removeNegRead = read_1_df_order
read_2_df_order_removeNegRead = read_2_df_order
read_1_df_order_removeNegRead.to_csv("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
read_2_df_order_removeNegRead.to_csv("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
if npair_read_synthetic != read_1_df_order_removeNegRead.shape[0]:
print("Target read pair %s | Sample synthetic read pair %s" % (npair_read_synthetic, read_1_df_order_removeNegRead.shape[0]))
print("\n[scReadSim] Created:")
print("[scReadSim] Read 1 bed file: %s/%s.read1.bed" % (outdirectory, read_bedfile_prename))
print("[scReadSim] Read 2 bed file: %s/%s.read2.bed" % (outdirectory, read_bedfile_prename))
# Modeling Gray Areas
if GrayAreaModeling == True:
print("\n[scReadSim] Generating reads for Gray Area...")
# If gray area bed file not found, pring the error.
try:
with open(outdirectory + "/" + "scReadSim.grayareas.bed") as file:
reader = csv.reader(file, delimiter="\t")
GreyArea_set = np.asarray(list(reader))
except Exception as e:
print("[Error] Gray Area Bed File not Found: %s" % (outdirectory, scReadSim.grayareas.bed))
with open("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, 'w') as f_2:
pass
# Create read 1 and read 2 files
with open("%s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1:
pass
with open("%s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2:
pass
random.seed(2022)
for peak_id in tqdm(range(len(GreyArea_set))):
grey_area = GreyArea_set[peak_id]
rec_name = '_'.join(grey_area)
grey_length = int(grey_area[2]) - int(grey_area[1])
idx = find_leftnearest_nonpeak(open_peak, grey_area)
nonpeak_cur_count = count_mat[idx,:]
nonpeak_cur_length = int(open_peak[idx,2]) - int(open_peak[idx,1])
if np.sum(nonpeak_cur_count) == 0:
grey_count_vec = pd.DataFrame(np.zeros(n_cell, dtype=int)).T
grey_count_vec.to_csv("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, header=None, index=None, sep='\t', mode='a')
continue # print zero counts for grey area if no reads in non-peak count mat
# scaled_grey_count = np.round(nonpeak_cur_count * grey_length / nonpeak_cur_length).astype(int)
scaled_grey_count = nonpeak_cur_count * np.random.binomial(1, min(grey_length / nonpeak_cur_length, 1), len(nonpeak_cur_count))
# Write out grey area synthetic count matrix (optional)
grey_count_vec = pd.DataFrame(np.append(rec_name, scaled_grey_count)).T
grey_count_vec.to_csv("%s/GrayArea_Assigned_Synthetic_CountMatrix.txt" % outdirectory, header=None, index=None, sep='\t', mode='a')
if np.sum(scaled_grey_count) == 0:
continue # if no synthetic count for grey then skip the peak
reads = samfile.fetch(grey_area[0], int(grey_area[1]), int(grey_area[2]))
# Extract read information
reads_str = []
for read in reads:
if read.is_reverse==1:
strand = -1
else:
strand = 1
if read.is_read1==1:
read_order = 1
else:
read_order = 2
start = read.reference_start
mate_start = read.next_reference_start
read_len_cur = read.query_alignment_length
read_info = [start, mate_start, read_len_cur, read_order, strand]
reads_str.append(read_info)
if len(reads_str) == 0: # If no real reads exist in the peak, skip
continue
count_frag_vec = np.ceil(scaled_grey_count/2).astype(int)
npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur
read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:]
# Sample starting position if random noise mode is on, or use real read starting position
if random_noise_mode == True:
read_synthetic_start = np.random.randint(int(grey_area[1]), int(grey_area[2]), size=npair_read_synthetic)
else:
read_synthetic_start = read_sampled_unsplit[:,0].astype(int)
# Generate Read Name
nonempty_cell_ind = np.where(count_frag_vec != 0)[0]
target_peak_concat = grey_area[0] + ":" + str(grey_area[1]) + "-" + str(grey_area[2])
read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + ":" + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])]
# Create dataframe for unspiltted sampled reads
reads_cur = pd.DataFrame({
'chr': grey_area[0],
'r_start': read_synthetic_start,
'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int),
'read_name': read_name_list,
'length': read_sampled_unsplit[:,2],
'read_order': read_sampled_unsplit[:,3].astype(int),
'strand': read_sampled_unsplit[:,4].astype(int),
'mate_strand': -read_sampled_unsplit[:,4].astype(int)
})
contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1]
reads_cur['read_length'] = read_len
reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator])
# Add jitter size to read positions
jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur
reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec
reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec
reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
# Split read 1 and read 2
read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True)
read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True)
read_1_df['read_name'] = read_name_list
read_2_df['read_name'] = read_name_list
# read_1_df['read_length'] = read_len
# read_2_df['read_length'] = read_len
read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']]
read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']]
read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']]
read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']]
if read_1_df_order.shape[0] != read_2_df_order.shape[0]:
print("[Warning] Gray Area %s read 1 and read 2 not identical!", peak_id)
if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0:
print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (peak_id, rec_name))
ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative
read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve]
read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve]
else:
read_1_df_order_removeNegRead = read_1_df_order
read_2_df_order_removeNegRead = read_2_df_order
read_1_df_order_removeNegRead.to_csv("%s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
read_2_df_order_removeNegRead.to_csv("%s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
if npair_read_synthetic != read_1_df_order_removeNegRead.shape[0]:
print("Target read pair %s | Sample synthetic read pair %s" % (npair_read_synthetic, read_1_df_order_removeNegRead.shape[0]))
print("\n[scReadSim] Created:")
print("[scReadSim] Read 1 Bed File: %s/%s.GrayArea.read1.bed" % (outdirectory, read_bedfile_prename))
print("[scReadSim] Read 2 Bed File: %s/%s.GrayArea.read2.bed" % (outdirectory, read_bedfile_prename))
print("[scReadSim] Done.")
[docs]
def scATAC_GenerateBAMCoord_OutputPeak(target_peak_assignment_file, count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=50, random_noise_mode = False):
"""Generate Synthetic reads in BED format.
Parameters
----------
target_peak_assignment_file: `str`
Mapping file between input peaks and output peaks, output by 'FeatureMapping'.
count_mat_file: `str`
The path to the synthetic count matrix generated by `GenerateSyntheticCount.scATAC_GenerateSyntheticCount`.
synthetic_cell_label_file: `str`
Synthetic cell label file generated by `scATAC_GenerateSyntheticCount`.
read_bedfile_prename: `str`
Specify the base name of output bed file.
INPUT_bamfile: `str`
Input BAM file for anlaysis.
outdirectory: `str`
Specify the output directory for synthetic reads bed file.
OUTPUT_cells_barcode_file: `str`
Specify the file name storing the synthetic cell barcodes.
jitter_size: `int` (default: '5')
Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp.
read_len: `int` (default: '50')
Specify the length of synthetic reads. Default value is 50 bp.
random_noise_mode: 'bool' (default: 'False')
Specify whether to use a uniform distribution of reads.
"""
count_mat_df = pd.read_csv("%s" % (count_mat_file), header=None, delimiter="\t")
# Remove count matrix first column: feature names
count_mat = count_mat_df.iloc[: , 1:].to_numpy()
count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten()
n_cell = np.shape(count_mat)[1]
samfile = pysam.AlignmentFile(INPUT_bamfile, "rb")
with open(target_peak_assignment_file) as open_peak:
reader = csv.reader(open_peak, delimiter="\t")
open_peak = np.asarray(list(reader))
peak_nonzero_id = np.nonzero(count_mat.sum(axis=1))[0]
random.seed(2022)
random_cellbarcode_list = cellbarcode_generator(n_cell, size=16)
with open(outdirectory + "/"+ OUTPUT_cells_barcode_file, 'w') as f:
for item in random_cellbarcode_list:
f.write(item + "\n")
cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, count_mat_cluster)).transpose()
with open(outdirectory + "/"+ OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f:
for item in cellbarcode_list_withclusters:
f.write("\t".join(item) + "\n")
# Create read 1 and read 2 files
with open("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), 'w') as f_1:
pass
with open("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), 'w') as f_2:
pass
# w/ Target Peak
print("[scReadSim] Generating Synthetic Reads for Feature Set: %s" % (target_peak_assignment_file))
for relative_peak_ind in tqdm(range(len(peak_nonzero_id))):
peak_ind = peak_nonzero_id[relative_peak_ind]
rec = open_peak[peak_ind]
rec_name = '_'.join(rec)
true_peak_concat = rec[3] + ":" + str(rec[4]) + "-" + str(rec[5])
target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2])
if int(rec[2]) - int(rec[1]) == 0:
print("Peak %s has identical start and end position. Skip.")
continue
shift_number = int(rec[1]) - int(rec[4])
reads = samfile.fetch(rec[3], int(rec[4]), int(rec[5])) # Extract reads from true peaks
# tic = time.time()
# Extract read information
reads_str = []
for read in reads:
if read.is_reverse==1:
strand = -1
else:
strand = 1
if read.is_read1==1:
read_order = 1
else:
read_order = 2
start = read.reference_start
mate_start = read.next_reference_start
read_len_cur = read.query_alignment_length
read_info = [start, mate_start, read_len_cur, read_order, strand]
reads_str.append(read_info)
# Sample npair_read_synthetic read 1 from reads and reserve fragment length
if len(reads_str) > 0: # If no real reads exist in the peak, skip
count_vec = count_mat[peak_ind,:] # Synthetic umi count
count_frag_vec = np.ceil(count_vec/2).astype(int)
npair_read_synthetic = np.sum(count_frag_vec).astype(int) # nrow(reads_cur) should equal to nfrag_cur
# npair_read_synthetic = np.ceil(np.sum(count_vec)/2).astype(int) # total number of synthetic reads
read_sampled_unsplit = np.array(reads_str)[np.random.choice(len(reads_str), size=npair_read_synthetic, replace=True),:]
# Sample starting position if random noise mode is on, or use real read starting position
if random_noise_mode == True:
read_synthetic_start = np.random.randint(int(rec[1]), int(rec[2]), size=npair_read_synthetic)
else:
read_synthetic_start = read_sampled_unsplit[:,0].astype(int) + shift_number
# Generate Read Name
nonempty_cell_ind = np.where(count_frag_vec != 0)[0]
target_peak_concat = rec[0] + ":" + str(rec[1]) + "-" + str(rec[2])
read_name_list = [random_cellbarcode_list[nonempty_cell_ind[ind]] + ":" + "CellNo" + str(nonempty_cell_ind[ind] + 1) + ":" + str(target_peak_concat) + "#" + str(count).zfill(4) for ind in range(len(nonempty_cell_ind)) for count in range(count_frag_vec[nonempty_cell_ind[ind]])]
# Create dataframe for unspiltted sampled reads
reads_cur = pd.DataFrame({
'chr': rec[0],
'r_start': read_synthetic_start,
'mate_start': read_sampled_unsplit[:,1].astype(int) + read_synthetic_start - read_sampled_unsplit[:,0].astype(int),
'read_name': read_name_list,
'length': read_sampled_unsplit[:,2],
'read_order': read_sampled_unsplit[:,3].astype(int),
'strand': read_sampled_unsplit[:,4].astype(int),
'mate_strand': -read_sampled_unsplit[:,4].astype(int)
})
contain_read_indicator = read_sampled_unsplit[:,0] == read_sampled_unsplit[:,1]
reads_cur['read_length'] = read_len
reads_cur['read_length'][contain_read_indicator] = abs(reads_cur['length'].astype(int)[contain_read_indicator])
# Add jitter size to read positions
jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=npair_read_synthetic).astype(int) # nrow(reads_cur) should equal to nfrag_cur
reads_cur['r_start_shifted'] = reads_cur['r_start'].astype(int) + jitter_value_vec
reads_cur['mate_start_shifted'] = reads_cur['mate_start'].astype(int) + jitter_value_vec
reads_cur['r_end_shifted'] = reads_cur['r_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
reads_cur['mate_end_shifted'] = reads_cur['mate_start'].astype(int) + reads_cur['read_length'].astype(int) + jitter_value_vec
# Split read 1 and read 2
read_1_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r1_start_shifted', 'r_end_shifted':'r1_end_shifted'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r1_start_shifted', 'mate_end_shifted':'r1_end_shifted', 'mate_strand': 'strand'})], ignore_index=True)
read_2_df = pd.concat([reads_cur.loc[reads_cur['read_order'] == 1, ['chr','mate_start_shifted', 'mate_end_shifted', 'read_length', 'mate_strand']].rename(columns={'mate_start_shifted':'r2_start_shifted', 'mate_end_shifted':'r2_end_shifted', 'mate_strand': 'strand'}), reads_cur.loc[reads_cur['read_order'] == 2, ['chr','r_start_shifted', 'r_end_shifted', 'read_length', 'strand']].rename(columns={'r_start_shifted':'r2_start_shifted', 'r_end_shifted':'r2_end_shifted'}), ], ignore_index=True)
read_1_df['read_name'] = read_name_list
read_2_df['read_name'] = read_name_list
read_1_df['strand'] = ['+' if x == 1 else '-' for x in read_1_df['strand']]
read_2_df['strand'] = ['+' if x == 1 else '-' for x in read_2_df['strand']]
read_1_df_order = read_1_df[['chr','r1_start_shifted', 'r1_end_shifted', 'read_name', 'read_length', 'strand']]
read_2_df_order = read_2_df[['chr','r2_start_shifted', 'r2_end_shifted', 'read_name', 'read_length', 'strand']]
if read_2_df_order.shape[0] != read_2_df_order.shape[0]:
print("Peak %s read 1 and read 2 not identical!", relative_peak_ind)
if np.sum(np.array(read_1_df_order[['r1_start_shifted']] < 0)) + np.sum(np.array(read_2_df_order[['r2_start_shifted']] < 0)) > 0:
print("[Warning] Synthetic read pair for Peak %s %s has read 1 or read 2 start position negative: synthetic read pair removed!" % (relative_peak_ind, rec_name))
ind_preserve = np.array(read_1_df_order[['r1_start_shifted']] >= 0) * np.array(read_2_df_order[['r2_start_shifted']] >= 0) # Remove rows with read 1 or read 2's start position negative
read_1_df_order_removeNegRead = read_1_df_order.loc[ind_preserve]
read_2_df_order_removeNegRead = read_2_df_order.loc[ind_preserve]
else:
read_1_df_order_removeNegRead = read_1_df_order
read_2_df_order_removeNegRead = read_2_df_order
read_1_df_order_removeNegRead.to_csv("%s/%s.read1.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
read_2_df_order_removeNegRead.to_csv("%s/%s.read2.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
print("\n[scReadSim] Created:")
print("[scReadSim] Read 1 bed file: %s/%s.read1.bed" % (outdirectory, read_bedfile_prename))
print("[scReadSim] Read 2 bed file: %s/%s.read2.bed" % (outdirectory, read_bedfile_prename))
print("[scReadSim] Done.")
[docs]
def scATAC_CombineBED(outdirectory, peak_read_bedfile_prename, nonpeak_read_bedfile_prename, BED_filename_combined_pre, GrayAreaModeling=True):
"""Combine the bed files of foreground and background feature sets into one bed file.
Parameters
----------
outdirectory: `str`
Directory of `peak_read_bedfile_prename`.txt and `nonpeak_read_bedfile_prename`.txt.
peak_read_bedfile_prename: `str`
Base name of the bed file containig synthetic reads for peaks (generated by function `scATAC_GenerateBAM.scATAC_GenerateBAMCoord`).
nonpeak_read_bedfile_prename: 'str'
Base name of the bed file containig synthetic reads for non-peaks (generated by function `scATAC_GenerateBAM.scATAC_GenerateBAMCoord`).
BED_filename_combined_pre: 'str'
Specify the combined syntehtic reads bed file prename. The combined bed file will be output to `outdirectory`.
GrayAreaModeling: `bool` (default: 'True')
Specify whether to combine gray area's reads.
"""
if GrayAreaModeling:
print("[scReadSim] Combining synthetic read 1 BED files from peaks, non-peaks and gray areas.")
combine_read1_cmd = "cat %s/%s.read1.bed %s/%s.read1.bed %s/%s.GrayArea.read1.bed > %s/%s.read1.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre)
output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to create combine synthetic read1 bed files:\n', error.decode())
print("[scReadSim] Combining synthetic read 2 BED files from peaks, non-peaks and gray areas.")
combine_read2_cmd = "cat %s/%s.read2.bed %s/%s.read2.bed %s/%s.GrayArea.read2.bed > %s/%s.read2.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre)
output, error = subprocess.Popen(combine_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode())
# sys.exit('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode())
else:
print("[scReadSim] Combining synthetic read 1 BED files from peaks, non-peaks.")
combine_read1_cmd = "cat %s/%s.read1.bed %s/%s.read1.bed > %s/%s.read1.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre)
output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to create combine synthetic read1 bed files:\n', error.decode())
print("[scReadSim] Combining synthetic read 2 BED files from peaks, non-peaks.")
combine_read2_cmd = "cat %s/%s.read2.bed %s/%s.read2.bed > %s/%s.read2.bed" % (outdirectory, peak_read_bedfile_prename, outdirectory, nonpeak_read_bedfile_prename, outdirectory, BED_filename_combined_pre)
output, error = subprocess.Popen(combine_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode())
# sys.exit('[ERROR] Fail to create combine synthetic read2 bed files:\n', error.decode())
print("\n[scReadSim] Created:")
print("[scReadSim] Combined read 1 BED file: %s/%s.read1.bed" % (outdirectory, BED_filename_combined_pre))
print("[scReadSim] Combined read 2 BED file: %s/%s.read2.bed" % (outdirectory, BED_filename_combined_pre))
print("[scReadSim] Done.")
[docs]
def scATAC_BED2FASTQ(bedtools_directory, seqtk_directory, referenceGenome_file, outdirectory, BED_filename_combined, synthetic_fastq_prename):
"""Convert Synthetic reads from BED to FASTQ.
Parameters
----------
bedtools_directory: `str`
Directory of software bedtools.
seqtk_directory: `str`
Directory of software seqtk.
referenceGenome_file: `str`
Directory of the reference genome FASTA file that the synthteic reads should align.
outdirectory: `str`
Output directory of the synthteic bed file and its corresponding cell barcodes file.
BED_filename_combined: `str`
Specify the base name of output bed file of function 'scATAC_CombineBED'.
synthetic_fastq_prename: `str`
Specify the base name of the output FASTQ files.
"""
# Create FASTA
print('[scReadSim] Generating synthetic read FASTA files...')
fasta_read1_cmd = "%s/bedtools getfasta -s -fi %s -bed %s/%s.read1.bed -fo %s/%s.read1.bed2fa.strand.fa -nameOnly" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print(error.decode())
fasta_read2_cmd = "%s/bedtools getfasta -s -fi %s -bed %s/%s.read2.bed -fo %s/%s.read2.bed2fa.strand.fa -nameOnly" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print(error.decode())
# remove (-) or (+)
org_fasta_read1_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read1.bed2fa.strand.fa > %s/%s.read1.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(org_fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to remove strand infomormation from synthetic read1 fasta file:', error.decode())
org_fasta_read2_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read2.bed2fa.strand.fa > %s/%s.read2.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(org_fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to remove strand infomormation from synthetic read2 fasta file:', error.decode())
# FASTA to FASTQ
print('[scReadSim] Generating synthetic read FASTQ files...')
fastq_read1_cmd = "%s/seqtk seq -F 'F' %s/%s.read1.bed2fa.fa > %s/%s.read1.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to convert read1 synthetic fasta file to fastq file:', error.decode())
fastq_read2_cmd = "%s/seqtk seq -F 'F' %s/%s.read2.bed2fa.fa > %s/%s.read2.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to convert read2 synthetic fasta file to fastq file:', error.decode())
print('[scReadSim] Sorting FASTQ files...')
sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode())
sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode())
print("\n[scReadSim] Created:")
print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
print("[scReadSim] Done.")
[docs]
def AlignSyntheticBam_Pair(bowtie2_directory, samtools_directory, outdirectory, referenceGenome_name, referenceGenome_dir, synthetic_fastq_prename, output_BAM_pre):
"""Convert Synthetic reads from FASTQ to BAM.
Parameters
----------
bowtie2_directory: `str`
Path to software bowtie2.
samtools_directory: `str`
Path to software samtools.
outdirectory: `str`
Specify the output directory of the synthteic BAM file.
referenceGenome_name: `str`
Base name of the eference genome FASTA file. For example, you should input "chr1" for file "chr1.fa".
referenceGenome_dir: `str`
Path to the reference genome FASTA file.
synthetic_fastq_prename: `str`
Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`.
output_BAM_pre: `str`
Specify the base name of the output BAM file.
"""
print('[scReadSim] Aligning FASTQ files onto Reference Genome Files with Bowtie2...')
alignment_cmd = "%s/bowtie2 --minins 0 --maxins 1200 -x %s/%s -1 %s/%s.read1.bed2fa.sorted.fq -2 %s/%s.read2.bed2fa.sorted.fq | %s/samtools view -bS - > %s/%s.synthetic.noCB.bam" % (bowtie2_directory, referenceGenome_dir, referenceGenome_name, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename, samtools_directory, outdirectory, output_BAM_pre)
output, error = subprocess.Popen(alignment_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
print('[Bowtie2] Aligning:\n', error.decode())
print('[scReadSim] Alignment done.')
print('[scReadSim] Generating cell barcode tag...')
addBC2BAM_header_cmd = "%s/samtools view %s/%s.synthetic.noCB.bam -H > %s/%s.synthetic.noCB.header.sam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre)
output, error = subprocess.Popen(addBC2BAM_header_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
addBC2BAM_cmd = "cat <( cat %s/%s.synthetic.noCB.header.sam ) <( paste <(%s/samtools view %s/%s.synthetic.noCB.bam ) <(%s/samtools view %s/%s.synthetic.noCB.bam | cut -f1 | cut -d':' -f1 | sed -e 's/^/CB:Z:/')) | %s/samtools view -bS - > %s/%s.synthetic.bam" % (outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre)
output, error = subprocess.Popen(addBC2BAM_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to add BC tag to synthetic BAM file:', error.decode())
print('[scReadSim] Sorting and indexing BAM file...')
sortBAMcmd = "%s/samtools sort %s/%s.synthetic.bam > %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre)
output, error = subprocess.Popen(sortBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to sort synthetic BAM file:', error.decode())
indexBAMcmd = "%s/samtools index %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre)
output, error = subprocess.Popen(indexBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to index synthetic BAM file:', error.decode())
print("\n[scReadSim] Created:")
print("[scReadSim] Synthetic read BAM file: %s/%s.synthetic.sorted.bam" % (outdirectory, output_BAM_pre))
print("[scReadSim] Done.")
[docs]
def ErrorBase(base, prop, base_call_ref):
"""Sample random errors.
"""
err_base_call_id = np.random.choice(a=[0, 1, 2], size=1, p=prop)[0]
err_base_call = base_call_ref[base][err_base_call_id]
return err_base_call
[docs]
def ErroneousRead(real_error_rate_read, read_df, output_fq_file):
"""Generate random errors according to input real data error rates.
"""
n_read = int(np.shape(read_df)[0]/4)
## Prepare Error rate
real_error_rate_read_A = real_error_rate_read[['a_to_c_error_rate', 'a_to_g_error_rate', 'a_to_t_error_rate']].to_numpy()
real_error_rate_read_A_prop = real_error_rate_read_A/real_error_rate_read_A.sum(axis=1,keepdims=1)
real_error_rate_read_A_prop[np.isnan(real_error_rate_read_A_prop).any(axis=1),:] = 1/3
real_error_rate_read_C = real_error_rate_read[['c_to_a_error_rate', 'c_to_g_error_rate', 'c_to_t_error_rate']].to_numpy()
real_error_rate_read_C_prop = real_error_rate_read_C/real_error_rate_read_C.sum(axis=1,keepdims=1)
real_error_rate_read_C_prop[np.isnan(real_error_rate_read_C_prop).any(axis=1),:] = 1/3
real_error_rate_read_G = real_error_rate_read[['g_to_a_error_rate', 'g_to_c_error_rate', 'g_to_t_error_rate']].to_numpy()
real_error_rate_read_G_prop = real_error_rate_read_G/real_error_rate_read_G.sum(axis=1,keepdims=1)
real_error_rate_read_G_prop[np.isnan(real_error_rate_read_G_prop).any(axis=1),:] = 1/3
real_error_rate_read_T = real_error_rate_read[['t_to_a_error_rate', 't_to_c_error_rate', 't_to_g_error_rate']].to_numpy()
real_error_rate_read_T_prop = real_error_rate_read_T/real_error_rate_read_T.sum(axis=1,keepdims=1)
real_error_rate_read_T_prop[np.isnan(real_error_rate_read_T_prop).any(axis=1),:] = 1/3
# Base decision matrix
real_error_rate_read_prop_dict = {'A': real_error_rate_read_A_prop, 'C': real_error_rate_read_C_prop, 'G': real_error_rate_read_G_prop, 'T': real_error_rate_read_T_prop}
# Error decision vector
real_error_rate_read_perbase = real_error_rate_read['error_rate'].to_numpy()
## Decide whether error occurs for each read
read_length = real_error_rate_read.shape[0]
error_read_perbase_indicator = np.zeros((n_read, read_length), dtype=int)
random.seed(1)
for base_id in range(read_length):
error_read_perbase_indicator[:,base_id] = np.random.binomial(n=1, p=real_error_rate_read_perbase[base_id], size=n_read)
erroneous_read_id = np.where(np.sum(error_read_perbase_indicator, axis=1) > 0)[0]
## For erroneous reads, generate erroneous base based on the probability matrix
base_call_ref = {'A': ['C', 'G', 'T'], 'C': ['A', 'G', 'T'], 'G': ['A', 'C', 'T'], 'T': ['A', 'C', 'G']}
random.seed(2021)
read_df_witherror = read_df
for read_id_tqdm in tqdm(range(len(erroneous_read_id))):
read_id = erroneous_read_id[read_id_tqdm]
read_cur = read_df[(read_id*4) : (read_id*4 + 4)]
bases = list(read_cur[1][0].upper())
Qscores = list(read_cur[3][0])
for errorneous_base_id in np.where(error_read_perbase_indicator[read_id,:] > 0)[0]:
if errorneous_base_id < len(bases):
base_cur = bases[errorneous_base_id]
if base_cur in real_error_rate_read_prop_dict:
prop = real_error_rate_read_prop_dict[base_cur][errorneous_base_id]
# Decide error base
err_base_call = ErrorBase(base_cur, prop, base_call_ref)
bases[errorneous_base_id] = err_base_call
# Decide Q score
# Use 9 (Phred score 24) for erroneous base
Qscores[errorneous_base_id] = '9'
read_df_witherror[read_id*4+1] = ''.join(bases)
read_df_witherror[read_id*4+3] = ''.join(Qscores)
## Write out
np.savetxt(output_fq_file, read_df_witherror, fmt='%s')
[docs]
def SubstiError_Pair(real_error_rate_file, outdirectory, synthetic_fastq_prename):
"""Generate random errors for paired-end sequencing reads according to input real data error rates.
Parameters
----------
real_error_rate_file: `str`
Path to software fgbio jar script.
outdirectory: `str`
Specify the output directory of the synthteic FASTQ file with random errors.
synthetic_fastq_prename: `str`
Specify the base name of the synthetic erroneous reads' FASTQ files.
"""
# Read in real error rates
real_error_rate_dir = real_error_rate_file
real_error_rate = pd.read_csv(real_error_rate_dir, header=0, delimiter="\t")
real_error_rate_read1 = real_error_rate[real_error_rate['read_number'] == 1]
real_error_rate_read2 = real_error_rate[real_error_rate['read_number'] == 2]
# Read in perfect reads
read1_fq = outdirectory + "/" + synthetic_fastq_prename + ".read1.bed2fa.fq"
read2_fq = outdirectory + "/" + synthetic_fastq_prename + ".read2.bed2fa.fq"
read1_df = pd.read_csv(read1_fq, header=None).to_numpy()
read2_df = pd.read_csv(read2_fq, header=None).to_numpy()
# Generate random error according to Real data
ErroneousRead(real_error_rate_read1, read1_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read1.bed2fa.fq")
ErroneousRead(real_error_rate_read2, read2_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read2.bed2fa.fq")
[docs]
def scATAC_ErrorBase(fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory, synthetic_fastq_prename):
"""Introduce random substitution errors into synthetic reads according to real data error rates.
Parameters
----------
fgbio_jarfile: `str`
Path to software fgbio jar script.
INPUT_bamfile: `str`
Input BAM file for anlaysis.
referenceGenome_file: 'str'
Reference genome FASTA file that the synthteic reads should align.
outdirectory: `str`
Specify the output directory of the synthteic FASTQ file with random errors.
synthetic_fastq_prename: `str`
Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`.
"""
print('[scReadSim] Substitution error calculating...')
combine_read1_cmd = "java -jar %s ErrorRateByReadPosition -i %s -r %s -o %s/Real --collapse false" % (fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory)
output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[Messages] Running fgbio on real bam file:\n', error.decode())
# Generate Errors into fastq files
print('[scReadSim] Generting synthetic read FASTQ files with substitution errors...')
real_error_rate_file = outdirectory + "/" + "Real.error_rate_by_read_position.txt"
SubstiError_Pair(real_error_rate_file, outdirectory, synthetic_fastq_prename)
# Combine FASTQs
print('[scReadSim] Sorting FASTQ files...')
sort_fastq_read1_cmd = "cat %s/%s.ErrorIncluded.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode())
sort_fastq_read2_cmd = "cat %s/%s.ErrorIncluded.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if error:
print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode())
print("\n[scReadSim] Created:")
print("[scReadSim] Read 1 FASTQ file with substitutional error: %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
print("[scReadSim] Read 2 FASTQ file with substitutional error: %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
print("[scReadSim] Done.")
[docs]
def scATAC_GenerateSyntheticRead_MultiSample(INPUT_bamfile, outdirectory, bedtools_directory, seqtk_directory, fgbio_jarfile, referenceGenome_file, read_len=50):
"""Multi-sample/replicate implement of scReadSim for simulating scATAC-seq synthetic reads.
Parameters
----------
INPUT_bamfile: `str`
List of input BAM files (use absolute paths to the BAM files).
outdirectory: `str`
Specify the working directory of scReadSim for generating intermediate and final output files.
bedtools_directory: `str`
Directory of software bedtools.
seqtk_directory: `str`
Directory of software seqtk.
fgbio_jarfile: `str`
Path to software fgbio jar script.
referenceGenome_file: 'str'
Reference genome FASTA file that the synthteic reads should align.
read_len: `int` (default: '50')
Specify the length of synthetic reads. Default value is 50 bp.
"""
for rep_id in range(len(INPUT_bamfile)):
print("\n[scReadSim] Generating synthetic reads for sample %s..." % str(rep_id+1))
sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1)
# Obtain bedfile
peak_bedfile = sample_output_d + "/" + "scReadSim.superset.peak.bed"
nonpeak_bedfile = sample_output_d + "/" + "scReadSim.superset.nonpeak.bed"
# Specify the output count matrices' prenames
count_mat_peak_filename = "Rep%s.peak.countmatrix" % str(rep_id+1)
count_mat_nonpeak_filename = "Rep%s.nonpeak.countmatrix" % str(rep_id+1)
# Specify the names of synthetic count matrices (generated by GenerateSyntheticCount.scATAC_GenerateSyntheticCount)
synthetic_countmat_peak_file = count_mat_peak_filename + ".scDesign2Simulated.txt"
synthetic_countmat_nonpeak_file = count_mat_nonpeak_filename + ".scDesign2Simulated.txt"
# Specify the base name of bed files containing synthetic reads
OUTPUT_cells_barcode_file = "synthetic_cell_barcode_rep%s.txt" % str(rep_id+1)
peak_read_bedfile_prename = "Rep%s.syntheticBAM.peak" % str(rep_id+1)
nonpeak_read_bedfile_prename = "Rep%s.syntheticBAM.nonpeak" % str(rep_id+1)
BED_filename_combined_pre = "Rep%s.syntheticBAM.combined" % str(rep_id+1)
synthetic_cell_label_file = count_mat_peak_filename + ".scDesign2Simulated.CellTypeLabel.txt"
# Create synthetic read bed file for peaks
print("\n[scReadSim] Generating synthetic read BED file for peaks...")
scATAC_GenerateBAMCoord(bed_file=peak_bedfile, count_mat_file=sample_output_d + "/" + synthetic_countmat_peak_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=peak_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len)
# Create synthetic read bed file for non-peaks
print("\n[scReadSim] Generating synthetic read BED file for non-peaks...")
scATAC_GenerateBAMCoord(bed_file=nonpeak_bedfile, count_mat_file=sample_output_d + "/" + synthetic_countmat_nonpeak_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=nonpeak_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len, GrayAreaModeling=True)
# Combine bed fileS
scATAC_CombineBED(outdirectory=sample_output_d, peak_read_bedfile_prename=peak_read_bedfile_prename, nonpeak_read_bedfile_prename=nonpeak_read_bedfile_prename, BED_filename_combined_pre=BED_filename_combined_pre)
# Generate FASTQ files
synthetic_fastq_prename = BED_filename_combined_pre
# Convert combined bed file into FASTQ files
print("\n[scReadSim] Generating read synthetic FASTQ files...")
scATAC_BED2FASTQ(bedtools_directory=bedtools_directory, seqtk_directory=seqtk_directory, referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, BED_filename_combined=BED_filename_combined_pre, synthetic_fastq_prename=synthetic_fastq_prename)
# Generate reads with errors in FASTQs
print("\n[scReadSim] Generating read synthetic FASTQ files with substitutional error...")
scATAC_ErrorBase(fgbio_jarfile=fgbio_jarfile, INPUT_bamfile=INPUT_bamfile[rep_id], referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, synthetic_fastq_prename=synthetic_fastq_prename)