Source code for scReadSim.scRNA_GenerateBAM

import pandas as pd
import numpy as np
import csv
import collections
import time
import sys
import os
pd.options.mode.chained_assignment = None  # default='warn'
import string
import random
import subprocess
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
import pysam
from collections import defaultdict
# New modules in revision2
from Bio import SeqIO
import gffpandas.gffpandas as gffpd
import warnings

warnings.filterwarnings('ignore', module='gffpandas')


[docs]
def flatten(x):
	"""Flatten a nested list.

	"""
	if isinstance(x, collections.Iterable):
		return [a for i in x for a in flatten(i)]
	else:
		return [x]




[docs]
def cellbarcode_generator(length, size=10):
	"""Generate random cellbarcode.

	Parameters
	----------
	length: `int`
		Number of cells.
	size: `int` (default: '10')
		Size of cell barcode. Default value is 10 bp.

	Return
	------
	cb_list: `list`
		List of randomly generated cell barcodes.
	"""
	chars = 'ACGT'
	cb_list = [''.join(random.choice(chars) for _ in range(size)) for cell in range(length)]
	return cb_list



# # Test 
# UMI_count_mat_file = "NGS_H2228_H1975_A549_H838_HCC827_Mixture_10X.COMPLE.UMIcountmatrix.scDesign2Simulated.test.txt"
# outdirectory = "/home/guanao/Projects/scIsoSim/results/20230204"
# cell_label_file = outdirectory+"/"+"NGS_H2228_H1975_A549_H838_HCC827_Mixture_10X.UMIcountmatrix" + ".scDesign2Simulated.CellTypeLabel.txt"


[docs]
def scRNA_GenerateBAMCoord(bed_file, UMI_count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=90, UMI_tag='UB:Z'):
	"""Generate Synthetic reads in BED format.

	Parameters
	----------
	bed_file: `str`
		Features' bed file to generate the synthetic reads (Generated by function `Utility.scRNA_CreateFeatureSets`).
	UMI_count_mat_file: `str`
		The path to synthetic UMI count matrix.
	synthetic_cell_label_file: `str`
		Synthetic cell label file generated by `scRNA_GenerateSyntheticCount`.
	read_bedfile_prename: `str`
		Specify the base name of output bed file.
	INPUT_bamfile: `str`
		Input BAM file for anlaysis.
	outdirectory: `str`
		Specify the output directory for synthetic reads bed file.
	OUTPUT_cells_barcode_file: `str`
		Specify the file name storing the synthetic cell barcodes.
	jitter_size: `int` (default: '5')
		Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp.
	read_len: `int` (default: '90')
		Specify the length of synthetic reads. Default value is 90 bp.
	UMI_tag: `str` (default: 'UB:Z')
		If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq.
	"""
	UMI_count_mat_df = pd.read_csv("%s" % (UMI_count_mat_file), header=None, delimiter="\t")
	# Remove count matrix first column: feature names
	UMI_count_mat = UMI_count_mat_df.iloc[: , 1:].to_numpy()
	UMI_count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten()	
	n_cell = np.shape(UMI_count_mat)[1]
	samfile = pysam.AlignmentFile(INPUT_bamfile, "rb")
	with open(bed_file) as open_peak:
		reader = csv.reader(open_peak, delimiter="\t")
		open_peak = np.asarray(list(reader))
	peak_nonzero_id = np.nonzero(UMI_count_mat.sum(axis=1))[0]
	random.seed(2022)
	random_cellbarcode_list = cellbarcode_generator(n_cell, size=16)
	with open(outdirectory + "/" + OUTPUT_cells_barcode_file, 'w') as f:
		for item in random_cellbarcode_list:
			f.write(item + "\n")
	cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, UMI_count_mat_cluster)).transpose()
	with open(outdirectory + "/" + OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f:
		for item in cellbarcode_list_withclusters:
			f.write("\t".join(item) + "\n")
	with open("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), 'w') as fp:
		pass
	print("[scReadSim] Generating synthetic reads in BED file...")
	for relative_peak_ind in tqdm(range(len(peak_nonzero_id))):
		peak_ind = peak_nonzero_id[relative_peak_ind]
		rec = open_peak[peak_ind]
		rec_name = '_'.join(rec)
		UMI_read_dict_pergene = defaultdict(lambda: [])
		reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2]))
		for read in reads:
			if read.has_tag(UMI_tag):
				UMI = read.get_tag(UMI_tag)
				start = read.reference_start
				if read.is_reverse==1:
					strand = "-" 
				else:
					strand = "+"
				# strand = read.is_reverse
				UMI_read_dict_pergene[UMI].append(str(start) + ":" + str(strand))
		## Real data: Obtain the empirical distribution of # reads of UMI in the gene 
		UMI_real_vec = list(UMI_read_dict_pergene.keys())
		nread_perUMI = [len(x) for x in UMI_read_dict_pergene.values()] # return a vector with the length of # UMIs in the gene
		# nread_perUMI_prob = nread_perUMI / np.sum(nread_perUMI)
		## Synthetic data
		# Sample syntheitc UMI's read number and assign real UMI to synthetic UMI (only for sampling reads)
		UMI_count_vec = UMI_count_mat[peak_ind,:] # Synthetic umi count
		nonzer_UMI_zero_id = np.nonzero(UMI_count_vec)[0]
		synthetic_nread_perUMI_percell = [np.random.choice(nread_perUMI, size=UMI_count_vec[i], replace=True) for i in nonzer_UMI_zero_id] # return a vector with the length of non-zero count cells, each entry idnicates the number of reads for each UMI
		nread_synthetic = sum(np.sum(x) for x in synthetic_nread_perUMI_percell) # total number of synthetic reads
		# Assign real UMI to synthetic UMI based on read number per UMI
		# Construct a dic with keys as number of reasd per UMI and values as the corresponding UMI strings
		nread_UMI_dic = defaultdict(lambda: [])
		for i,item in enumerate(nread_perUMI):
			nread_UMI_dic[item].append(UMI_real_vec[i])
		synthetic_nread_perUMI_percell_unlist = [xs for x in synthetic_nread_perUMI_percell for xs in list(x)]
		synthetic_realUMI_assign_array = np.array(synthetic_nread_perUMI_percell_unlist).astype(str)
		synthetic_nread_perUMI_percell_unlist_array = np.array(synthetic_nread_perUMI_percell_unlist)
		for i in set(synthetic_nread_perUMI_percell_unlist):
			tmp_ind = np.where(synthetic_nread_perUMI_percell_unlist_array == i)[0]
			synthetic_realUMI_assign_array[tmp_ind] = np.random.choice(nread_UMI_dic[i], size=len(tmp_ind), replace=True)
		synthetic_realUMI_assign_vec = list(synthetic_realUMI_assign_array) # a vectir sane length to synthetic_nread_perUMI_percell_unlist, synthetic UMI's number 
		# synthetic_realUMI_assign_vec = [np.random.choice(nread_UMI_dic[nread], size=1, replace=True) for nread in synthetic_nread_perUMI_percell_unlist]
		# Sample reads for each synthetic UMI
		synthetic_read_list = [np.random.choice(UMI_read_dict_pergene[synthetic_realUMI_assign_vec[id]], size=synthetic_nread_perUMI_percell_unlist[id], replace=True) for id in range(len(synthetic_realUMI_assign_vec))]
		synthetic_read_unlist = [xs for x in synthetic_read_list for xs in list(x)]
		synthetic_read_start_list = [int(xs.split(':', 1)[0]) - 1  for xs in synthetic_read_unlist]  # -1 accounts for the BAM coordinates is +1 compared with getfasta bedtools
		synthetic_read_strand_list = [str(xs.split(':', 1)[1]) for xs in synthetic_read_unlist]
		# Creat synthetic UMI string
		UMI_synthetic_uniq = cellbarcode_generator(np.sum(UMI_count_vec), size=10)
		UMI_synthetic_list = np.repeat(np.array(UMI_synthetic_uniq), synthetic_nread_perUMI_percell_unlist)
		# Create syntheitc CB string
		CB_synthetic_repeat = [np.sum(x) for x in synthetic_nread_perUMI_percell] # same length to nozero cells
		CB_synthetic_list = np.repeat(np.array(random_cellbarcode_list)[nonzer_UMI_zero_id], CB_synthetic_repeat)
		# Create syntheitc read name string
		read_name_remaining_list = ["CellNo" + str(nonzer_UMI_zero_id[ind] + 1) + ":" + str(rec_name) + "#" + str(count).zfill(4) for ind in range(len(CB_synthetic_repeat)) for count in range(CB_synthetic_repeat[ind])]
		read_name_list = [CB_synthetic_list[id] + UMI_synthetic_list[id] + ":" + read_name_remaining_list[id] for id in range(nread_synthetic)]
		# Create output table
		jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=nread_synthetic)  # nrow(reads_cur) should equal to nfrag_cur
		reads_cur = pd.DataFrame({
			'chr': rec[0],
			'r1_start_shifted': synthetic_read_start_list  + jitter_value_vec,
			'r1_end_shifted': synthetic_read_start_list + jitter_value_vec + read_len,
			'read_name': read_name_list,
			'read_length': read_len,
			'strand': synthetic_read_strand_list
			})
		reads_cur.to_csv("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
	print("\n[scReadSim] Created:")
	print("[scReadSim] Read bed file: %s/%s.read.bed" % (outdirectory, read_bedfile_prename))




[docs]
def scRNA_CombineBED(outdirectory, gene_read_bedfile_prename, intergene_read_bedfile_prename, BED_filename_combined_pre):
	"""Combine the bed files of foreground and background feature sets into one bed file.

	Parameters
	----------
	outdirectory: `str`
		Directory of `gene_read_bedfile_prename`.txt and `intergene_read_bedfile_prename`.txt.
	gene_read_bedfile_prename: 'str'
		File prename of foreground synthetic reads bed file.
	intergene_read_bedfile_prename: 'str'
		File prename of background synthetic reads bed file.
	BED_filename_combined_pre: 'str'
		Specify the combined syntehtic reads bed file prename. The combined bed file will be output to `outdirectory`.
	"""
	print("[scReadSim] Combining synthetic read BED files from genes and inter-genes.")
	combine_read_cmd = "cat %s/%s.read.bed %s/%s.read.bed | sort -k1,1 -k2,2n > %s/%s.read.bed" % (outdirectory, gene_read_bedfile_prename, outdirectory, intergene_read_bedfile_prename, outdirectory, BED_filename_combined_pre)
	output, error = subprocess.Popen(combine_read_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[ERROR] Fail to create combine synthetic read bed files:\n', error.decode())
	print("\n[scReadSim] Created:")
	print("[scReadSim] Combined synthetic read BED file: %s/%s.read.bed" % (outdirectory, BED_filename_combined_pre))
	print("[scReadSim] Done.")




[docs]
def scRNA_BED2FASTQ(bedtools_directory, seqtk_directory, referenceGenome_file, outdirectory, BED_filename_combined, synthetic_fastq_prename):
	"""Convert Synthetic reads from BED to FASTQ. 

	Parameters
	----------
	bedtools_directory: `str`
		Path to software bedtools.
	seqtk_directory: `str`
		Path to software seqtk.
	referenceGenome_file: `str`
		Reference genome FASTA file that the synthteic reads should align.
	outdirectory: `str`
		Output directory of the synthteic bed file and its corresponding cell barcodes file.
	BED_filename_combined: `str`
		Base name of the combined bed file output by function `scRNA_CombineBED`.
	synthetic_fastq_prename
		Specify the base name of the output FASTQ files.
	"""
	# Create FASTA
	print('[scReadSim] Generating synthetic read FASTA files...')
	fasta_read2_cmd = "%s/bedtools getfasta -s -nameOnly -fi %s -bed %s/%s.read.bed -fo %s/%s.read2.strand.bed2fa.fa" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print(error.decode())
	# remove (-) or (+)
	org_fasta_read2_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read2.strand.bed2fa.fa > %s/%s.read2.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(org_fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[ERROR] Fail to remove strand infomormation from synthetic read fasta file:\n', error.decode())
	# Create Read 1 from Read 2
	fasta_read1_cmd = "awk 'NR%%2==0 {print substr(p,2,26);} NR%%2 {p=$0;print p;}' %s/%s.read2.bed2fa.fa > %s/%s.read1.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print(error.decode())
	# FASTA to FASTQ
	print('[scReadSim] Generating synthetic read FASTQ files...')
	fastq_read1_cmd = "%s/seqtk seq -F 'F' %s/%s.read1.bed2fa.fa > %s/%s.read1.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to convert read1 synthetic fasta file to fastq file:', error.decode())
	fastq_read2_cmd = "%s/seqtk seq -F 'F' %s/%s.read2.bed2fa.fa > %s/%s.read2.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to convert read2 synthetic fasta file to fastq file:', error.decode())
	print('[scReadSim] Sorting FASTQ files...')
	sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode())
	sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode())
	print("\n[scReadSim] Created:")
	print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
	print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
	print("[scReadSim] Done.")




[docs]
def AlignSyntheticBam_Single(bowtie2_directory, samtools_directory, outdirectory, referenceGenome_name, referenceGenome_dir, synthetic_fastq_prename, output_BAM_pre):
	"""Convert Synthetic reads from FASTQ to BAM.

	Parameters
	----------
	bowtie2_directory: `str`
		Path to software bowtie2.
	samtools_directory: `str`
		Path to software samtools.
	outdirectory: `str`
		Specify the output directory of the synthteic BAM file.
	referenceGenome_name: `str`
		Base name of the eference genome FASTA file. For example, you should input "chr1" for file "chr1.fa".
	referenceGenome_dir: `str`
		Path to the reference genome FASTA file.
	synthetic_fastq_prename: `str`
		Base name of the synthetic FASTQ file output by function `scRNA_BED2FASTQ`.
	output_BAM_pre: `str`
		Specify the base name of the output BAM file.
	"""
	print('[scReadSim] Aligning FASTQ files onto reference genome files with Bowtie2...')
	alignment_cmd = "%s/bowtie2 -x %s/%s -U %s/%s.read2.bed2fa.sorted.fq | %s/samtools view -bS - > %s/%s.synthetic.noCB.bam" % (bowtie2_directory, referenceGenome_dir, referenceGenome_name,  outdirectory, synthetic_fastq_prename, samtools_directory, outdirectory, output_BAM_pre)
	output, error = subprocess.Popen(alignment_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	print('[Bowtie2] Aligning:\n', error.decode())
	print('[scReadSim] Alignment done.')
	print('[scReadSim] Generating cell barcode and UMI barcode tags...')
	addBC2BAM_header_cmd = "%s/samtools view %s/%s.synthetic.noCB.bam -H > %s/%s.synthetic.noCB.header.sam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre)
	output, error = subprocess.Popen(addBC2BAM_header_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	addBC2BAM_cmd = "cat <( cat %s/%s.synthetic.noCB.header.sam ) <( paste <(%s/samtools view %s/%s.synthetic.noCB.bam ) <(%s/samtools view %s/%s.synthetic.noCB.bam | cut -f1 | cut -d':' -f1 | awk '{s=substr($1,1,16)}{g=substr($1,17,length($1))}{printf \"CB:Z:%%s\tUB:Z:%%s\\n\",s,g;}')) | %s/samtools view -bS - > %s/%s.synthetic.bam" % (outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre)
	output, error = subprocess.Popen(addBC2BAM_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[ERROR] Fail to add CB and UB tags to synthetic BAM file:\n', error.decode())
	print('[scReadSim] Sorting and indexing BAM file...')
	sortBAMcmd = "%s/samtools sort %s/%s.synthetic.bam > %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre)
	output, error = subprocess.Popen(sortBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[ERROR] Fail to sort synthetic BAM file:\n', error.decode())
	indexBAMcmd = "%s/samtools index %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre)
	output, error = subprocess.Popen(indexBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[ERROR] Fail to index synthetic BAM file:\n', error.decode())
	print("\n[scReadSim] Created:")
	print("[scReadSim] Synthetic read BAM file: %s/%s.synthetic.sorted.bam" % (outdirectory, output_BAM_pre))
	print("[scReadSim] Done.")



## Error rate

[docs]
def ErrorBase(base, prop, base_call_ref):
	"""Sample random errors. 

	"""
	err_base_call_id = np.random.choice(a=[0, 1, 2], size=1, p=prop)[0]      
	err_base_call = base_call_ref[base][err_base_call_id]
	return err_base_call




[docs]
def ErroneousRead(real_error_rate_read, read_df, output_fq_file):
	"""Generate random errors according to input real data error rates. 

	"""
	n_read = int(np.shape(read_df)[0]/4)
	## Prepare Error rate
	real_error_rate_read_A = real_error_rate_read[['a_to_c_error_rate', 'a_to_g_error_rate', 'a_to_t_error_rate']].to_numpy() 
	real_error_rate_read_A_prop = real_error_rate_read_A/real_error_rate_read_A.sum(axis=1,keepdims=1)
	real_error_rate_read_A_prop[np.isnan(real_error_rate_read_A_prop).any(axis=1),:] = 1/3
	real_error_rate_read_C = real_error_rate_read[['c_to_a_error_rate', 'c_to_g_error_rate', 'c_to_t_error_rate']].to_numpy() 
	real_error_rate_read_C_prop = real_error_rate_read_C/real_error_rate_read_C.sum(axis=1,keepdims=1)
	real_error_rate_read_C_prop[np.isnan(real_error_rate_read_C_prop).any(axis=1),:] = 1/3
	real_error_rate_read_G = real_error_rate_read[['g_to_a_error_rate', 'g_to_c_error_rate', 'g_to_t_error_rate']].to_numpy() 
	real_error_rate_read_G_prop = real_error_rate_read_G/real_error_rate_read_G.sum(axis=1,keepdims=1)
	real_error_rate_read_G_prop[np.isnan(real_error_rate_read_G_prop).any(axis=1),:] = 1/3
	real_error_rate_read_T = real_error_rate_read[['t_to_a_error_rate', 't_to_c_error_rate', 't_to_g_error_rate']].to_numpy() 
	real_error_rate_read_T_prop = real_error_rate_read_T/real_error_rate_read_T.sum(axis=1,keepdims=1)
	real_error_rate_read_T_prop[np.isnan(real_error_rate_read_T_prop).any(axis=1),:] = 1/3
	# Base decision matrix
	real_error_rate_read_prop_dict = {'A': real_error_rate_read_A_prop, 'C': real_error_rate_read_C_prop, 'G': real_error_rate_read_G_prop, 'T': real_error_rate_read_T_prop}
	# Error decision vector
	real_error_rate_read_perbase = real_error_rate_read['error_rate'].to_numpy()
	## Decide whether error occurs for each read
	read_length = real_error_rate_read.shape[0]
	error_read_perbase_indicator = np.zeros((n_read, read_length), dtype=int)
	random.seed(1)
	for base_id in range(read_length):
		error_read_perbase_indicator[:,base_id] = np.random.binomial(n=1, p=real_error_rate_read_perbase[base_id], size=n_read)
	erroneous_read_id = np.where(np.sum(error_read_perbase_indicator, axis=1) > 0)[0]
	## For erroneous reads, generate erroneous base based on the probability matrix
	base_call_ref = {'A': ['C', 'G', 'T'], 'C': ['A', 'G', 'T'], 'G': ['A', 'C', 'T'], 'T': ['A', 'C', 'G']}
	random.seed(2021)
	read_df_witherror = read_df
	for read_id_tqdm in tqdm(range(len(erroneous_read_id))):
		read_id = erroneous_read_id[read_id_tqdm]
		read_cur = read_df[(read_id*4) : (read_id*4 + 4)]
		bases = list(read_cur[1][0].upper())
		Qscores = list(read_cur[3][0])
		for errorneous_base_id in np.where(error_read_perbase_indicator[read_id,:] > 0)[0]:
			if errorneous_base_id < len(bases):
				base_cur = bases[errorneous_base_id]
				if base_cur in real_error_rate_read_prop_dict:
					prop = real_error_rate_read_prop_dict[base_cur][errorneous_base_id]
					# Decide error base
					err_base_call = ErrorBase(base_cur, prop, base_call_ref)
					bases[errorneous_base_id] = err_base_call
					# Decide Q score
					# Use 9 (Phred score 24) for erroneous base
					Qscores[errorneous_base_id] = '9'
		read_df_witherror[read_id*4+1] = ''.join(bases)
		read_df_witherror[read_id*4+3] = ''.join(Qscores)
	## Write out
	np.savetxt(output_fq_file, read_df_witherror, fmt='%s')




[docs]
def SubstiError(real_error_rate_file, outdirectory, synthetic_fastq_prename):
	"""Generate random errors for single-end sequencing reads according to input real data error rates. 

	Parameters
	----------
	real_error_rate_file: `str`
		Path to software fgbio jar script.
	outdirectory: `str`
		Specify the output directory of the synthteic FASTQ file with random errors.
	synthetic_fastq_prename: `str`
		Specify the base name of the synthetic erroneous reads' FASTQ files.
	"""
	# Read in real error rates
	real_error_rate_dir = real_error_rate_file
	real_error_rate = pd.read_csv(real_error_rate_dir, header=0, delimiter="\t")
	# Read in perfect reads
	read2_fq = outdirectory  + "/" + synthetic_fastq_prename + ".read2.bed2fa.fq"
	read2_df = pd.read_csv(read2_fq, header=None).to_numpy()
	# Real data quality score
	# Error rate to Qscore
	ErroneousRead(real_error_rate, read2_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read2.bed2fa.fq") 




[docs]
def scRNA_ErrorBase(fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory, synthetic_fastq_prename):
	"""Introduce random substitution errors into synthetic reads according to real data error rates.

	Parameters
	----------
	fgbio_jarfile: `str`
		Path to software fgbio jar script.
	INPUT_bamfile: `str`
		Input BAM file for anlaysis.
	referenceGenome_file: 'str'
		Reference genome FASTA file that the synthteic reads should align.
	outdirectory: `str`
		Specify the output directory of the synthteic FASTQ file with random errors.
	synthetic_fastq_prename: `str`
		Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`.
	"""
	print('[scReadSim] Substitution error calculating...')
	combine_read1_cmd = "java -jar %s ErrorRateByReadPosition -i %s -r %s -o %s/Real --collapse false" % (fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory)
	output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
		print('[Messages] Running fgbio on real bam file:\n', error.decode())
	# Generate Errors into fastq files
	real_error_rate_file = outdirectory + "/" + "Real.error_rate_by_read_position.txt"
	SubstiError(real_error_rate_file, outdirectory, synthetic_fastq_prename)
	# Combine FASTQs
	print('[scReadSim] Sorting FASTQ files...')
	sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode())
	sort_fastq_read2_cmd = "cat %s/%s.ErrorIncluded.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
	output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
	if error:
			print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode())
	print("\n[scReadSim] Created:")
	print("[scReadSim] Read 1 FASTQ file with substitution error: %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
	print("[scReadSim] Read 2 FASTQ file with substitution error: %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
	print("[scReadSim] Done.")






[docs]
def scRNA_GenerateSyntheticRead_MultiSample(INPUT_bamfile, outdirectory, bedtools_directory, seqtk_directory, fgbio_jarfile, referenceGenome_file, read_len=90):
	"""Multi-sample/replicate implement of scReadSim for simulating scRNA-seq synthetic reads.

	Parameters
	----------
	INPUT_bamfile: `str`
		List of input BAM files (use absolute paths to the BAM files).
	outdirectory: `str`
		Specify the working directory of scReadSim for generating intermediate and final output files.
	bedtools_directory: `str`
		Directory of software bedtools.
	seqtk_directory: `str`
		Directory of software seqtk.
	fgbio_jarfile: `str`
		Path to software fgbio jar script.
	referenceGenome_file: 'str'
		Reference genome FASTA file that the synthteic reads should align.
	read_len: `int` (default: '90')
		Specify the length of synthetic reads. Default value is 90 bp.
	"""
	for rep_id in range(len(INPUT_bamfile)):
		print("\n[scReadSim] Generating synthetic reads for sample %s..." % str(rep_id+1))
		sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1)
		# Obtain bedfile
		gene_bedfile = sample_output_d + "/" + "scReadSim.Gene.bed"
		intergene_bedfile = sample_output_d + "/" + "scReadSim.InterGene.bed"
		# Specify the output count matrices' prenames
		UMI_gene_count_mat_filename = "Rep%s.gene.countmatrix" % str(rep_id+1)
		UMI_intergene_count_mat_filename = "Rep%s.intergene.countmatrix" % str(rep_id+1)
		# Specify the names of synthetic count matrices (generated by GenerateSyntheticCount.scATAC_GenerateSyntheticCount)
		synthetic_countmat_gene_file = UMI_gene_count_mat_filename + ".scDesign2Simulated.txt"
		synthetic_countmat_intergene_file = UMI_intergene_count_mat_filename + ".scDesign2Simulated.txt"
		# Specify the base name of bed files containing synthetic reads
		OUTPUT_cells_barcode_file = "synthetic_cell_barcode_rep%s.txt" % str(rep_id+1)
		gene_read_bedfile_prename = "Rep%s.syntheticBAM.gene" % str(rep_id+1)
		intergene_read_bedfile_prename = "Rep%s.syntheticBAM.intergene" % str(rep_id+1)
		BED_filename_combined_pre = "Rep%s.syntheticBAM.combined" % str(rep_id+1)
		synthetic_cell_label_file = UMI_gene_count_mat_filename + ".scDesign2Simulated.CellTypeLabel.txt"
		# Create synthetic read bed file for peaks
		print("\n[scReadSim] Generating synthetic read BED file for genes...")
		scRNA_GenerateBAMCoord(bed_file=gene_bedfile, UMI_count_mat_file=sample_output_d + "/" + synthetic_countmat_gene_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=gene_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len)
		# Create synthetic read bed file for non-peaks
		print("\n[scReadSim] Generating synthetic read BED file for inter-genes...")
		scRNA_GenerateBAMCoord(bed_file=intergene_bedfile, UMI_count_mat_file=sample_output_d + "/" + synthetic_countmat_intergene_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=intergene_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len)
		# Combine bed file
		scRNA_CombineBED(outdirectory=sample_output_d, gene_read_bedfile_prename=gene_read_bedfile_prename, intergene_read_bedfile_prename=intergene_read_bedfile_prename, BED_filename_combined_pre=BED_filename_combined_pre)
		# Generate FASTQ files
		synthetic_fastq_prename = BED_filename_combined_pre
		# Convert combined bed file into FASTQ files
		print("\n")
		scRNA_BED2FASTQ(bedtools_directory=bedtools_directory, seqtk_directory=seqtk_directory, referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, BED_filename_combined=BED_filename_combined_pre, synthetic_fastq_prename=synthetic_fastq_prename)
		# Generate reads with errors in FASTQs
		print("\n[scReadSim] Generating synthetic read FASTQ files with substitutional error...")
		scRNA_ErrorBase(fgbio_jarfile=fgbio_jarfile, INPUT_bamfile=INPUT_bamfile[rep_id], referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, synthetic_fastq_prename=synthetic_fastq_prename)

  
  
  

def generateSuperGeneProp(bed_file, isoform_annotation_file, outdirectory, isoform_proportion_file, gene_ind=None):
    """Generaet proportions for user-specified isoforms

    Parameters
    ----------
    bed_file: 'str'
        Gene bed file.
    isoform_annotation_file: 'str'
        Path to the isoform annotation gff file.
    gene_ind: `str`
        Genes that are used for generating isoforms
    isoform_proportion_file: 'str'
        Specify the name of output isoform proportion file.
    """
    annotation = gffpd.read_gff3(isoform_annotation_file)
    with open(bed_file) as open_peak:
        reader = csv.reader(open_peak, delimiter="\t")
        open_peak = np.asarray(list(reader))
    if gene_ind is None:
        meta_gene_list = open_peak
    else:
        if len(gene_ind) > 0:
            meta_gene_list = open_peak[gene_ind,]
        else:
            print("[ERROR]: Input gene selection list has zero elements.")
            return
    # Create empty proportion file with column names
    df = pd.DataFrame(columns=['MetageneID','Genes','GeneID','Proportion'])
    df.to_csv("%s/%s" % (outdirectory, isoform_proportion_file), index=None, sep='\t')
    print("\n[scReadSim] Generating Gene Proportions according to Annotation File: %s" % isoform_annotation_file)
    random.seed(2023)
    # Loop over metagene list
    for gene_cur_ind in tqdm(range(len(meta_gene_list))):
        gene_cur = meta_gene_list[gene_cur_ind]
        overlapings = annotation.overlaps_with(type='gene', seq_id=gene_cur[0],
                                            start=int(gene_cur[1]), end=int(gene_cur[2]))
        # overlapings_attr_to_columns = overlapings.attributes_to_columns()
        overlapings_df = overlapings.df
        gene_cur_name_vec = overlapings_df["attributes"].str.split(';', expand=True)[0].str.split(' ', expand=True)[1].str.replace('"', '')
        gene_cur_name = ';'.join(gene_cur_name_vec) # all subgenes name concatenated
        n_genes = overlapings_df.shape[0]     
        # overlapings_df_to_columns = overlapings.df
        # Sample multinomial prob. using Drichlet prior distribution
        dirch_alpha = np.random.uniform(size=n_genes)
        gene_prop = np.random.dirichlet(alpha=dirch_alpha, size=1)[0]
        df_cur = pd.DataFrame({
                'MetageneID': ["_".join(gene_cur)] * n_genes,
                'Genes': [gene_cur_name] * n_genes,
                'GeneID': gene_cur_name_vec,
                'Proportion': gene_prop
                })
        df_cur.to_csv("%s/%s" % (outdirectory, isoform_proportion_file), header=None, index=None, sep='\t', mode='a', quoting=csv.QUOTE_NONE)
    print("\n[scReadSim] Created:")
    print("[scReadSim] SuperGene2Gene Proportion File: %s/%s" % (outdirectory, isoform_proportion_file))
    print("[scReadSim] Done.")



def scRNA_GenerateBAMCoord_ExonOnly(bed_file, UMI_count_mat_file, synthetic_cell_label_file, INPUT_bamfile, isoform_annotation_file, trancriptome_file, outdirectory, read_bedfile_prename, OUTPUT_cells_barcode_file, UMI_tag='UB:Z', read_len=90, UMI_len=10):
	"""Generate isoforms according to UMI count matrix and ground truth isoforms (with proportions)

	Parameters
	----------
	bed_file: 'str'
		Gene bed file.
	UMI_count_mat_file: `str`
		The path to synthetic UMI count matrix.
	synthetic_cell_label_file: `str`
		Synthetic cell label file generated by `scRNA_GenerateSyntheticCount`.
	INPUT_bamfile: `str`
		Input BAM file for anlaysis.
	isoform_annotation_file: 'str'
		Path to the isoform annotation gtf file.
	trancriptome_file: 'str'
		Path to the collapsed transcriptome file.
	outdirectory: `str`
		Specify the output directory for synthetic reads bed file.
	read_bedfile_prename: `str`
		Specify the base name of output bed file.
	OUTPUT_cells_barcode_file: `str`
		Specify the file name storing the synthetic cell barcodes.
	UMI_tag: `str` (default: 'UB:Z')
		If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq.
	read_len: `int` (default: '90')
		Specify the length of synthetic reads. Default value is 90 bp.
	UMI_len: `int` (default: '10')
		Specify the length of synthetic reads. Default value is 10 bp.
	Output
	------
	- BED file containing coordinates for cell-type-specific reads.
	"""
	# Generate supergene-gene proportion
	isoform_proportion_file = "superGene_GeneProp.txt"
	generateSuperGeneProp(bed_file= bed_file,
							isoform_annotation_file=isoform_annotation_file,
							outdirectory=outdirectory,
							isoform_proportion_file=isoform_proportion_file)
	# Generate coordinate
	UMI_count_mat_df = pd.read_csv("%s" % (UMI_count_mat_file), header=None, delimiter="\t")
	# Remove count matrix first column: feature names
	UMI_count_mat = UMI_count_mat_df.iloc[: , 1:].to_numpy()
	UMI_count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten()	
	n_cell = np.shape(UMI_count_mat)[1]
	# Read in SuperGene2gene proportion file
	isoform_prop_df = pd.read_csv("%s/%s" % (outdirectory,isoform_proportion_file), header=0, delimiter="\t")
	# Read in SuperGene bed file
	metagene_list = isoform_prop_df['MetageneID'].unique()
	open_peak = pd.read_csv(bed_file, header=None, delimiter="\t")
	open_peak_df = open_peak[0].astype(str) + "_" + open_peak[1].astype(str) + "_" + open_peak[2].astype(str)
	# Read in input BAM file
	samfile = pysam.AlignmentFile(INPUT_bamfile, "rb")
	# Read in merged gene fasta file
	transcript_dict = SeqIO.to_dict(SeqIO.parse(open(trancriptome_file), "fasta"))
	# Set random seed
	random.seed(2023)
	rng = np.random.default_rng(2023)
	# Generate random cell barcode
	random_cellbarcode_list = cellbarcode_generator(n_cell, size=16)
	with open(outdirectory + "/" + OUTPUT_cells_barcode_file, 'w') as f:
		for item in random_cellbarcode_list:
			f.write(item + "\n")
	cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, UMI_count_mat_cluster)).transpose()
	with open(outdirectory + "/" + OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f:
		for item in cellbarcode_list_withclusters:
			f.write("\t".join(item) + "\n")
	with open("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), 'w') as fp:
		pass
	UMIcountmat_perisoform_percell_list = []
	for metagene_cur_id in tqdm(range(len(metagene_list))):
		metagene_cur = metagene_list[metagene_cur_id]
		rec = metagene_cur.split("_")
		UMI_read_dict_pergene = defaultdict(lambda: [])
		## Model UMI-Read empirical distribution
		reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2]))
		for read in reads:
			if read.has_tag(UMI_tag):
				UMI = read.get_tag(UMI_tag)
				start = read.reference_start
				if read.is_reverse==1:
					strand = "-" 
				else:
					strand = "+"
				UMI_read_dict_pergene[UMI].append(str(start) + ":" + str(strand))
		## Real data: Obtain the empirical distribution of # reads of UMI in the gene 
		nread_perUMI = [len(x) for x in UMI_read_dict_pergene.values()] # return a vector with the length of # UMIs in the gene
		# Synthetic isoform proportion  
		isoform_prop_df_cur = isoform_prop_df.loc[isoform_prop_df['MetageneID'] == metagene_cur]
		# Replace NA proportion value with 0
		isoform_prop_df_cur['Proportion'] = isoform_prop_df_cur['Proportion'].fillna(0)
		# Synthetic UMI count per gene  
		UMI_count_permetagene = UMI_count_mat[np.where(open_peak_df == metagene_cur)[0],:].flatten()
		# Sample synthetic isoform-level UMI count from isoform prop. and gene-wise UMI count: n_isoform by n_subcell
		isoform_level_UMIcount = np.transpose(rng.multinomial(n=UMI_count_permetagene, pvals=isoform_prop_df_cur['Proportion'])) # sample multinomial for each cell with different experments number (size = 1)
		isoform_nonzero_id = np.nonzero(isoform_level_UMIcount.sum(axis=1))[0]
		UMIcountmat_perisoform_percell_list.append(isoform_level_UMIcount) # store isoform level count matrix into tmp list
		for isoform_id in isoform_nonzero_id:
			isoform_name = isoform_prop_df_cur.iloc[isoform_id]['GeneID']
			# isoform_name = isoform_prop_df.iloc[isoform_id]['IsoformID']
			print("[scReadSim] Generating Synthetic Reads for SuperGene %s Gene %s" % (metagene_cur, isoform_name))
			# Sample syntheitc UMI's read number
			UMI_count_vec = isoform_level_UMIcount[isoform_id,:] # Synthetic umi count
			nonzer_UMI_zero_id = np.nonzero(UMI_count_vec)[0]
			synthetic_nread_perUMI_percell = [np.random.choice(nread_perUMI, size=UMI_count_vec[i], replace=True) for i in nonzer_UMI_zero_id] # return a vector with the length of non-zero count cells, each entry idnicates the number of reads for each UMI
			nread_synthetic = sum(np.sum(x) for x in synthetic_nread_perUMI_percell) # total number of synthetic reads
			synthetic_nread_perUMI_percell_unlist = [xs for x in synthetic_nread_perUMI_percell for xs in list(x)]
			# Sample synthetic read starting position (relative to transcripts)
			isoform_len = len(transcript_dict[isoform_name].seq)
			synthetic_read_start_list = np.random.randint(low=1, high=isoform_len, size=nread_synthetic)
			# Assign strand infor to synthetic reads (relative to transcripts; for single-end read, should be +, since transcripts already contain strand infor)
			synthetic_read_strand_list = ["+"] * nread_synthetic
			# Creat synthetic UMI string
			UMI_synthetic_uniq = cellbarcode_generator(np.sum(UMI_count_vec), size=UMI_len)
			UMI_synthetic_list = np.repeat(np.array(UMI_synthetic_uniq), synthetic_nread_perUMI_percell_unlist)
			# Organize syntheitc CB string
			CB_synthetic_repeat = [np.sum(x) for x in synthetic_nread_perUMI_percell] # same length to nozero cells
			CB_synthetic_list = np.repeat(np.array(random_cellbarcode_list)[nonzer_UMI_zero_id], CB_synthetic_repeat)
			# Create syntheitc read name string
			read_name_remaining_list = [str(metagene_cur) + ":"+ str(isoform_name) + "#" + str(count).zfill(4) for ind in range(len(CB_synthetic_repeat)) for count in range(CB_synthetic_repeat[ind])]
			read_name_list = [CB_synthetic_list[id] + UMI_synthetic_list[id] + ":" + read_name_remaining_list[id] for id in range(nread_synthetic)]
			# Create output table (seqID is not chr, but for isoform name)
			reads_cur = pd.DataFrame({
				'seqID': isoform_name,
				'r1_start_shifted': synthetic_read_start_list,
				'r1_end_shifted': np.clip(synthetic_read_start_list + read_len, None, isoform_len), # make sure read is within transcript length
				'read_name': read_name_list,
				'read_length': read_len,
				'strand': synthetic_read_strand_list
				})
			reads_cur.to_csv("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a')
	# Output ground truth isoform read count matrix
	UMIcountmat_perisoform_percell = np.vstack(UMIcountmat_perisoform_percell_list)
	UMIcountmat_perisoform_percell_pd = pd.DataFrame(UMIcountmat_perisoform_percell, index=isoform_prop_df['GeneID'])
	UMIcountmat_perisoform_percell_pd.to_csv('%s/GeneLevel.SyntheticRead.UMIcountmatrix.txt' % outdirectory, sep="\t", header=False)
	print("\n[scReadSim] Created:")
	print("[scReadSim] Gene-level UMI count matrix file: %s/GeneLevel.SyntheticRead.UMIcountmatrix.txt" % (outdirectory))
	print("[scReadSim] Read bed file: %s/%s.read.bed" % (outdirectory, read_bedfile_prename))
		

def combineFASTQ(gene_synthetic_fastq_prename, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename):
    print('[scReadSim] Combine Gene and InterGene FASTQ files...')
    sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.sorted.fq %s/%s.read1.bed2fa.sorted.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, gene_synthetic_fastq_prename, outdirectory, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
    output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    if error:
            print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode())
    sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.sorted.fq %s/%s.read2.bed2fa.sorted.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, gene_synthetic_fastq_prename, outdirectory, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename)
    output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    if error:
            print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode())
    print("\n[scReadSim] Created:")
    print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
    print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename))
    print("[scReadSim] Done.")