Source code for scReadSim.scRNA_GenerateBAM

import pandas as pd
import numpy as np
import csv
import collections
import time
import sys
import os
pd.options.mode.chained_assignment = None  # default='warn'
import string
import random
import subprocess
from tqdm import tqdm
from pathlib import Path
from joblib import Parallel, delayed
import pysam
from collections import defaultdict
# New modules in revision2
from Bio import SeqIO
import gffpandas.gffpandas as gffpd
import warnings

warnings.filterwarnings('ignore', module='gffpandas')

[docs] def flatten(x): """Flatten a nested list. """ if isinstance(x, collections.Iterable): return [a for i in x for a in flatten(i)] else: return [x]
[docs] def cellbarcode_generator(length, size=10): """Generate random cellbarcode. Parameters ---------- length: `int` Number of cells. size: `int` (default: '10') Size of cell barcode. Default value is 10 bp. Return ------ cb_list: `list` List of randomly generated cell barcodes. """ chars = 'ACGT' cb_list = [''.join(random.choice(chars) for _ in range(size)) for cell in range(length)] return cb_list
# # Test # UMI_count_mat_file = "NGS_H2228_H1975_A549_H838_HCC827_Mixture_10X.COMPLE.UMIcountmatrix.scDesign2Simulated.test.txt" # outdirectory = "/home/guanao/Projects/scIsoSim/results/20230204" # cell_label_file = outdirectory+"/"+"NGS_H2228_H1975_A549_H838_HCC827_Mixture_10X.UMIcountmatrix" + ".scDesign2Simulated.CellTypeLabel.txt"
[docs] def scRNA_GenerateBAMCoord(bed_file, UMI_count_mat_file, synthetic_cell_label_file, read_bedfile_prename, INPUT_bamfile, outdirectory, OUTPUT_cells_barcode_file, jitter_size=5, read_len=90, UMI_tag='UB:Z'): """Generate Synthetic reads in BED format. Parameters ---------- bed_file: `str` Features' bed file to generate the synthetic reads (Generated by function `Utility.scRNA_CreateFeatureSets`). UMI_count_mat_file: `str` The path to synthetic UMI count matrix. synthetic_cell_label_file: `str` Synthetic cell label file generated by `scRNA_GenerateSyntheticCount`. read_bedfile_prename: `str` Specify the base name of output bed file. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory for synthetic reads bed file. OUTPUT_cells_barcode_file: `str` Specify the file name storing the synthetic cell barcodes. jitter_size: `int` (default: '5') Specify the range of random shift to avoid replicate synthetic reads. Default value is 5 bp. read_len: `int` (default: '90') Specify the length of synthetic reads. Default value is 90 bp. UMI_tag: `str` (default: 'UB:Z') If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq. """ UMI_count_mat_df = pd.read_csv("%s" % (UMI_count_mat_file), header=None, delimiter="\t") # Remove count matrix first column: feature names UMI_count_mat = UMI_count_mat_df.iloc[: , 1:].to_numpy() UMI_count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten() n_cell = np.shape(UMI_count_mat)[1] samfile = pysam.AlignmentFile(INPUT_bamfile, "rb") with open(bed_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) peak_nonzero_id = np.nonzero(UMI_count_mat.sum(axis=1))[0] random.seed(2022) random_cellbarcode_list = cellbarcode_generator(n_cell, size=16) with open(outdirectory + "/" + OUTPUT_cells_barcode_file, 'w') as f: for item in random_cellbarcode_list: f.write(item + "\n") cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, UMI_count_mat_cluster)).transpose() with open(outdirectory + "/" + OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f: for item in cellbarcode_list_withclusters: f.write("\t".join(item) + "\n") with open("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), 'w') as fp: pass print("[scReadSim] Generating synthetic reads in BED file...") for relative_peak_ind in tqdm(range(len(peak_nonzero_id))): peak_ind = peak_nonzero_id[relative_peak_ind] rec = open_peak[peak_ind] rec_name = '_'.join(rec) UMI_read_dict_pergene = defaultdict(lambda: []) reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) for read in reads: if read.has_tag(UMI_tag): UMI = read.get_tag(UMI_tag) start = read.reference_start if read.is_reverse==1: strand = "-" else: strand = "+" # strand = read.is_reverse UMI_read_dict_pergene[UMI].append(str(start) + ":" + str(strand)) ## Real data: Obtain the empirical distribution of # reads of UMI in the gene UMI_real_vec = list(UMI_read_dict_pergene.keys()) nread_perUMI = [len(x) for x in UMI_read_dict_pergene.values()] # return a vector with the length of # UMIs in the gene # nread_perUMI_prob = nread_perUMI / np.sum(nread_perUMI) ## Synthetic data # Sample syntheitc UMI's read number and assign real UMI to synthetic UMI (only for sampling reads) UMI_count_vec = UMI_count_mat[peak_ind,:] # Synthetic umi count nonzer_UMI_zero_id = np.nonzero(UMI_count_vec)[0] synthetic_nread_perUMI_percell = [np.random.choice(nread_perUMI, size=UMI_count_vec[i], replace=True) for i in nonzer_UMI_zero_id] # return a vector with the length of non-zero count cells, each entry idnicates the number of reads for each UMI nread_synthetic = sum(np.sum(x) for x in synthetic_nread_perUMI_percell) # total number of synthetic reads # Assign real UMI to synthetic UMI based on read number per UMI # Construct a dic with keys as number of reasd per UMI and values as the corresponding UMI strings nread_UMI_dic = defaultdict(lambda: []) for i,item in enumerate(nread_perUMI): nread_UMI_dic[item].append(UMI_real_vec[i]) synthetic_nread_perUMI_percell_unlist = [xs for x in synthetic_nread_perUMI_percell for xs in list(x)] synthetic_realUMI_assign_array = np.array(synthetic_nread_perUMI_percell_unlist).astype(str) synthetic_nread_perUMI_percell_unlist_array = np.array(synthetic_nread_perUMI_percell_unlist) for i in set(synthetic_nread_perUMI_percell_unlist): tmp_ind = np.where(synthetic_nread_perUMI_percell_unlist_array == i)[0] synthetic_realUMI_assign_array[tmp_ind] = np.random.choice(nread_UMI_dic[i], size=len(tmp_ind), replace=True) synthetic_realUMI_assign_vec = list(synthetic_realUMI_assign_array) # a vectir sane length to synthetic_nread_perUMI_percell_unlist, synthetic UMI's number # synthetic_realUMI_assign_vec = [np.random.choice(nread_UMI_dic[nread], size=1, replace=True) for nread in synthetic_nread_perUMI_percell_unlist] # Sample reads for each synthetic UMI synthetic_read_list = [np.random.choice(UMI_read_dict_pergene[synthetic_realUMI_assign_vec[id]], size=synthetic_nread_perUMI_percell_unlist[id], replace=True) for id in range(len(synthetic_realUMI_assign_vec))] synthetic_read_unlist = [xs for x in synthetic_read_list for xs in list(x)] synthetic_read_start_list = [int(xs.split(':', 1)[0]) - 1 for xs in synthetic_read_unlist] # -1 accounts for the BAM coordinates is +1 compared with getfasta bedtools synthetic_read_strand_list = [str(xs.split(':', 1)[1]) for xs in synthetic_read_unlist] # Creat synthetic UMI string UMI_synthetic_uniq = cellbarcode_generator(np.sum(UMI_count_vec), size=10) UMI_synthetic_list = np.repeat(np.array(UMI_synthetic_uniq), synthetic_nread_perUMI_percell_unlist) # Create syntheitc CB string CB_synthetic_repeat = [np.sum(x) for x in synthetic_nread_perUMI_percell] # same length to nozero cells CB_synthetic_list = np.repeat(np.array(random_cellbarcode_list)[nonzer_UMI_zero_id], CB_synthetic_repeat) # Create syntheitc read name string read_name_remaining_list = ["CellNo" + str(nonzer_UMI_zero_id[ind] + 1) + ":" + str(rec_name) + "#" + str(count).zfill(4) for ind in range(len(CB_synthetic_repeat)) for count in range(CB_synthetic_repeat[ind])] read_name_list = [CB_synthetic_list[id] + UMI_synthetic_list[id] + ":" + read_name_remaining_list[id] for id in range(nread_synthetic)] # Create output table jitter_value_vec = np.random.randint(-jitter_size,jitter_size,size=nread_synthetic) # nrow(reads_cur) should equal to nfrag_cur reads_cur = pd.DataFrame({ 'chr': rec[0], 'r1_start_shifted': synthetic_read_start_list + jitter_value_vec, 'r1_end_shifted': synthetic_read_start_list + jitter_value_vec + read_len, 'read_name': read_name_list, 'read_length': read_len, 'strand': synthetic_read_strand_list }) reads_cur.to_csv("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') print("\n[scReadSim] Created:") print("[scReadSim] Read bed file: %s/%s.read.bed" % (outdirectory, read_bedfile_prename))
[docs] def scRNA_CombineBED(outdirectory, gene_read_bedfile_prename, intergene_read_bedfile_prename, BED_filename_combined_pre): """Combine the bed files of foreground and background feature sets into one bed file. Parameters ---------- outdirectory: `str` Directory of `gene_read_bedfile_prename`.txt and `intergene_read_bedfile_prename`.txt. gene_read_bedfile_prename: 'str' File prename of foreground synthetic reads bed file. intergene_read_bedfile_prename: 'str' File prename of background synthetic reads bed file. BED_filename_combined_pre: 'str' Specify the combined syntehtic reads bed file prename. The combined bed file will be output to `outdirectory`. """ print("[scReadSim] Combining synthetic read BED files from genes and inter-genes.") combine_read_cmd = "cat %s/%s.read.bed %s/%s.read.bed | sort -k1,1 -k2,2n > %s/%s.read.bed" % (outdirectory, gene_read_bedfile_prename, outdirectory, intergene_read_bedfile_prename, outdirectory, BED_filename_combined_pre) output, error = subprocess.Popen(combine_read_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create combine synthetic read bed files:\n', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Combined synthetic read BED file: %s/%s.read.bed" % (outdirectory, BED_filename_combined_pre)) print("[scReadSim] Done.")
[docs] def scRNA_BED2FASTQ(bedtools_directory, seqtk_directory, referenceGenome_file, outdirectory, BED_filename_combined, synthetic_fastq_prename): """Convert Synthetic reads from BED to FASTQ. Parameters ---------- bedtools_directory: `str` Path to software bedtools. seqtk_directory: `str` Path to software seqtk. referenceGenome_file: `str` Reference genome FASTA file that the synthteic reads should align. outdirectory: `str` Output directory of the synthteic bed file and its corresponding cell barcodes file. BED_filename_combined: `str` Base name of the combined bed file output by function `scRNA_CombineBED`. synthetic_fastq_prename Specify the base name of the output FASTQ files. """ # Create FASTA print('[scReadSim] Generating synthetic read FASTA files...') fasta_read2_cmd = "%s/bedtools getfasta -s -nameOnly -fi %s -bed %s/%s.read.bed -fo %s/%s.read2.strand.bed2fa.fa" % (bedtools_directory, referenceGenome_file, outdirectory, BED_filename_combined, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print(error.decode()) # remove (-) or (+) org_fasta_read2_cmd = "sed '/^>/s/.\{3\}$//' %s/%s.read2.strand.bed2fa.fa > %s/%s.read2.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(org_fasta_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to remove strand infomormation from synthetic read fasta file:\n', error.decode()) # Create Read 1 from Read 2 fasta_read1_cmd = "awk 'NR%%2==0 {print substr(p,2,26);} NR%%2 {p=$0;print p;}' %s/%s.read2.bed2fa.fa > %s/%s.read1.bed2fa.fa" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fasta_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print(error.decode()) # FASTA to FASTQ print('[scReadSim] Generating synthetic read FASTQ files...') fastq_read1_cmd = "%s/seqtk seq -F 'F' %s/%s.read1.bed2fa.fa > %s/%s.read1.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to convert read1 synthetic fasta file to fastq file:', error.decode()) fastq_read2_cmd = "%s/seqtk seq -F 'F' %s/%s.read2.bed2fa.fa > %s/%s.read2.bed2fa.fq" % (seqtk_directory, outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to convert read2 synthetic fasta file to fastq file:', error.decode()) print('[scReadSim] Sorting FASTQ files...') sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode()) sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Done.")
[docs] def AlignSyntheticBam_Single(bowtie2_directory, samtools_directory, outdirectory, referenceGenome_name, referenceGenome_dir, synthetic_fastq_prename, output_BAM_pre): """Convert Synthetic reads from FASTQ to BAM. Parameters ---------- bowtie2_directory: `str` Path to software bowtie2. samtools_directory: `str` Path to software samtools. outdirectory: `str` Specify the output directory of the synthteic BAM file. referenceGenome_name: `str` Base name of the eference genome FASTA file. For example, you should input "chr1" for file "chr1.fa". referenceGenome_dir: `str` Path to the reference genome FASTA file. synthetic_fastq_prename: `str` Base name of the synthetic FASTQ file output by function `scRNA_BED2FASTQ`. output_BAM_pre: `str` Specify the base name of the output BAM file. """ print('[scReadSim] Aligning FASTQ files onto reference genome files with Bowtie2...') alignment_cmd = "%s/bowtie2 -x %s/%s -U %s/%s.read2.bed2fa.sorted.fq | %s/samtools view -bS - > %s/%s.synthetic.noCB.bam" % (bowtie2_directory, referenceGenome_dir, referenceGenome_name, outdirectory, synthetic_fastq_prename, samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(alignment_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() print('[Bowtie2] Aligning:\n', error.decode()) print('[scReadSim] Alignment done.') print('[scReadSim] Generating cell barcode and UMI barcode tags...') addBC2BAM_header_cmd = "%s/samtools view %s/%s.synthetic.noCB.bam -H > %s/%s.synthetic.noCB.header.sam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre) output, error = subprocess.Popen(addBC2BAM_header_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() addBC2BAM_cmd = "cat <( cat %s/%s.synthetic.noCB.header.sam ) <( paste <(%s/samtools view %s/%s.synthetic.noCB.bam ) <(%s/samtools view %s/%s.synthetic.noCB.bam | cut -f1 | cut -d':' -f1 | awk '{s=substr($1,1,16)}{g=substr($1,17,length($1))}{printf \"CB:Z:%%s\tUB:Z:%%s\\n\",s,g;}')) | %s/samtools view -bS - > %s/%s.synthetic.bam" % (outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre, samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(addBC2BAM_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to add CB and UB tags to synthetic BAM file:\n', error.decode()) print('[scReadSim] Sorting and indexing BAM file...') sortBAMcmd = "%s/samtools sort %s/%s.synthetic.bam > %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre, outdirectory, output_BAM_pre) output, error = subprocess.Popen(sortBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort synthetic BAM file:\n', error.decode()) indexBAMcmd = "%s/samtools index %s/%s.synthetic.sorted.bam" % (samtools_directory, outdirectory, output_BAM_pre) output, error = subprocess.Popen(indexBAMcmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to index synthetic BAM file:\n', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Synthetic read BAM file: %s/%s.synthetic.sorted.bam" % (outdirectory, output_BAM_pre)) print("[scReadSim] Done.")
## Error rate
[docs] def ErrorBase(base, prop, base_call_ref): """Sample random errors. """ err_base_call_id = np.random.choice(a=[0, 1, 2], size=1, p=prop)[0] err_base_call = base_call_ref[base][err_base_call_id] return err_base_call
[docs] def ErroneousRead(real_error_rate_read, read_df, output_fq_file): """Generate random errors according to input real data error rates. """ n_read = int(np.shape(read_df)[0]/4) ## Prepare Error rate real_error_rate_read_A = real_error_rate_read[['a_to_c_error_rate', 'a_to_g_error_rate', 'a_to_t_error_rate']].to_numpy() real_error_rate_read_A_prop = real_error_rate_read_A/real_error_rate_read_A.sum(axis=1,keepdims=1) real_error_rate_read_A_prop[np.isnan(real_error_rate_read_A_prop).any(axis=1),:] = 1/3 real_error_rate_read_C = real_error_rate_read[['c_to_a_error_rate', 'c_to_g_error_rate', 'c_to_t_error_rate']].to_numpy() real_error_rate_read_C_prop = real_error_rate_read_C/real_error_rate_read_C.sum(axis=1,keepdims=1) real_error_rate_read_C_prop[np.isnan(real_error_rate_read_C_prop).any(axis=1),:] = 1/3 real_error_rate_read_G = real_error_rate_read[['g_to_a_error_rate', 'g_to_c_error_rate', 'g_to_t_error_rate']].to_numpy() real_error_rate_read_G_prop = real_error_rate_read_G/real_error_rate_read_G.sum(axis=1,keepdims=1) real_error_rate_read_G_prop[np.isnan(real_error_rate_read_G_prop).any(axis=1),:] = 1/3 real_error_rate_read_T = real_error_rate_read[['t_to_a_error_rate', 't_to_c_error_rate', 't_to_g_error_rate']].to_numpy() real_error_rate_read_T_prop = real_error_rate_read_T/real_error_rate_read_T.sum(axis=1,keepdims=1) real_error_rate_read_T_prop[np.isnan(real_error_rate_read_T_prop).any(axis=1),:] = 1/3 # Base decision matrix real_error_rate_read_prop_dict = {'A': real_error_rate_read_A_prop, 'C': real_error_rate_read_C_prop, 'G': real_error_rate_read_G_prop, 'T': real_error_rate_read_T_prop} # Error decision vector real_error_rate_read_perbase = real_error_rate_read['error_rate'].to_numpy() ## Decide whether error occurs for each read read_length = real_error_rate_read.shape[0] error_read_perbase_indicator = np.zeros((n_read, read_length), dtype=int) random.seed(1) for base_id in range(read_length): error_read_perbase_indicator[:,base_id] = np.random.binomial(n=1, p=real_error_rate_read_perbase[base_id], size=n_read) erroneous_read_id = np.where(np.sum(error_read_perbase_indicator, axis=1) > 0)[0] ## For erroneous reads, generate erroneous base based on the probability matrix base_call_ref = {'A': ['C', 'G', 'T'], 'C': ['A', 'G', 'T'], 'G': ['A', 'C', 'T'], 'T': ['A', 'C', 'G']} random.seed(2021) read_df_witherror = read_df for read_id_tqdm in tqdm(range(len(erroneous_read_id))): read_id = erroneous_read_id[read_id_tqdm] read_cur = read_df[(read_id*4) : (read_id*4 + 4)] bases = list(read_cur[1][0].upper()) Qscores = list(read_cur[3][0]) for errorneous_base_id in np.where(error_read_perbase_indicator[read_id,:] > 0)[0]: if errorneous_base_id < len(bases): base_cur = bases[errorneous_base_id] if base_cur in real_error_rate_read_prop_dict: prop = real_error_rate_read_prop_dict[base_cur][errorneous_base_id] # Decide error base err_base_call = ErrorBase(base_cur, prop, base_call_ref) bases[errorneous_base_id] = err_base_call # Decide Q score # Use 9 (Phred score 24) for erroneous base Qscores[errorneous_base_id] = '9' read_df_witherror[read_id*4+1] = ''.join(bases) read_df_witherror[read_id*4+3] = ''.join(Qscores) ## Write out np.savetxt(output_fq_file, read_df_witherror, fmt='%s')
[docs] def SubstiError(real_error_rate_file, outdirectory, synthetic_fastq_prename): """Generate random errors for single-end sequencing reads according to input real data error rates. Parameters ---------- real_error_rate_file: `str` Path to software fgbio jar script. outdirectory: `str` Specify the output directory of the synthteic FASTQ file with random errors. synthetic_fastq_prename: `str` Specify the base name of the synthetic erroneous reads' FASTQ files. """ # Read in real error rates real_error_rate_dir = real_error_rate_file real_error_rate = pd.read_csv(real_error_rate_dir, header=0, delimiter="\t") # Read in perfect reads read2_fq = outdirectory + "/" + synthetic_fastq_prename + ".read2.bed2fa.fq" read2_df = pd.read_csv(read2_fq, header=None).to_numpy() # Real data quality score # Error rate to Qscore ErroneousRead(real_error_rate, read2_df, outdirectory + "/" + synthetic_fastq_prename + ".ErrorIncluded.read2.bed2fa.fq")
[docs] def scRNA_ErrorBase(fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory, synthetic_fastq_prename): """Introduce random substitution errors into synthetic reads according to real data error rates. Parameters ---------- fgbio_jarfile: `str` Path to software fgbio jar script. INPUT_bamfile: `str` Input BAM file for anlaysis. referenceGenome_file: 'str' Reference genome FASTA file that the synthteic reads should align. outdirectory: `str` Specify the output directory of the synthteic FASTQ file with random errors. synthetic_fastq_prename: `str` Base name of the synthetic FASTQ files output by function `scATAC_BED2FASTQ`. """ print('[scReadSim] Substitution error calculating...') combine_read1_cmd = "java -jar %s ErrorRateByReadPosition -i %s -r %s -o %s/Real --collapse false" % (fgbio_jarfile, INPUT_bamfile, referenceGenome_file, outdirectory) output, error = subprocess.Popen(combine_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[Messages] Running fgbio on real bam file:\n', error.decode()) # Generate Errors into fastq files real_error_rate_file = outdirectory + "/" + "Real.error_rate_by_read_position.txt" SubstiError(real_error_rate_file, outdirectory, synthetic_fastq_prename) # Combine FASTQs print('[scReadSim] Sorting FASTQ files...') sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode()) sort_fastq_read2_cmd = "cat %s/%s.ErrorIncluded.read2.bed2fa.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 FASTQ file with substitution error: %s/%s.ErrorIncluded.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Read 2 FASTQ file with substitution error: %s/%s.ErrorIncluded.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Done.")
[docs] def scRNA_GenerateSyntheticRead_MultiSample(INPUT_bamfile, outdirectory, bedtools_directory, seqtk_directory, fgbio_jarfile, referenceGenome_file, read_len=90): """Multi-sample/replicate implement of scReadSim for simulating scRNA-seq synthetic reads. Parameters ---------- INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. bedtools_directory: `str` Directory of software bedtools. seqtk_directory: `str` Directory of software seqtk. fgbio_jarfile: `str` Path to software fgbio jar script. referenceGenome_file: 'str' Reference genome FASTA file that the synthteic reads should align. read_len: `int` (default: '90') Specify the length of synthetic reads. Default value is 90 bp. """ for rep_id in range(len(INPUT_bamfile)): print("\n[scReadSim] Generating synthetic reads for sample %s..." % str(rep_id+1)) sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) # Obtain bedfile gene_bedfile = sample_output_d + "/" + "scReadSim.Gene.bed" intergene_bedfile = sample_output_d + "/" + "scReadSim.InterGene.bed" # Specify the output count matrices' prenames UMI_gene_count_mat_filename = "Rep%s.gene.countmatrix" % str(rep_id+1) UMI_intergene_count_mat_filename = "Rep%s.intergene.countmatrix" % str(rep_id+1) # Specify the names of synthetic count matrices (generated by GenerateSyntheticCount.scATAC_GenerateSyntheticCount) synthetic_countmat_gene_file = UMI_gene_count_mat_filename + ".scDesign2Simulated.txt" synthetic_countmat_intergene_file = UMI_intergene_count_mat_filename + ".scDesign2Simulated.txt" # Specify the base name of bed files containing synthetic reads OUTPUT_cells_barcode_file = "synthetic_cell_barcode_rep%s.txt" % str(rep_id+1) gene_read_bedfile_prename = "Rep%s.syntheticBAM.gene" % str(rep_id+1) intergene_read_bedfile_prename = "Rep%s.syntheticBAM.intergene" % str(rep_id+1) BED_filename_combined_pre = "Rep%s.syntheticBAM.combined" % str(rep_id+1) synthetic_cell_label_file = UMI_gene_count_mat_filename + ".scDesign2Simulated.CellTypeLabel.txt" # Create synthetic read bed file for peaks print("\n[scReadSim] Generating synthetic read BED file for genes...") scRNA_GenerateBAMCoord(bed_file=gene_bedfile, UMI_count_mat_file=sample_output_d + "/" + synthetic_countmat_gene_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=gene_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len) # Create synthetic read bed file for non-peaks print("\n[scReadSim] Generating synthetic read BED file for inter-genes...") scRNA_GenerateBAMCoord(bed_file=intergene_bedfile, UMI_count_mat_file=sample_output_d + "/" + synthetic_countmat_intergene_file, synthetic_cell_label_file=sample_output_d + "/" + synthetic_cell_label_file, read_bedfile_prename=intergene_read_bedfile_prename, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, OUTPUT_cells_barcode_file=OUTPUT_cells_barcode_file, jitter_size=5, read_len=read_len) # Combine bed file scRNA_CombineBED(outdirectory=sample_output_d, gene_read_bedfile_prename=gene_read_bedfile_prename, intergene_read_bedfile_prename=intergene_read_bedfile_prename, BED_filename_combined_pre=BED_filename_combined_pre) # Generate FASTQ files synthetic_fastq_prename = BED_filename_combined_pre # Convert combined bed file into FASTQ files print("\n") scRNA_BED2FASTQ(bedtools_directory=bedtools_directory, seqtk_directory=seqtk_directory, referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, BED_filename_combined=BED_filename_combined_pre, synthetic_fastq_prename=synthetic_fastq_prename) # Generate reads with errors in FASTQs print("\n[scReadSim] Generating synthetic read FASTQ files with substitutional error...") scRNA_ErrorBase(fgbio_jarfile=fgbio_jarfile, INPUT_bamfile=INPUT_bamfile[rep_id], referenceGenome_file=referenceGenome_file, outdirectory=sample_output_d, synthetic_fastq_prename=synthetic_fastq_prename)
def generateSuperGeneProp(bed_file, isoform_annotation_file, outdirectory, isoform_proportion_file, gene_ind=None): """Generaet proportions for user-specified isoforms Parameters ---------- bed_file: 'str' Gene bed file. isoform_annotation_file: 'str' Path to the isoform annotation gff file. gene_ind: `str` Genes that are used for generating isoforms isoform_proportion_file: 'str' Specify the name of output isoform proportion file. """ annotation = gffpd.read_gff3(isoform_annotation_file) with open(bed_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) if gene_ind is None: meta_gene_list = open_peak else: if len(gene_ind) > 0: meta_gene_list = open_peak[gene_ind,] else: print("[ERROR]: Input gene selection list has zero elements.") return # Create empty proportion file with column names df = pd.DataFrame(columns=['MetageneID','Genes','GeneID','Proportion']) df.to_csv("%s/%s" % (outdirectory, isoform_proportion_file), index=None, sep='\t') print("\n[scReadSim] Generating Gene Proportions according to Annotation File: %s" % isoform_annotation_file) random.seed(2023) # Loop over metagene list for gene_cur_ind in tqdm(range(len(meta_gene_list))): gene_cur = meta_gene_list[gene_cur_ind] overlapings = annotation.overlaps_with(type='gene', seq_id=gene_cur[0], start=int(gene_cur[1]), end=int(gene_cur[2])) # overlapings_attr_to_columns = overlapings.attributes_to_columns() overlapings_df = overlapings.df gene_cur_name_vec = overlapings_df["attributes"].str.split(';', expand=True)[0].str.split(' ', expand=True)[1].str.replace('"', '') gene_cur_name = ';'.join(gene_cur_name_vec) # all subgenes name concatenated n_genes = overlapings_df.shape[0] # overlapings_df_to_columns = overlapings.df # Sample multinomial prob. using Drichlet prior distribution dirch_alpha = np.random.uniform(size=n_genes) gene_prop = np.random.dirichlet(alpha=dirch_alpha, size=1)[0] df_cur = pd.DataFrame({ 'MetageneID': ["_".join(gene_cur)] * n_genes, 'Genes': [gene_cur_name] * n_genes, 'GeneID': gene_cur_name_vec, 'Proportion': gene_prop }) df_cur.to_csv("%s/%s" % (outdirectory, isoform_proportion_file), header=None, index=None, sep='\t', mode='a', quoting=csv.QUOTE_NONE) print("\n[scReadSim] Created:") print("[scReadSim] SuperGene2Gene Proportion File: %s/%s" % (outdirectory, isoform_proportion_file)) print("[scReadSim] Done.") def scRNA_GenerateBAMCoord_ExonOnly(bed_file, UMI_count_mat_file, synthetic_cell_label_file, INPUT_bamfile, isoform_annotation_file, trancriptome_file, outdirectory, read_bedfile_prename, OUTPUT_cells_barcode_file, UMI_tag='UB:Z', read_len=90, UMI_len=10): """Generate isoforms according to UMI count matrix and ground truth isoforms (with proportions) Parameters ---------- bed_file: 'str' Gene bed file. UMI_count_mat_file: `str` The path to synthetic UMI count matrix. synthetic_cell_label_file: `str` Synthetic cell label file generated by `scRNA_GenerateSyntheticCount`. INPUT_bamfile: `str` Input BAM file for anlaysis. isoform_annotation_file: 'str' Path to the isoform annotation gtf file. trancriptome_file: 'str' Path to the collapsed transcriptome file. outdirectory: `str` Specify the output directory for synthetic reads bed file. read_bedfile_prename: `str` Specify the base name of output bed file. OUTPUT_cells_barcode_file: `str` Specify the file name storing the synthetic cell barcodes. UMI_tag: `str` (default: 'UB:Z') If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq. read_len: `int` (default: '90') Specify the length of synthetic reads. Default value is 90 bp. UMI_len: `int` (default: '10') Specify the length of synthetic reads. Default value is 10 bp. Output ------ - BED file containing coordinates for cell-type-specific reads. """ # Generate supergene-gene proportion isoform_proportion_file = "superGene_GeneProp.txt" generateSuperGeneProp(bed_file= bed_file, isoform_annotation_file=isoform_annotation_file, outdirectory=outdirectory, isoform_proportion_file=isoform_proportion_file) # Generate coordinate UMI_count_mat_df = pd.read_csv("%s" % (UMI_count_mat_file), header=None, delimiter="\t") # Remove count matrix first column: feature names UMI_count_mat = UMI_count_mat_df.iloc[: , 1:].to_numpy() UMI_count_mat_cluster = pd.read_csv(synthetic_cell_label_file, header=None, delimiter="\t").to_numpy().flatten() n_cell = np.shape(UMI_count_mat)[1] # Read in SuperGene2gene proportion file isoform_prop_df = pd.read_csv("%s/%s" % (outdirectory,isoform_proportion_file), header=0, delimiter="\t") # Read in SuperGene bed file metagene_list = isoform_prop_df['MetageneID'].unique() open_peak = pd.read_csv(bed_file, header=None, delimiter="\t") open_peak_df = open_peak[0].astype(str) + "_" + open_peak[1].astype(str) + "_" + open_peak[2].astype(str) # Read in input BAM file samfile = pysam.AlignmentFile(INPUT_bamfile, "rb") # Read in merged gene fasta file transcript_dict = SeqIO.to_dict(SeqIO.parse(open(trancriptome_file), "fasta")) # Set random seed random.seed(2023) rng = np.random.default_rng(2023) # Generate random cell barcode random_cellbarcode_list = cellbarcode_generator(n_cell, size=16) with open(outdirectory + "/" + OUTPUT_cells_barcode_file, 'w') as f: for item in random_cellbarcode_list: f.write(item + "\n") cellbarcode_list_withclusters = np.vstack((random_cellbarcode_list, UMI_count_mat_cluster)).transpose() with open(outdirectory + "/" + OUTPUT_cells_barcode_file + ".withSynthCluster", 'w') as f: for item in cellbarcode_list_withclusters: f.write("\t".join(item) + "\n") with open("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), 'w') as fp: pass UMIcountmat_perisoform_percell_list = [] for metagene_cur_id in tqdm(range(len(metagene_list))): metagene_cur = metagene_list[metagene_cur_id] rec = metagene_cur.split("_") UMI_read_dict_pergene = defaultdict(lambda: []) ## Model UMI-Read empirical distribution reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) for read in reads: if read.has_tag(UMI_tag): UMI = read.get_tag(UMI_tag) start = read.reference_start if read.is_reverse==1: strand = "-" else: strand = "+" UMI_read_dict_pergene[UMI].append(str(start) + ":" + str(strand)) ## Real data: Obtain the empirical distribution of # reads of UMI in the gene nread_perUMI = [len(x) for x in UMI_read_dict_pergene.values()] # return a vector with the length of # UMIs in the gene # Synthetic isoform proportion isoform_prop_df_cur = isoform_prop_df.loc[isoform_prop_df['MetageneID'] == metagene_cur] # Replace NA proportion value with 0 isoform_prop_df_cur['Proportion'] = isoform_prop_df_cur['Proportion'].fillna(0) # Synthetic UMI count per gene UMI_count_permetagene = UMI_count_mat[np.where(open_peak_df == metagene_cur)[0],:].flatten() # Sample synthetic isoform-level UMI count from isoform prop. and gene-wise UMI count: n_isoform by n_subcell isoform_level_UMIcount = np.transpose(rng.multinomial(n=UMI_count_permetagene, pvals=isoform_prop_df_cur['Proportion'])) # sample multinomial for each cell with different experments number (size = 1) isoform_nonzero_id = np.nonzero(isoform_level_UMIcount.sum(axis=1))[0] UMIcountmat_perisoform_percell_list.append(isoform_level_UMIcount) # store isoform level count matrix into tmp list for isoform_id in isoform_nonzero_id: isoform_name = isoform_prop_df_cur.iloc[isoform_id]['GeneID'] # isoform_name = isoform_prop_df.iloc[isoform_id]['IsoformID'] print("[scReadSim] Generating Synthetic Reads for SuperGene %s Gene %s" % (metagene_cur, isoform_name)) # Sample syntheitc UMI's read number UMI_count_vec = isoform_level_UMIcount[isoform_id,:] # Synthetic umi count nonzer_UMI_zero_id = np.nonzero(UMI_count_vec)[0] synthetic_nread_perUMI_percell = [np.random.choice(nread_perUMI, size=UMI_count_vec[i], replace=True) for i in nonzer_UMI_zero_id] # return a vector with the length of non-zero count cells, each entry idnicates the number of reads for each UMI nread_synthetic = sum(np.sum(x) for x in synthetic_nread_perUMI_percell) # total number of synthetic reads synthetic_nread_perUMI_percell_unlist = [xs for x in synthetic_nread_perUMI_percell for xs in list(x)] # Sample synthetic read starting position (relative to transcripts) isoform_len = len(transcript_dict[isoform_name].seq) synthetic_read_start_list = np.random.randint(low=1, high=isoform_len, size=nread_synthetic) # Assign strand infor to synthetic reads (relative to transcripts; for single-end read, should be +, since transcripts already contain strand infor) synthetic_read_strand_list = ["+"] * nread_synthetic # Creat synthetic UMI string UMI_synthetic_uniq = cellbarcode_generator(np.sum(UMI_count_vec), size=UMI_len) UMI_synthetic_list = np.repeat(np.array(UMI_synthetic_uniq), synthetic_nread_perUMI_percell_unlist) # Organize syntheitc CB string CB_synthetic_repeat = [np.sum(x) for x in synthetic_nread_perUMI_percell] # same length to nozero cells CB_synthetic_list = np.repeat(np.array(random_cellbarcode_list)[nonzer_UMI_zero_id], CB_synthetic_repeat) # Create syntheitc read name string read_name_remaining_list = [str(metagene_cur) + ":"+ str(isoform_name) + "#" + str(count).zfill(4) for ind in range(len(CB_synthetic_repeat)) for count in range(CB_synthetic_repeat[ind])] read_name_list = [CB_synthetic_list[id] + UMI_synthetic_list[id] + ":" + read_name_remaining_list[id] for id in range(nread_synthetic)] # Create output table (seqID is not chr, but for isoform name) reads_cur = pd.DataFrame({ 'seqID': isoform_name, 'r1_start_shifted': synthetic_read_start_list, 'r1_end_shifted': np.clip(synthetic_read_start_list + read_len, None, isoform_len), # make sure read is within transcript length 'read_name': read_name_list, 'read_length': read_len, 'strand': synthetic_read_strand_list }) reads_cur.to_csv("%s/%s.read.bed" % (outdirectory, read_bedfile_prename), header=None, index=None, sep='\t', mode='a') # Output ground truth isoform read count matrix UMIcountmat_perisoform_percell = np.vstack(UMIcountmat_perisoform_percell_list) UMIcountmat_perisoform_percell_pd = pd.DataFrame(UMIcountmat_perisoform_percell, index=isoform_prop_df['GeneID']) UMIcountmat_perisoform_percell_pd.to_csv('%s/GeneLevel.SyntheticRead.UMIcountmatrix.txt' % outdirectory, sep="\t", header=False) print("\n[scReadSim] Created:") print("[scReadSim] Gene-level UMI count matrix file: %s/GeneLevel.SyntheticRead.UMIcountmatrix.txt" % (outdirectory)) print("[scReadSim] Read bed file: %s/%s.read.bed" % (outdirectory, read_bedfile_prename)) def combineFASTQ(gene_synthetic_fastq_prename, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename): print('[scReadSim] Combine Gene and InterGene FASTQ files...') sort_fastq_read1_cmd = "cat %s/%s.read1.bed2fa.sorted.fq %s/%s.read1.bed2fa.sorted.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, gene_synthetic_fastq_prename, outdirectory, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read1_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read1 synthetic fastq file:', error.decode()) sort_fastq_read2_cmd = "cat %s/%s.read2.bed2fa.sorted.fq %s/%s.read2.bed2fa.sorted.fq | paste - - - - | sort -k1,1 -S 3G | tr '\t' '\n' > %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, gene_synthetic_fastq_prename, outdirectory, intergene_synthetic_fastq_prename, outdirectory, synthetic_fastq_prename) output, error = subprocess.Popen(sort_fastq_read2_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort read2 synthetic fastq file:', error.decode()) print("\n[scReadSim] Created:") print("[scReadSim] Read 1 FASTQ file: %s/%s.read1.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Read 2 FASTQ file: %s/%s.read2.bed2fa.sorted.fq" % (outdirectory, synthetic_fastq_prename)) print("[scReadSim] Done.")