Source code for scReadSim.Utility

import csv
import numpy as np
import pysam
import pandas as pd
from collections import defaultdict
from time import process_time
import sys
import subprocess
from tqdm import tqdm
import os
from joblib import Parallel, delayed
from collections import Counter
from pathlib import Path


[docs] def CallPeak(macs3_directory, INPUT_bamfile, outdirectory, MACS3_peakname_pre, qval=0.05): """Perform peak calling using MACS3 Parameters ---------- macs3_directory: `str` Path to software MACS3. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Output directory of peak calling. MACS3_peakname_pre: `str` Base name of peak calling results for MACS3. """ macs_cmd = "%s/macs3 callpeak -f BAMPE -t %s -g mm -n %s/%s -B -q %s --outdir %s" % (macs3_directory, INPUT_bamfile, outdirectory, MACS3_peakname_pre, qval, outdirectory) output, error = subprocess.Popen(macs_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() print('[MACS3] Call peaks:\n', error.decode())
[docs] def ExtractBAMCoverage(INPUT_bamfile, samtools_directory, outdirectory): """Examine the covered chromosome names for the input bam file. Parameters ---------- INPUT_bamfile: `str` Directory of input BAM file. samtools_directory: `str` Directory of software samtools. outdirectory: `str` Output directory. Return ------ chromosomes_coverd: `list` List of chromosome names that the input bam files covers. """ cmd = "%s/samtools idxstats %s > %s/bam.stats.txt" % (samtools_directory, INPUT_bamfile, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to rerturn the index BAM information:\n', error.decode()) bamstats = pd.read_csv("%s/bam.stats.txt" % outdirectory, header=None, delimiter="\t").to_numpy() chromosomes_coverd = bamstats[np.nonzero(bamstats[:,2])[0],0].tolist() return chromosomes_coverd
[docs] def scATAC_CreateFeatureSets(INPUT_bamfile, samtools_directory, bedtools_directory, outdirectory, genome_size_file, peak_mode="macs3", macs3_directory=None, INPUT_peakfile=None, INPUT_nonpeakfile=None, OUTPUT_peakfile=None, superset_peakfile=None): """Create the foreground and background feature set for the input scATAC-seq bam file. Parameters ---------- INPUT_bamfile: `str` Directory of input BAM file. samtools_directory: `str` Directory of software samtools. bedtools_directory: `str` Directory of software bedtools. outdirectory: `str` Output directory. genome_size_file: `str` Directory of Genome sizes file. The file should be a tab delimited text file with two columns: first column for the chromosome name, second column indicates the size. peak_mode: `str` (default: macs3) Specify mode for trustworthy peak and non-peak generation, must be one of the following: "macs3", "user", and "superset". macs3_directory: `str` (default: None) Path to software MACS3. Must be specified if `INPUT_peakfile` and `INPUT_nonpeakfile` are None. Must be specified under peak_mode "macs3" or "superset". INPUT_peakfile: `str` (default: None) Directory of user-specified input peak file. Must be specified under peak_mode "user". INPUT_nonpeakfile: `str` (default: None) Directory of user-specified input non-peak file. Must be specified under peak_mode "user". superset_peakfile: `str` (default: None) Directory of a superset of potential chromatin open regions, including sources such as ENCODE cCRE (Candidate Cis-Regulatory Elements) collection. Must be specified under peak_mode "superset". OUTPUT_peakfile: `str` (default: None) Directory of user-specified output peak file. Synthetic scATAC-seq reads will be generated taking `OUTPUT_peakfile` as ground truth peaks. Note that `OUTPUT_peakfile` does not name the generated feature files by function `scATAC_CreateFeatureSets`. """ valid_modes = {"macs3", "user", "superset"} if peak_mode not in valid_modes: raise ValueError("[ERROR]: peak_mode must be one of %r." % valid_modes) chromosomes_coverd = ExtractBAMCoverage(INPUT_bamfile, samtools_directory, outdirectory) search_string_chr = '|'.join(chromosomes_coverd) cmd = "cat %s | grep -Ew '%s' > %s/genome_size_selected.txt" % (genome_size_file, search_string_chr, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to extract gene regions from genome annotation file:\n', error.decode()) # Input peaks and non-peaks if peak_mode == "macs3": print("[scReadSim] Detected peak mode: macs3") # Define peaks and non-peaks using MACS3 peakfile = outdirectory + "/" + "scReadSim.MACS3.peak.bed" nonpeakfile = outdirectory + "/" + "scReadSim.MACS3.nonpeak.bed" # Call peaks print("[scReadSim] Will identify trustworthy peaks and non-peaks using MACS3") print("[scReadSim] Generating peaks using MACS3...") CallPeak(macs3_directory, INPUT_bamfile, outdirectory, "scReadSim_MACS3_Stringent", qval=0.01) cmd = "%s/bedtools sort -i %s/scReadSim_MACS3_Stringent_peaks.narrowPeak | %s/bedtools merge > %s/scReadSim.MACS3.peak.bed" % (bedtools_directory, outdirectory, bedtools_directory, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create feature set:\n', error.decode()) print("[scReadSim] Peaks generated: %s/scReadSim.MACS3.peak.bed" % outdirectory) # Call non-peaks print("[scReadSim] Generating non-peaks using MACS3...") CallPeak(macs3_directory, INPUT_bamfile, outdirectory, "scReadSim_MACS3_LessStringent", qval=0.1) cmd = "%s/bedtools sort -i %s/scReadSim_MACS3_LessStringent_peaks.narrowPeak | %s/bedtools merge > %s/scReadSim_MACS3.nonpeak.tmp.bed" % (bedtools_directory, outdirectory, bedtools_directory, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create feature set:\n', error.decode()) # Calculate the inter peaks as non-peaks complement_cmd = "%s/bedtools complement -i %s/scReadSim_MACS3.nonpeak.tmp.bed -g %s/genome_size_selected.txt > %s/scReadSim.MACS3.nonpeak.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) print("[scReadSim] Non-peaks generated: %s/scReadSim.MACS3.nonpeak.bed" % outdirectory) elif peak_mode == "user": print("[scReadSim] Detected peak mode: user") print("[scReadSim] Will generate trustworthy peaks and non-peaks from user-specified peak and non-peaks") if INPUT_peakfile is None or INPUT_nonpeakfile is None: print("[scReadSim] INPUT_peakfile or INPUT_nonpeakfile not specified!") raise ValueError("[ERROR]: Under peak_mode user, INPUT_peakfile and INPUT_nonpeakfile should be specified.") peakfile = "scReadSim.UserInput.peak.bed" nonpeakfile = "scReadSim.UserInput.nonpeak.bed" # Merge and sort peak file print("[scReadSim] Merging input peak file: %s" % INPUT_peakfile) cmd = "%s/bedtools sort -i %s | %s/bedtools merge > %s/%s" % (bedtools_directory, INPUT_peakfile, bedtools_directory, outdirectory, peakfile) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort input peak set:\n', error.decode()) print("[scReadSim] Merging input non-peak File: %s" % INPUT_nonpeakfile) print("[scReadSim] Peaks generated: %s/%s" % (outdirectory, peakfile)) # Merge and sort non-peak file cmd = "%s/bedtools sort -i %s | %s/bedtools merge > %s/%s" % (bedtools_directory, INPUT_nonpeakfile, bedtools_directory, outdirectory, nonpeakfile) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to merge input non-peak feature set:\n', error.decode()) print("[scReadSim] Non-peaks generated: %s/%s" % (outdirectory, nonpeakfile)) elif peak_mode == "superset": print("[scReadSim] Detected peak mode: superset") print("[scReadSim] Will generate trustworthy peaks using a superset of possible open chromatin regions") if superset_peakfile is None: print("[scReadSim] Argument superset_peakfile not specified!") raise ValueError("[ERROR]: Under peak_mode superset, superset_peakfile should be specified.") peakfile = "scReadSim.superset.peak.bed" nonpeakfile = "scReadSim.superset.nonpeak.bed" # Merge and sort superset_peakfile_tmp print("[scReadSim] Merging and sorting superset peak file: %s" % superset_peakfile) cmd = "%s/bedtools sort -i %s | %s/bedtools merge > %s/scReadSim.superset.peak.tmp.bed" % (bedtools_directory, superset_peakfile, bedtools_directory, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort input peak set:\n', error.decode()) # Calculate the inter peaks as superset_nonpeakfile_tmp complement_cmd = "%s/bedtools complement -i %s/scReadSim.superset.peak.tmp.bed -g %s/genome_size_selected.txt > %s/scReadSim.superset.nonpeak.tmp.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) # Identify peaks using MACS3 print("[scReadSim] Identifying potential peaks from input BAM file using MACS3...") CallPeak(macs3_directory, INPUT_bamfile, outdirectory, "scReadSim_MACS3_Stringent", qval=0.01) cmd = "%s/bedtools sort -i %s/scReadSim_MACS3_Stringent_peaks.narrowPeak | %s/bedtools merge > %s/scReadSim.MACS3.peak.bed" % (bedtools_directory, outdirectory, bedtools_directory, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create feature set:\n', error.decode()) # Call non-peaks using MACS3 print("[scReadSim] Identifying potential non-peaks from input BAM file using MACS3...") CallPeak(macs3_directory, INPUT_bamfile, outdirectory, "scReadSim_MACS3_LessStringent", qval=0.1) cmd = "%s/bedtools sort -i %s/scReadSim_MACS3_LessStringent_peaks.narrowPeak | %s/bedtools merge > %s/scReadSim_MACS3.nonpeak.tmp.bed" % (bedtools_directory, outdirectory, bedtools_directory, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create feature set:\n', error.decode()) # Calculate the inter peaks as non-peaks complement_cmd = "%s/bedtools complement -i %s/scReadSim_MACS3.nonpeak.tmp.bed -g %s/genome_size_selected.txt > %s/scReadSim.MACS3.nonpeak.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) # Loop superset_peak, find those overlapping half a superset peak (-f 0.5) print("[scReadSim] Selecting final trustworthy peaks from superset peak list...") intersect_peak_cmd = "%s/bedtools intersect -a %s/scReadSim.superset.peak.tmp.bed -b %s/scReadSim.MACS3.peak.bed -f 0.5 > %s/scReadSim.superset.peak.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(intersect_peak_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) print("[scReadSim] Peaks generated: %s/%s" % (outdirectory, peakfile)) # Loop superset_nonpeak, find those overlapping half a superset nonpeak print("[scReadSim] Selecting final trustworthy non-peaks from superset non-peak list...") intersect_nonpeak_cmd = "%s/bedtools intersect -a %s/scReadSim.superset.nonpeak.tmp.bed -b %s/scReadSim.MACS3.nonpeak.bed -f 0.5 > %s/scReadSim.superset.nonpeak.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(intersect_nonpeak_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) print("[scReadSim] Nno-peaks generated: %s/%s" % (outdirectory, nonpeakfile)) # Union peaks and non-peaks print("[scReadSim] Generating gray areas...") cmd = "cat %s/%s %s/%s | bedtools sort | bedtools merge > %s/Peak_NonPeaks_Union.bed" % (outdirectory, peakfile, outdirectory, nonpeakfile, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() complement_cmd = "%s/bedtools complement -i %s/Peak_NonPeaks_Union.bed -g %s/genome_size_selected.txt > %s/scReadSim.grayareas.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create gray area feature set:\n', error.decode()) print("[scReadSim] Gray areas generated: %s/scReadSim.grayareas.bed" % outdirectory) print('\n[scReadSim] Created:') print('[scReadSim] Peak File: %s/%s' % (outdirectory, peakfile)) print('[scReadSim] Non-Peak File: %s/%s' % (outdirectory, nonpeakfile)) print('[scReadSim] Gray Area File: %s/scReadSim.grayareas.bed' % (outdirectory)) # Output Peak Mode if OUTPUT_peakfile is not None: output_peakfile = "scReadSim.output.peak.bed" output_nonpeakfile = "scReadSim.output.nonpeak.bed" print("\n[scReadSim] User-specified Output Peaks Detected: %s" % OUTPUT_peakfile) # Merge and sort output peak file print("[scReadSim] Merging Output Peak File: %s" % OUTPUT_peakfile) cmd = "%s/bedtools sort -i %s | %s/bedtools merge > %s/%s" % (bedtools_directory, OUTPUT_peakfile, bedtools_directory, outdirectory, output_peakfile) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to sort output peak set:\n', error.decode()) print("Generating Output Non-Peaks...") complement_cmd = "%s/bedtools complement -i %s -g %s/genome_size_selected.txt > %s/%s" % (bedtools_directory, OUTPUT_peakfile, outdirectory, outdirectory, output_nonpeakfile) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create output non-peak feature set:\n', error.decode()) print('\n[scReadSim] Created:') print('[scReadSim] Output Peak File: %s/%s' % (outdirectory, output_peakfile)) print('[scReadSim] Output Non-Peak File: %s/%s' % (outdirectory, output_nonpeakfile)) print('[scReadSim] Done!')
[docs] def scRNA_CreateFeatureSets(INPUT_bamfile, samtools_directory, bedtools_directory, outdirectory, genome_annotation, genome_size_file): """Create the foreground and background feature set for the input scRNA-seq bam file. Parameters ---------- INPUT_bamfile: `str` Input BAM file. samtools_directory: `str` Path to software `samtools`. bedtools_directory: `str` Path to software `bedtools`. outdirectory: `str` Specify the output directory of the features files. genome_annotation: `str` Genome annotation file for the reference genome that the input BAM aligned on or the synthetic BAM should align on. genome_size_file: `str` Genome sizes file. The file should be a tab delimited text file with two columns: first column for the chromosome name, second column indicates the size. """ genome_size_df = pd.read_csv(genome_size_file, header=None, delimiter="\t") chromosomes_coverd = ExtractBAMCoverage(INPUT_bamfile, samtools_directory, outdirectory) search_string_chr = '|'.join(chromosomes_coverd) cmd = "cat %s | grep -Ew '%s' > %s/genome_size_selected.txt" % (genome_size_file, search_string_chr, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to extract corresponding chromosomes from genome size file:\n', error.decode()) cmd = """awk -F"\t" '$3=="gene"' %s | cut -f1,4,5 > %s/gene_region.bed""" % (genome_annotation, outdirectory) output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to extract gene regions from genome annotation file:\n', error.decode()) cmd = "%s/bedtools sort -i %s/gene_region.bed | %s/bedtools merge | grep -Ew '%s' > %s/scReadSim.Gene.bed" % (bedtools_directory, outdirectory, bedtools_directory, search_string_chr, outdirectory) print("[scReadSim] Generating Bed File for Genes...") output, error = subprocess.Popen(cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create feature set:\n', error.decode()) os.system("rm %s/gene_region.bed" % outdirectory) print("[scReadSim] Generating Bed File for InterGenes...") complement_cmd = "%s/bedtools complement -i %s/scReadSim.Gene.bed -g %s/genome_size_selected.txt > %s/scReadSim.InterGene.bed" % (bedtools_directory, outdirectory, outdirectory, outdirectory) output, error = subprocess.Popen(complement_cmd, shell=True, executable="/bin/bash", stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate() if error: print('[ERROR] Fail to create complementary feature set:\n', error.decode()) print('\n[scReadSim] Created:') print('[scReadSim] Gene Bed File: %s/scReadSim.Gene.bed' % (outdirectory)) print('[scReadSim] InterGene Bed File: %s/scReadSim.InterGene.bed' % (outdirectory)) print('[scReadSim] Done!')
[docs] def countmat_mainloop(rec_id): """Construct count vector for each scATAC-seq feature. """ count_array = np.zeros(cells_n, dtype=int) # initialize rec = open_peak[rec_id] rec_name = '_'.join((rec[0], str(rec[1]), str(rec[2]))) samfile = pysam.AlignmentFile(INPUT_bamfile_glb, "rb") reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) # question: what about feature intersection or half overlap? cell_iter = [] cell_idx_ls = [] for read in reads: cell_iter.append(read.qname.split(":")[0].upper()) for cell in cell_iter: if cell in cells_barcode: cell_idx_ls.append(cells_barcode.index(cell)) counter = Counter(cell_idx_ls) keys = list(counter.keys()) values = list(counter.values()) count_array[keys] = values count_array_withPeak = np.insert(count_array.astype(str), 0, rec_name) return count_array_withPeak
[docs] def scATAC_bam2countmat_paral(cells_barcode_file, bed_file, INPUT_bamfile, outdirectory, count_mat_filename, n_cores=1): """Construct count matrix for scATAC-seq BAM file. Parameters ---------- cells_barcode_file: `str` Cell barcode file corresponding to the input BAM file. bed_file: `str` Features bed file to generate the count matrix. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory of the count matrix file. count_mat_filename: `str` Specify the base name of output count matrix. n_cores: `int` (default: 1) Specify the number of cores for parallel computing when generating count matrix. """ cells = pd.read_csv(cells_barcode_file, sep="\t", header=None) cells = cells.values.tolist() # Specify global vars global open_peak, cells_n, cells_barcode, INPUT_bamfile_glb INPUT_bamfile_glb = INPUT_bamfile cells_barcode = [item[0] for item in cells] with open(bed_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) k = 0 cellsdic = defaultdict(lambda: [None]) for cell in cells_barcode: cellsdic[cell] = k k += 1 k = 0 peaksdic = defaultdict(lambda: [None]) for rec in open_peak: rec_name = '_'.join(rec) peaksdic[rec_name] = k k += 1 cells_n = len(cells_barcode) peaks_n = len(open_peak) print("[scReadSim] Generating read count matrix...\n") try: mat_array = Parallel(n_jobs=n_cores, backend='multiprocessing')(delayed(countmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) except: print("[scReadSim] Parallel computing failed. Start computing with one core...") mat_array = Parallel(n_jobs=1, backend='multiprocessing')(delayed(countmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) para_countmat = np.array(mat_array) with open("%s/%s.txt" % (outdirectory, count_mat_filename), 'w') as outsfile: for rec_id in tqdm(range(len(open_peak))): print("\t".join([str(x) for x in para_countmat[rec_id,:]]),file = outsfile) print('[scReadSim] Created:') print('[scReadSim] Read count matrix: %s/%s.txt' % (outdirectory, count_mat_filename)) print('[scReadSim] Done!')
[docs] def scRNA_UMIcountmat_mainloop(rec_id): """Construct count vector for each scRNA-seq feature. """ UMI_currlist = [["empty UMI"] for _ in range(cells_n)] rec = open_peak[rec_id] rec_name = '_'.join((rec[0], str(rec[1]), str(rec[2]))) samfile = pysam.AlignmentFile(INPUT_bamfile_glb, "rb") reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) for read in reads: cell = read.qname.split(":")[0].upper() if cell in cells_barcode: try: if read.has_tag(UMI_tag_glb): UMI = read.get_tag(UMI_tag_glb) UMI_currlist[cellsdic[cell]].append(UMI) except KeyError: pass UMI_count_array = [len(set(UMIs_percell))-1 for UMIs_percell in UMI_currlist] UMI_count_array.insert(0,rec_name) return UMI_count_array
[docs] def scRNA_bam2countmat_paral(cells_barcode_file, bed_file, INPUT_bamfile, outdirectory, count_mat_filename, UMI_modeling=True, UMI_tag="UB:Z", n_cores=1): """Construct read (or UMI) count matrix for scRNA-seq BAM file. Parameters ---------- cells_barcode_file: `str` Cell barcode file corresponding to the input BAM file. bed_file: `str` Features bed file to generate the count matrix. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory of the count matrix file. count_mat_filename: `str` Specify the base name of output read (or UMI) count matrix. UMI_modeling: `bool` (default: True) Specify whether scReadSim should model UMI count of the input BAM file. UMI_tag: `str` (default: 'UB:Z') If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq. n_cores: `int` (default: 1) Specify the number of cores for parallel computing when generating count matrix. """ cells = pd.read_csv(cells_barcode_file, sep="\t", header=None) cells = cells.values.tolist() # Specify global vars global open_peak, cells_barcode, INPUT_bamfile_glb, UMI_tag_glb, cellsdic UMI_tag_glb = UMI_tag INPUT_bamfile_glb = INPUT_bamfile cells_barcode = [item[0] for item in cells] with open(bed_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) k = 0 cellsdic = defaultdict(lambda: [None]) for cell in cells_barcode: cellsdic[cell] = k k += 1 k = 0 peaksdic = defaultdict(lambda: [None]) for rec in open_peak: rec_name = '_'.join(rec) peaksdic[rec_name] = k k += 1 global cells_n cells_n = len(cells_barcode) peaks_n = len(open_peak) if UMI_modeling == True: print("[scReadSim] UMI Mode Detected.") print("[scReadSim] Generating UMI Count Matrix...") try: UMI_countmat_array = Parallel(n_jobs=n_cores, backend='multiprocessing')(delayed(scRNA_UMIcountmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) except: print("[scReadSim] Parallel computing failed. Start computing with one core...") UMI_countmat_array = Parallel(n_jobs=1, backend='multiprocessing')(delayed(scRNA_UMIcountmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) print("[scReadSim] Generated UMI Count Matrix.") UMI_countmat = np.array(UMI_countmat_array) print("[scReadSim] Writing UMI Count Matrix TXT File...") with open("%s/%s.txt" % (outdirectory, count_mat_filename), 'w') as outsfile: for rec_id in tqdm(range(len(open_peak))): print("\t".join([str(x) for x in UMI_countmat[rec_id,:]]),file = outsfile) print("[scReadSim] Created:") print("[scReadSim] UMI Count Matrix %s.txt" % count_mat_filename) else: print("[scReadSim] Detected that UMI Mode Is Off.") print("[scReadSim] Generating Read Count Matrix...") try: read_countmat_array = Parallel(n_jobs=n_cores, backend='multiprocessing')(delayed(countmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) except: print("[scReadSim] Parallel computing failed. Start computing with one core...") read_countmat_array = Parallel(n_jobs=1, backend='multiprocessing')(delayed(countmat_mainloop)(rec_id) for rec_id in (range(len(open_peak)))) read_countmat = np.array(read_countmat_array) print("[scReadSim] Writing Read Count Matrix TXT File...") with open("%s/%s.txt" % (outdirectory, count_mat_filename), 'w') as outsfile: for rec_id in tqdm(range(len(open_peak))): print("\t".join([str(x) for x in read_countmat[rec_id,:]]),file = outsfile) print("[scReadSim] Created:") print("[scReadSim] Read count matrix %s.txt" % count_mat_filename) print("[scReadSim] Done.\n")
[docs] def scATAC_bam2countmat_OutputPeak(cells_barcode_file, assignment_file, INPUT_bamfile, outdirectory, count_mat_filename): """Construct count matrix for task with user input features set. Parameters ---------- cells_barcode_file: `str` Cell barcode file corresponding to the input BAM file. assignment_file: `str` Features mapping file output by function `Utility.FeatureMapping`. INPUT_bamfile: `str` Input BAM file for anlaysis. outdirectory: `str` Specify the output directory of the count matrix file. count_mat_filename: `str` Specify the base name of output count matrix. """ cells = pd.read_csv(cells_barcode_file, sep="\t", header=None) cells = cells.values.tolist() cells_barcode = [item[0] for item in cells] with open("%s/%s.txt" % (outdirectory, count_mat_filename), 'w') as outsfile: samfile = pysam.AlignmentFile(INPUT_bamfile, "rb") with open(assignment_file) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) k = 0 cellsdic = defaultdict(lambda: [None]) for cell in cells_barcode: cellsdic[cell] = k k += 1 k = 0 peaksdic = defaultdict(lambda: [None]) for rec in open_peak: rec_name = '_'.join(rec) peaksdic[rec_name] = k k += 1 cells_n = len(cells_barcode) peaks_n = len(open_peak) # marginal_count_vec = [0] * len(open_peak) print("[scReadSim] Generating count matrix...") # for rec in open_peak: for rec_id in tqdm(range(len(open_peak))): rec = open_peak[rec_id] rec_name = '_'.join(rec) currcounts = [0]*cells_n reads = samfile.fetch(rec[3], int(rec[4]), int(rec[5])) for read in reads: cell = read.qname.split(":")[0].upper() if cell in cells_barcode: try: currcounts[cellsdic[cell]] += 1 except KeyError: pass # marginal_count_vec[rec_id] = sum(currcounts) # if sum(currcounts) > 0: print(rec_name + "\t" + "\t".join([str(x) for x in currcounts]),file = outsfile)
[docs] def find_nearest_peak(array, value, k, ref_read_density): """Find the index of largest-read-density input peak from `array` with a length similar to `value`. """ array = np.asarray(array) # idx = (np.abs(array - value)).argmin() idx = (np.abs(array - value)).argpartition(k) idx_ReadDensity = ref_read_density[idx[:k]].argmax() final_idx = idx[idx_ReadDensity] return final_idx
[docs] def find_nearest_nonpeak(array, value, k, ref_read_density): """Find the index of largest-read-density input non-peak from `array` with a length similar to `value`. """ array = np.asarray(array) # idx = (np.abs(array - value)).argmin() idx = (np.abs(array - value)).argpartition(k) idx_ReadDensity = ref_read_density[idx[:k]].argmin() final_idx = idx[idx_ReadDensity] return final_idx
def fragment_length(open_peak): output = np.asarray([int(x[2]) - int(x[1]) for x in open_peak]) return output
[docs] def match_peak(MarginalCountList, output_peaks, input_peaks, outdirectory, assignment_file, n_top): """Map from output peaks to input peaks according to the similarity of peak length. """ with open("%s" % (output_peaks)) as true_peak_file: reader = csv.reader(true_peak_file, delimiter="\t") true_peak_set = np.asarray(list(reader)) with open("%s" % (input_peaks)) as ref_peak_file: reader = csv.reader(ref_peak_file, delimiter="\t") ref_peak_set = np.asarray(list(reader)) ref_marginal_count = np.asarray(MarginalCountList) ref_peak_fraglen = fragment_length(ref_peak_set) ref_read_density = ref_marginal_count.astype(int) / ref_peak_fraglen # ref_peak_fraglen.view('i8,i8,i8').sort(order=['f0'], axis=0) n_top_select = min(len(ref_marginal_count)-1, n_top) with open("%s/%s" % (outdirectory, assignment_file), 'w') as outsfile: for true_peak_id in tqdm(range(len(true_peak_set))): true_peak = true_peak_set[true_peak_id] true_length = int(true_peak[2]) - int(true_peak[1]) idx = find_nearest_peak(ref_peak_fraglen, true_length, n_top_select, ref_read_density) print("\t".join(true_peak[0:3]) + '\t' + "\t".join(ref_peak_set[idx][0:3]), file=outsfile)
[docs] def match_nonpeak(MarginalCountList ,output_nonpeaks, input_nonpeaks, outdirectory, assignment_file, n_top): """Map from output non-peaks to input non-peaks according to the similarity of peak length. """ with open("%s" % (output_nonpeaks)) as true_peak_file: reader = csv.reader(true_peak_file, delimiter="\t") true_peak_set = np.asarray(list(reader)) with open("%s" % (input_nonpeaks)) as ref_peak_file: reader = csv.reader(ref_peak_file, delimiter="\t") ref_peak_set = np.asarray(list(reader)) ref_marginal_count = np.asarray(MarginalCountList) ref_peak_fraglen = fragment_length(ref_peak_set) ref_read_density = ref_marginal_count.astype(int) / ref_peak_fraglen n_top_select = min(len(ref_marginal_count)-1, n_top) # ref_peak_fraglen.view('i8,i8,i8').sort(order=['f0'], axis=0) with open("%s/%s" % (outdirectory, assignment_file), 'w') as outsfile: for true_peak_id in tqdm(range(len(true_peak_set))): true_peak = true_peak_set[true_peak_id] true_length = int(true_peak[2]) - int(true_peak[1]) idx = find_nearest_nonpeak(ref_peak_fraglen, true_length, n_top_select, ref_read_density) print("\t".join(true_peak[0:3]) + '\t' + "\t".join(ref_peak_set[idx][0:3]), file=outsfile)
[docs] def bam2MarginalCount(bed_file, sam_filename): """Count read overlapped for each feature from input bed file. """ samfile = pysam.AlignmentFile(sam_filename, "rb") with open("%s" % (bed_file)) as open_peak: reader = csv.reader(open_peak, delimiter="\t") open_peak = np.asarray(list(reader)) k = 0 peaksdic = defaultdict(lambda: [None]) for rec in open_peak: rec_name = '_'.join(rec) peaksdic[rec_name] = k k += 1 peaks_n = len(open_peak) MarginalCountList = np.empty((peaks_n), dtype="int") print("[scReadSim] Converting marginal count vector...") # for rec in open_peak: for rec_id in tqdm(range(len(open_peak))): rec = open_peak[rec_id] rec_name = '_'.join(rec) currcounts = 0 reads = samfile.fetch(rec[0], int(rec[1]), int(rec[2])) for read in reads: currcounts += 1 # if sum(currcounts) > 0: MarginalCountList[rec_id] = int(currcounts) # print(rec_name + "\t" + str(currcounts),file = outsfile) return MarginalCountList
[docs] def FeatureMapping(INPUT_bamfile, input_peaks, input_nonpeaks, output_peaks, output_nonpeaks, outdirectory, assignment_peak_file, assignment_nonpeak_file, n_top=50): """Obtain mappings between input and output peaks, input and output non-peaks. The mappings are output as `assignment_peak_file` and `assignment_nonpeak_file` within `outdirectory`. Parameters ---------- INPUT_bamfile: `str` Input BAM file for anlaysis. input_peaks: `str` BED file of user specified (or generated by scReadSim+MACS3) input peaks. input_nonpeaks: `str` BED file of user specified (or generated by scReadSim+MACS3) input nonpeaks. output_peaks: `str` BED file of user specified output peaks. output_nonpeaks: `str` BED file of output non-peaks, generated by function 'scATAC_CreateFeatureSets'. outdirectory: `str` Output directory. assignment_peak_file: `str` Specify the name of peak mapping file. assignment_nonpeak_file: `str` Specify the name of nonpeak mapping file. n_top: 'int'(default: 50) Specify the number of input peaks (or non-peaks) with the most similar length as the candidate mapped input peaks (or non-peaks) for each the output peak (or non-peak). From the candidate input peaks (or non-peaks), scReadSim further selects the one with largest read density for peak mapping (smallest read density for non-peak mapping). """ peak_MarginalCountList = bam2MarginalCount(input_peaks, INPUT_bamfile) print("[scReadSim] Mapping Input Peaks and Output Peaks...") match_peak(peak_MarginalCountList, output_peaks, input_peaks, outdirectory, assignment_peak_file, n_top) nonpeak_MarginalCountList = bam2MarginalCount(input_nonpeaks, INPUT_bamfile) print("[scReadSim] Mapping Input Non-Peaks and Output Non-Peaks...") match_nonpeak(nonpeak_MarginalCountList, output_nonpeaks, input_nonpeaks, outdirectory, assignment_nonpeak_file, n_top) print('[scReadSim] Created:') print('[scReadSim] Mapping File between Input and Output Peaks: %s/%s.txt' % (outdirectory, assignment_peak_file)) print('[scReadSim] Mapping File between Input and Output Peaks: %s/%s.txt' % (outdirectory, assignment_nonpeak_file)) print('[scReadSim] Done!')
# Multi-sample def scATAC_CreateFeatureSets_MultiSample(INPUT_bamfile, samtools_directory, bedtools_directory, outdirectory, genome_size_file, macs3_directory, superset_peakfile, OUTPUT_peakfile=None): """Multi-sample/replicate implement of scReadSim for generating scATAC-seq features. Parameters ---------- INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). samtools_directory: `str` Directory of software samtools. bedtools_directory: `str` Directory of software bedtools. outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. genome_size_file: `str` Directory of Genome sizes file. The file should be a tab delimited text file with two columns: first column for the chromosome name, second column indicates the size. macs3_directory: `str` Path to software MACS3. superset_peakfile: `str` Directory of a superset of potential chromatin open regions, including sources such as ENCODE cCRE (Candidate Cis-Regulatory Elements) collection. OUTPUT_peakfile: `str` (default: None) Directory of user-specified output peak file. Synthetic scATAC-seq reads will be generated taking `OUTPUT_peakfile` as ground truth peaks. Note that `OUTPUT_peakfile` does not name the generated feature files by function `scATAC_CreateFeatureSets`. """ if len(INPUT_bamfile) > 1: print("[scReadSim] Multiple input BAM files detected.") # Create individual output directories print("[scReadSim] Creating individual output directory for each sample/replicate.") for rep_id in range(len(INPUT_bamfile)): sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) if not Path(sample_output_d).is_dir(): os.mkdir(sample_output_d) # Preparing features for each sample print("\n[scReadSim] Creating features for sample %s." % str(rep_id+1)) scATAC_CreateFeatureSets(peak_mode="superset", INPUT_bamfile=INPUT_bamfile[rep_id], samtools_directory=samtools_directory, bedtools_directory=bedtools_directory, outdirectory=sample_output_d, genome_size_file=genome_size_file, macs3_directory=macs3_directory, superset_peakfile=superset_peakfile, OUTPUT_peakfile=OUTPUT_peakfile) elif len(INPUT_bamfile) == 1: raise ValueError("[ERROR]: Only one input BAM file detected. Please use single sample version scReadSim.Utility.scATAC_CreateFeatureSets instead.")
[docs] def scATAC_bam2countmat_paral_MultiSample(cells_barcode_file, INPUT_bamfile, outdirectory, n_cores=1): """Multi-sample/replicate implement of scReadSim for constructing count matrices for scATAC-seq BAM file. Parameters ---------- cells_barcode_file: `str` List of cell barcode files corresponding to the input BAM files. INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. n_cores: `int` (default: 1) Specify the number of cores for parallel computing when generating count matrix. """ if len(cells_barcode_file) != len(INPUT_bamfile): raise ValueError("[ERROR]: Number of input cell barcode files does not match the number of inputBAM files!") for rep_id in range(len(INPUT_bamfile)): print("\n[scReadSim] Creating count matrices for sample %s." % str(rep_id+1)) sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) # Obtain bedfile peak_bedfile = sample_output_d + "/" + "scReadSim.superset.peak.bed" nonpeak_bedfile = sample_output_d + "/" + "scReadSim.superset.nonpeak.bed" # Specify the output count matrices' prenames count_mat_peak_filename = "Rep%s.peak.countmatrix" % str(rep_id+1) count_mat_nonpeak_filename = "Rep%s.nonpeak.countmatrix" % str(rep_id+1) # Construct count matrix for peaks print("\n[scReadSim] Generating read counts for peaks...") scATAC_bam2countmat_paral(cells_barcode_file=cells_barcode_file[rep_id], bed_file=peak_bedfile, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, count_mat_filename=count_mat_peak_filename, n_cores=n_cores) # Construct count matrix for non-peaks print("\n[scReadSim] Generating read counts for non-peaks...") scATAC_bam2countmat_paral(cells_barcode_file=cells_barcode_file[rep_id], bed_file=nonpeak_bedfile, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, count_mat_filename=count_mat_nonpeak_filename, n_cores=n_cores)
def scRNA_CreateFeatureSets_MultiSample(INPUT_bamfile, samtools_directory, bedtools_directory, outdirectory, genome_size_file, genome_annotation): """Multi-sample/replicate implement of scReadSim for generating scRNA-seq features. Parameters ---------- INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). samtools_directory: `str` Path to software `samtools`. bedtools_directory: `str` Path to software `bedtools`. outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. genome_size_file: `str` Genome sizes file. The file should be a tab delimited text file with two columns: first column for the chromosome name, second column indicates the size. genome_annotation: `str` Genome annotation file for the reference genome that the input BAM aligned on or the synthetic BAM should align on. """ if len(INPUT_bamfile) > 1: print("[scReadSim] Multiple input BAM files detected.") # Create individual output directories print("[scReadSim] Creating individual output directory for each sample/replicate.") for rep_id in range(len(INPUT_bamfile)): sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) if not Path(sample_output_d).is_dir(): os.mkdir(sample_output_d) # Preparing features for each sample print("\n[scReadSim] Creating features for sample %s." % str(rep_id+1)) scRNA_CreateFeatureSets(INPUT_bamfile=INPUT_bamfile[rep_id], samtools_directory=samtools_directory, bedtools_directory=bedtools_directory, outdirectory=sample_output_d, genome_annotation=genome_annotation, genome_size_file=genome_size_file) elif len(INPUT_bamfile) == 1: raise ValueError("[ERROR]: Only one input BAM file detected. Please use single sample version scReadSim.Utility.scRNA_CreateFeatureSets instead.")
[docs] def scRNA_bam2countmat_paral_MultiSample(cells_barcode_file, INPUT_bamfile, outdirectory, n_cores=1, UMI_modeling=True, UMI_tag = "UB:Z"): """Multi-sample/replicate implement of scReadSim for constructing count matrices for scRNA-seq BAM file. Parameters ---------- cells_barcode_file: `str` List of cell barcode files corresponding to the input BAM files. INPUT_bamfile: `str` List of input BAM files (use absolute paths to the BAM files). outdirectory: `str` Specify the working directory of scReadSim for generating intermediate and final output files. n_cores: `int` (default: 1) Specify the number of cores for parallel computing when generating count matrix. UMI_modeling: `bool` (default: True) Specify whether scReadSim should model UMI count of the input BAM file. UMI_tag: `str` (default: 'UB:Z') If UMI_modeling is set to True, specify the UMI tag of input BAM file, default value 'UB:Z' is the UMI tag for 10x scRNA-seq. """ if len(cells_barcode_file) != len(INPUT_bamfile): raise ValueError("[ERROR]: Number of input cell barcode files does not match the number of inputBAM files!") for rep_id in range(len(INPUT_bamfile)): print("\n[scReadSim] Creating count matrices for sample %s." % str(rep_id+1)) sample_output_d = outdirectory + "/" + "Rep" + str(rep_id+1) # Obtain bedfile # Specify the path to bed files generated by Utility.scRNA_CreateFeatureSets gene_bedfile = sample_output_d + "/" + "scReadSim.Gene.bed" intergene_bedfile = sample_output_d + "/" + "scReadSim.InterGene.bed" # Specify the output count matrices' prenames UMI_gene_count_mat_filename = "Rep%s.gene.countmatrix" % str(rep_id+1) UMI_intergene_count_mat_filename = "Rep%s.intergene.countmatrix" % str(rep_id+1) # Construct count matrix for genes print("\n[scReadSim] Generating read counts for genes...") scRNA_bam2countmat_paral(cells_barcode_file=cells_barcode_file[rep_id], bed_file=gene_bedfile, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, count_mat_filename=UMI_gene_count_mat_filename, UMI_modeling=UMI_modeling, UMI_tag = UMI_tag, n_cores=n_cores) print("\n[scReadSim] Generating read counts for inter-genes...") # Construct count matrix for inter-genes scRNA_bam2countmat_paral(cells_barcode_file=cells_barcode_file[rep_id], bed_file=intergene_bedfile, INPUT_bamfile=INPUT_bamfile[rep_id], outdirectory=sample_output_d, count_mat_filename=UMI_intergene_count_mat_filename, UMI_modeling=UMI_modeling, UMI_tag = UMI_tag, n_cores=n_cores)