3.SNV calling.RmdWe used SComatic to obtain SNV profile of tumor cells from scATAC-seq data. For more details, please refer to https://github.com/cortes-ciriano-lab/SComatic.
# Create a directory for SNV calling in the terminal
mkdir -p 3.SNV_calling
# Step1: split bam files for different cell types
utput_dir1=step1_BamCellTypes
mkdir -p $output_dir1
python ~/App/SComatic-main/scripts/SplitBam/SplitBamCellTypes.py \
--bam /Path/to/bam/file \
--meta ../1.Cellanno/HCC.celltype.txt\
--id example \
--min_MQ 30 \
--outdir $output_dir1
################################################################################
# Step2: summarize base count
REF=/Path/to/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa
output_dir2=step2_BaseCellCounts
mkdir -p $output_dir2
for bam in $(ls -d ./step1_BamCellTypes/*bam);do
# Cell type
cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')
# Temp folder
temp=$output_dir2/temp_${cell_type}
mkdir -p $temp
# Command line to submit to cluster
python ~/App/SComatic-main/scripts/BaseCellCounter/BaseCellCounter.py --bam $bam \
--ref $REF \
--chrom all \
--out_folder $output_dir2 \
--bed /Path/to/ATAC/bed/file \
--min_bq 30 \
--min_mq 30 \
--tmp_dir $temp \
--nprocs 30
rm -rf $temp
done
################################################################################
# Step3: merge counts
output_dir3=step3_BaseCellCountsMerged
mkdir -p $output_dir3
python ~/App/SComatic-main/scripts/MergeCounts/MergeBaseCellCounts.py --tsv_folder ./step2_BaseCellCounts --outfile ${output_dir3}/BaseCellCounts.AllCellTypes.tsv
################################################################################
# Step4: Mutation detection
# Step 4.1
output_dir4=/step4_VariantCalling
mkdir -p $output_dir4
sample=example
python ~/App/SComatic-main/scripts/BaseCellCalling/BaseCellCalling.step1.py \
--infile step3_BaseCellCountsMerged/BaseCellCounts.AllCellTypes.tsv \
--outfile ${output_dir4}/${sample} \
--ref $REF
# Step 4.2
PON=/Path/to/PoN.scATACseq.hg38.tsv
python ~/App/SComatic-main/scripts/BaseCellCalling/BaseCellCalling.step2.py \
--infile ${output_dir4}/${sample}.calling.step1.tsv \
--outfile ${output_dir4}/${sample} \
--pon $PON
################################################################################
# Step5: site calling
STEP4_1=/step4_VariantCalling/${sample}.calling.step1.tsv
output_dir5=step5_UniqueCellCallableSites
mkdir -p $output_dir5
for bam in $(ls -d ./step1_BamCellTypes/*bam);do
cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')
echo $cell_type
temp=$output_dir5/temp_${cell_type}
mkdir -p $temp
python ~/App/SComatic-main/scripts/SitesPerCell/SitesPerCell.py --bam $bam \
--infile ./step4_VariantCalling/${sample}.calling.step1.tsv \
--min_mq 30 \
--ref $REF \
--out_folder $output_dir5 \
--tmp_dir $temp \
--nprocs 60
echo
done
################################################################################
# Step6: get genotypes
output_dir6=step6_SingleCellAlleles
mkdir -p $output_dir6
for bam in $(ls -d ./step1_BamCellTypes/*bam);do
cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')
temp=$output_dir6/temp_${cell_type}
mkdir -p $temp
python ~/App/SComatic-main/scripts/SingleCellGenotype/SingleCellGenotype.py --bam $bam \
--infile ${STEP4_2_pass} \
--nprocs 30 \
--meta $META \
--outfile ${output_dir6}/${cell_type}.single_cell_genotype.tsv \
--tmp_dir $temp \
--min_mq 30 \
--ref $REF
rm -rf $temp
doneAfter running the pipeling of SComatic, we can generate a cell-by-SNV matrix for MAAS integration as follows. This step will keep cells with favorable callable sites and remove low-quality mutations.
rm(list = ls())
library(foreach)
library(tibble)
library(data.table)
library(dplyr)
library(stringr)
library(reshape2)
genotype <- fread("../4.SNV_calling/step6_SingleCellAlleles/Tumor.single_cell_genotype.tsv")
colnames(genotype)[1] <- "chr"
genotype1 <- genotype %>% dplyr::filter(Num_reads >= 3 & grepl("^chr", chr) & genotype$ALT_expected != ".")
genotype1 <- genotype1 %>%
dplyr::mutate(genotype = ifelse(
str_detect(ALT_expected, "\\|") &
str_detect(Base_observed, ALT_expected) &
(Cell_type_observed %in% Cell_type_expected),
1,
ifelse(
Base_observed == str_extract(ALT_expected, "^[^,]+") &
(Cell_type_observed %in% Cell_type_expected),
1,
0
)
))
genotype1$pos <- paste0(genotype1$chr, ":", genotype1$Start, ":", genotype1$REF, ">", str_extract(genotype1$ALT_expected, "^[^,]+"))
# Filter cells with callable sites less than 2000
callable.site <- fread("../4.SNV_calling/step5_UniqueCellCallableSites/example.Tumor.SitesPerCell.tsv")
callable.site <- callable.site[callable.site$SitesPerCell > 10000,] ## Require cells with at least 10,000 callable sites
genotype1 <- genotype1[genotype1$CB %in% callable.site$CB,]
genotype1 <- genotype1 %>% group_by(pos) %>% dplyr::filter(n() >= 20) %>% ungroup() # Maintain SNVs in at least 20 tumor cells
snv <- dcast(genotype1[,c("CB", "pos", "genotype")], pos ~ CB, fill = 0, drop = FALSE)
snv <- column_to_rownames(snv, var = colnames(snv)[1])
snv[snv > 1] <- 1
# saveRDS(snv, "snv.mat.raw.rds")
#################################################################################
# Generate matrix for SNV denoising
x <- readRDS("snv.mat.raw.rds")
x <- x[!grepl("chrM", rownames(x)),] # Remove SNVs in chrM
saveRDS(x, "snv.mat.rds")
write.table(t(x), "snv.CBMinput.txt", sep = "\t", quote = F, row.names = F, col.names = F)
snv[1:5,1:5]#> AAACTCGCATTCTTTG AAACTCGTCCATTGTT AAACTCGTCGTTGTTT
#> chr1:634045:T>G 1 0 0
#> chr16:46390534:G>A 0 0 0
#> chr17:22521407:G>A 0 1 0
#> chr17:22521415:T>C 0 0 0
#> chr17:22521426:T>C 0 0 0
#> AAACTGCAGAGTCCGA AAACTGCCAGGGAGTT
#> chr1:634045:T>G 0 0
#> chr16:46390534:G>A 0 0
#> chr17:22521407:G>A 0 0
#> chr17:22521415:T>C 0 0
#> chr17:22521426:T>C 0 0