3.SNV calling • MAAS

We used SComatic to obtain SNV profile of tumor cells from scATAC-seq data. For more details, please refer to https://github.com/cortes-ciriano-lab/SComatic.

# Create a directory for SNV calling in the terminal
mkdir -p 3.SNV_calling

# Step1: split bam files for different cell types
utput_dir1=step1_BamCellTypes
mkdir -p $output_dir1

python ~/App/SComatic-main/scripts/SplitBam/SplitBamCellTypes.py \
        --bam /Path/to/bam/file \
        --meta ../1.Cellanno/HCC.celltype.txt\
        --id example \
        --min_MQ 30 \
        --outdir $output_dir1

################################################################################

# Step2: summarize base count
REF=/Path/to/refdata-cellranger-arc-GRCh38-2020-A-2.0.0/fasta/genome.fa
  
output_dir2=step2_BaseCellCounts
mkdir -p $output_dir2

for bam in $(ls -d ./step1_BamCellTypes/*bam);do
        # Cell type
        cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')

        # Temp folder
        temp=$output_dir2/temp_${cell_type}
        mkdir -p $temp

        # Command line to submit to cluster
        python ~/App/SComatic-main/scripts/BaseCellCounter/BaseCellCounter.py --bam $bam \
                --ref $REF \
                --chrom all \
                --out_folder $output_dir2 \
                --bed /Path/to/ATAC/bed/file \
                --min_bq 30 \
                --min_mq 30 \
                --tmp_dir $temp \
                --nprocs 30

        rm -rf $temp
done

################################################################################

# Step3: merge counts
output_dir3=step3_BaseCellCountsMerged
mkdir -p $output_dir3

python ~/App/SComatic-main/scripts/MergeCounts/MergeBaseCellCounts.py --tsv_folder ./step2_BaseCellCounts --outfile ${output_dir3}/BaseCellCounts.AllCellTypes.tsv

################################################################################

# Step4: Mutation detection
# Step 4.1
output_dir4=/step4_VariantCalling
mkdir -p $output_dir4

sample=example

python ~/App/SComatic-main/scripts/BaseCellCalling/BaseCellCalling.step1.py \
                  --infile step3_BaseCellCountsMerged/BaseCellCounts.AllCellTypes.tsv \
                  --outfile ${output_dir4}/${sample} \
                  --ref $REF

# Step 4.2
PON=/Path/to/PoN.scATACseq.hg38.tsv

python ~/App/SComatic-main/scripts/BaseCellCalling/BaseCellCalling.step2.py \
                  --infile ${output_dir4}/${sample}.calling.step1.tsv \
                  --outfile ${output_dir4}/${sample} \
                  --pon $PON

################################################################################

# Step5: site calling
STEP4_1=/step4_VariantCalling/${sample}.calling.step1.tsv

output_dir5=step5_UniqueCellCallableSites
mkdir -p $output_dir5

for bam in $(ls -d ./step1_BamCellTypes/*bam);do
            cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')
            echo $cell_type

            temp=$output_dir5/temp_${cell_type}
            mkdir -p $temp

            python ~/App/SComatic-main/scripts/SitesPerCell/SitesPerCell.py --bam $bam    \
                    --infile ./step4_VariantCalling/${sample}.calling.step1.tsv   \
                    --min_mq 30 \
                    --ref $REF \
                    --out_folder $output_dir5 \
                    --tmp_dir $temp \
                    --nprocs 60
            echo
done

################################################################################

# Step6: get genotypes

output_dir6=step6_SingleCellAlleles
mkdir -p $output_dir6

for bam in $(ls -d ./step1_BamCellTypes/*bam);do

        cell_type=$(basename $bam | awk -F'.' '{print $(NF-1)}')

        temp=$output_dir6/temp_${cell_type}
        mkdir -p $temp

        python ~/App/SComatic-main/scripts/SingleCellGenotype/SingleCellGenotype.py --bam $bam  \
                --infile ${STEP4_2_pass}   \
                --nprocs 30   \
                --meta $META   \
                --outfile ${output_dir6}/${cell_type}.single_cell_genotype.tsv  \
                --tmp_dir $temp  \
                --min_mq 30 \
                --ref $REF

        rm -rf $temp
done

After running the pipeling of SComatic, we can generate a cell-by-SNV matrix for MAAS integration as follows. This step will keep cells with favorable callable sites and remove low-quality mutations.

rm(list = ls())
library(foreach)
library(tibble)
library(data.table)
library(dplyr)
library(stringr)
library(reshape2)

genotype <- fread("../4.SNV_calling/step6_SingleCellAlleles/Tumor.single_cell_genotype.tsv")
colnames(genotype)[1] <- "chr"
genotype1 <- genotype %>% dplyr::filter(Num_reads >= 3 & grepl("^chr", chr) & genotype$ALT_expected != ".")


genotype1 <- genotype1 %>%
  dplyr::mutate(genotype = ifelse(
    str_detect(ALT_expected, "\\|") & 
      str_detect(Base_observed, ALT_expected) & 
      (Cell_type_observed %in% Cell_type_expected), 
    1, 
    ifelse(
      Base_observed == str_extract(ALT_expected, "^[^,]+") & 
        (Cell_type_observed %in% Cell_type_expected), 
      1, 
      0
    )
  ))
genotype1$pos <- paste0(genotype1$chr, ":", genotype1$Start, ":", genotype1$REF, ">", str_extract(genotype1$ALT_expected, "^[^,]+"))

# Filter cells with callable sites less than 2000
callable.site <- fread("../4.SNV_calling/step5_UniqueCellCallableSites/example.Tumor.SitesPerCell.tsv")
callable.site <- callable.site[callable.site$SitesPerCell > 10000,] ## Require cells with at least 10,000 callable sites
genotype1 <- genotype1[genotype1$CB %in% callable.site$CB,]
genotype1 <- genotype1 %>% group_by(pos) %>% dplyr::filter(n() >= 20) %>% ungroup() # Maintain SNVs in at least 20 tumor cells

snv <- dcast(genotype1[,c("CB", "pos", "genotype")], pos ~ CB, fill = 0, drop = FALSE)
snv <- column_to_rownames(snv, var = colnames(snv)[1])
snv[snv > 1] <- 1
# saveRDS(snv, "snv.mat.raw.rds")

#################################################################################
# Generate matrix for SNV denoising
x <- readRDS("snv.mat.raw.rds")
x <- x[!grepl("chrM", rownames(x)),] # Remove SNVs in chrM
saveRDS(x, "snv.mat.rds")
write.table(t(x), "snv.CBMinput.txt", sep = "\t", quote = F, row.names = F, col.names = F)

snv[1:5,1:5]

#>                    AAACTCGCATTCTTTG AAACTCGTCCATTGTT AAACTCGTCGTTGTTT
#> chr1:634045:T>G                   1                0                0
#> chr16:46390534:G>A                0                0                0
#> chr17:22521407:G>A                0                1                0
#> chr17:22521415:T>C                0                0                0
#> chr17:22521426:T>C                0                0                0
#>                    AAACTGCAGAGTCCGA AAACTGCCAGGGAGTT
#> chr1:634045:T>G                   0                0
#> chr16:46390534:G>A                0                0
#> chr17:22521407:G>A                0                0
#> chr17:22521415:T>C                0                0
#> chr17:22521426:T>C                0                0