#!/bin/bash # --------------------------------------------------------------------------------- # This file shows steps taken to process Igor Ponomerev's June 2017 ATAC-seq data. # It is a record of what was done, NOT meant to be run as an actual script. # --------------------------------------------------------------------------------- # ============================================================================== # On GsafCbig01 (inital setup and data retrieval) # ============================================================================== mkdir -p /stor/work/GSAF/abattenh/2017_07.igor_atacseq/fq rsync -avrW /mnt/corral/SA17121/Project_JA17277/ fq/ # done cd /stor/work/GSAF/abattenh/2017_07.igor_atacseq/fq fastqc *.gz unzip *.zip rm *.zip # done cd /stor/work/GSAF/abattenh/2017_07.igor_atacseq/fq for f in *.fastq.gz; do echo -e "$((`zcat $f | wc -l` / 4))\t$f" | tee ${f}_stats.txt; done ls *.gz_stats.txt | xargs cat > fq_stats.txt rm *.gz_stats.txt # ============================================================================== # At TACC (stampede) # ============================================================================== # sync Anna's code rsync -avrP --delete abattenh@iyerstor01.icmb.utexas.edu:/home/abattenhouse/sequencing/code/ $WORK/seq/code/ rsync -avrP --delete abattenh@iyerstor01.icmb.utexas.edu:/home/abattenhouse/sequencing/genbrowse/ $WORK/seq/genbrowse/ rsync -avrP --delete --exclude=.git \ abattenh@iyerstor01.icmb.utexas.edu:/home/abattenhouse/gitdir/iyerlab/ $WORK/gitdir/iyerlab/ # access Anna's scripts... export path_code=/work/01063/abattenh/seq/code # xfer data from GsafCbig01 mkdir -p $SCRATCH/proj/igor_atac rsync -avrW abattenh@gsafstor01.ccbb.utexas.edu:/stor/work/GSAF/abattenh/2017_07.igor_atacseq/ \ $SCRATCH/proj/igor_atac/ # ------------------------------------------------------------- # genomic alignment w/bowtie2 local, no adapter trimming # ------------------------------------------------------------- mkdir -p $SCRATCH/proj/igor_atac/mm9/bowtie2 cd $SCRATCH/proj/igor_atac/mm9/bowtie2 ln -s -f ../../fq ls fq/*_R1*gz # fq/50knuclei_S56_L007_R1_001.fastq.gz # fq/5knuclei_S77_L008_R1_001.fastq.gz # mm9.cmds export aln_args='-t -X2000 --no-mixed --no-discordant'; $path_code/script/align/align_bowtie2_illumina.sh local ./fq/50knuclei_S56_L007_R1_001.fastq.gz brain_50k_nuclei mm9 1 export aln_args='-t -X2000 --no-mixed --no-discordant'; $path_code/script/align/align_bowtie2_illumina.sh local ./fq/5knuclei_S77_L008_R1_001.fastq.gz brain_5k_nuclei mm9 1 # done launcher_maker.py -n mm9.cmds -v -w 2 -t 10 -a human_brains sbatch mm9.slurm; showq -u grep 'Total map' *samst* # brain_50k_nuclei.samstats.txt: Total mapped: 18264886 (89.2 %) # brain_5k_nuclei.samstats.txt: Total mapped: 57301216 (77.5 %) # -------------------------------------------- mkdir -p $SCRATCH/proj/igor_atac/mm10/bowtie2 cd $SCRATCH/proj/igor_atac/mm10/bowtie2 ln -s -f ../../fq # mm10.cmds export aln_args='-t -X2000 --no-mixed --no-discordant'; $path_code/script/align/align_bowtie2_illumina.sh local ./fq/50knuclei_S56_L007_R1_001.fastq.gz brain_50k_nuclei mm10 1 export aln_args='-t -X2000 --no-mixed --no-discordant'; $path_code/script/align/align_bowtie2_illumina.sh local ./fq/5knuclei_S77_L008_R1_001.fastq.gz brain_5k_nuclei mm10 1 # done launcher_maker.py -n mm10.cmds -v -w 2 -t 10 -a human_brains sbatch mm10.slurm; showq -u grep 'Total map' *samst* # brain_50k_nuclei.samstats.txt: Total mapped: 18275462 (89.3 %) # brain_5k_nuclei.samstats.txt: Total mapped: 57332234 (77.6 %) # ------------------------------------------------------------- # Transfer to Iyer POD to run MultiQC # ------------------------------------------------------------- rsync -avrW $SCRATCH/proj/igor_atac/ \ abattenh@iyerstor01.ccbb.utexas.edu:/seq/align/2017_07.igor_atacseq/ # ============================================================================== # On Adler3 (iyercomp03) # ============================================================================== # ============================================================= # Post-alignment processing # ============================================================= function mapQ_hist() { local $pfx=$1 mq=${pfx}.mapq.txt tail -n +2 $mq | awk 'BEGIN{ FS="\t";OFS="\t"; n0=0; n10=0; n20=0; n30=0; n40=0; n99=0 } { if ($1 == 0) { n0 = $3 } else if ($1 < 10) { n10 = n10 + $3 } else if ($1 < 20) { n20 = n20 + $3 } else if ($1 < 30) { n30 = n30 + $3 } else if ($1 < 40) { n40 = n40 + $3 } else { n99 = n99 + $3 } } END{ print "q0", n0; print "1-9", n10; print "10-19", n20; print "20-29", n30; print "30-39", n40; print "40+", n99; } ' } cd /seq/align/2017_07.igor_atacseq/mm9/bowtie2 for mq in *.mapq.txt; do pfx=${mq%%.mapq.txt} echo "$mq - $pfx" mapQ_hist $pfx > ${pfx}.nofilt.mapq_histogram.tsv done cd /seq/align/2017_07.igor_atacseq/mm10/bowtie2 for mq in *.mapq.txt; do pfx=${mq%%.mapq.txt} echo "$mq - $pfx" mapQ_hist $pfx > ${pfx}.nofilt.mapq_histogram.tsv done # ---------------------------------------------------- # filter duplicates, low mapQ & unmapped reads # ---------------------------------------------------- mkdir -p /seq/align/2017_07.igor_atacseq/mm9/filtered cd /seq/align/2017_07.igor_atacseq/mm9/filtered ls ../bowtie2/*.sort.dup.bam | xargs ln -s -f -t . for bam in *.sort.dup.bam; do pfx=${bam%%.sort.dup.bam} echo "$bam - $pfx" bam2=${pfx}.nodup.q20.sort.bam samtools view -b -F 0xF04 -f 0x3 -q 20 $bam > $bam2 samtools index $bam2 samtools idxstats $bam2 > ${pfx}.idxstats.txt $path_code/perl/alignmentUtils.pl bamstats --bam-file $bam2 --pfx=$pfx > ${pfx}.samstats.txt done rm -f *.sort.dup.bam # ---------------------------------------------------- mkdir -p /seq/align/2017_07.igor_atacseq/mm10/filtered cd /seq/align/2017_07.igor_atacseq/mm10/filtered ls ../bowtie2/*.sort.dup.bam | xargs ln -s -f -t . for bam in *.sort.dup.bam; do pfx=${bam%%.sort.dup.bam} echo "$bam - $pfx" bam2=${pfx}.nodup.q20.sort.bam samtools view -b -F 0xF04 -f 0x3 -q 20 $bam > $bam2 samtools index $bam2 samtools idxstats $bam2 > ${pfx}.idxstats.txt $path_code/perl/alignmentUtils.pl bamstats --bam-file $bam2 --pfx=$pfx > ${pfx}.samstats.txt done rm -f *.sort.dup.bam # ---------------------------------------------------- # generate genome coverage info # ---------------------------------------------------- # Note: the -fs option (to use insert size instead of read length) is new in bedtools 2.26, # which is currently only installed on the Iyer POD cd /seq/align/2017_07.igor_atacseq/mm10/bowtie2 for bam in *.sort.dup.bam; do pfx=${bam%%.sort.dup.bam} echo "$bam - $pfx" /stor/system/opt/bedtools-2.26.0/bin/bedtools genomecov -ibam $bam -fs > $pfx.genomecov.txt done cd /seq/align/2017_07.igor_atacseq/mm10/filtered for bam in *.nodup.q20.sort.bam; do pfx=${bam%%.nodup.q20.sort.bam} echo "$bam - $pfx" /stor/system/opt/bedtools-2.26.0/bin/bedtools genomecov -ibam $bam -fs > $pfx.genomecov.txt done # ============================================================= # MultiQC support # ============================================================= # ---------------------------------------------------- # Prepare data for custom MultiQC reports # ---------------------------------------------------- cd /seq/align/2017_07.igor_atacseq/mm10/filtered for f in ../bowtie2/*.flagstat.txt; do bn=`basename $f` pfx=${bn%%.flagstat.txt} echo "$f - $pfx" cp -p $f ${pfx}.nofilt.flagstat.txt cat ../bowtie2/${pfx}.dupinfo.txt | sed 's/[.]sort/.nofilt/g' > ${pfx}.nofilt.dupinfo.txt done cd /seq/align/2017_07.igor_atacseq/mm10/filtered for f in *.insertsz.txt; do pfx=${f%%.insertsz.txt} echo "$f - $pfx" tail -n +2 $f | cut -f 1,3 | grep -P -v '^[-]' > ${pfx}.bowtie2_isizes.tsv done cd /seq/align/2017_07.igor_atacseq/mm10/filtered # format is (tab delim): # genome for gc in *.genomecov.txt; do pfx=${gc%%.genomecov.txt} echo "$gc - $pfx" cat $gc | grep genome | perl -e '%ht; while (<>) { @F = split(/\t/, $_); #print "(@F)\n"; $n = $F[1] + 0; if ($n == 0) { $bin = "a_none"; } elsif ($n <= 2) { $bin = "b_1-2"; } elsif ($n <= 10) { $bin = "c_3-10"; } elsif ($n <= 50) { $bin = "d_11-50"; } elsif ($n <= 100) { $bin = "e_51-100"; } else { $bin = "f_100+"; } $ht{$bin} += $F[2]; } foreach(sort(keys(%ht))) { $str = $_; $str =~s/[a-z]_//; print "$str\t$ht{$_}\n"; } ' > ${pfx}.genomecov.tsv done # ---------------------------------------------------- # Run MultiQC with custom configuration file # ---------------------------------------------------- # One-time setup cd /seq/align/2017_07.igor_atacseq ln -s -f ~/gitdir/iyerlab/projects/igor_atacseq/align_2017_07.igor_atacseq.sh ln -s -f ~/gitdir/iyerlab/projects/igor_atacseq/multiqc_config.yaml mkdir -p /seq/align/2017_07.igor_atacseq/for_multiqc # Gather files we want MultiQC to use. # These could be left in their respective directories, but I've seen issues with that. cd /seq/align/2017_07.igor_atacseq rm -f for_multiqc/*.* cp -p mm10/bowtie2/*.nofilt.mapq_histogram.tsv for_multiqc/ cp -p mm10/filtered/*.nofilt.flagstat.txt for_multiqc/ cp -p mm10/filtered/*.nofilt.dupinfo.txt for_multiqc/ cp -p mm10/filtered/*.idxstats.txt for_multiqc/ cp -p mm10/filtered/*.bowtie2_isizes.tsv for_multiqc/ cp -p mm10/filtered/*.genomecov.tsv for_multiqc/ # Generate the report cd /seq/align/2017_07.igor_atacseq rm -rf mqc_*; multiqc . # Note: MultiQC sometimes gets confused if a previous report exists # Also, if the resulting report is named multiqc_* instead of mqc_*, there # was a problem with the multiqc_config.yaml file. Run like this to debug: rm -rf mqc_*; multiqc -c multiqc_config.yaml -v . # Copy artifacts to web-accessible area on Corral scp -p /seq/align/2017_07.igor_atacseq/mqc_report.html \ abattenh@corral.tacc.utexas.edu:/corral-tacc/utexas/iyerlab/www/igor/ scp -p /seq/align/2017_07.igor_atacseq/multiqc_config.yaml \ abattenh@corral.tacc.utexas.edu:/corral-tacc/utexas/iyerlab/www/igor/ scp -p /seq/align/2017_07.igor_atacseq/align_2017_07.igor_atacseq.sh \ abattenh@corral.tacc.utexas.edu:/corral-tacc/utexas/iyerlab/www/igor/README.igor_atacseq.txt scp -p /seq/align/2017_07.igor_atacseq/for_multiqc/brain_5k_nuclei.genomecov.tsv \ /seq/align/2017_07.igor_atacseq/for_multiqc/brain_5k_nuclei.nofilt.mapq_histogram.tsv \ abattenh@corral.tacc.utexas.edu:/corral-tacc/utexas/iyerlab/www/igor/ # To veiw this report, navigate to: http://web.corral.tacc.utexas.edu/iyer/igor/mqc_report.html # Some supporting files are in this directory: http://web.corral.tacc.utexas.edu/iyer/igor/ # ============================================================= # Transfer back to GSAF POD for Dennis... # ============================================================= rsync -avrW --exclude="align_2017_07.igor_atacseq.sh" --exclude="multiqc_config.yaml" \ /seq/align/2017_07.igor_atacseq/ \ abattenh@gsafstor01.ccbb.utexas.edu:/stor/work/GSAF/abattenh/2017_07.igor_atacseq/ scp -p /seq/align/2017_07.igor_atacseq/align_2017_07.igor_atacseq.sh \ /seq/align/2017_07.igor_atacseq/multiqc_config.yaml \ abattenh@gsafstor01.ccbb.utexas.edu:/stor/work/GSAF/abattenh/2017_07.igor_atacseq/