From 53b52ea70ac60ec11d6faeb5c3a503a11f0489b5 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Fri, 7 Jun 2019 17:22:12 +0200 Subject: [PATCH] Added bam2bigwig.sh to scripts. The script is taken from bioinfo_utils. --- scripts/bam2bigwig.sh | 170 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 2 files changed, 171 insertions(+), 1 deletion(-) create mode 100755 scripts/bam2bigwig.sh diff --git a/scripts/bam2bigwig.sh b/scripts/bam2bigwig.sh new file mode 100755 index 0000000..8a96ad5 --- /dev/null +++ b/scripts/bam2bigwig.sh @@ -0,0 +1,170 @@ +#!/bin/dash +# Usage: bam2bigwig.sh <sorted.bam> <bin_file> <library_name> <orientation> <strandedness> [<normalization>] <bigwig> +# <bin_file> should be a bed file representing 10 bp bins along the genome +# <orientation> must be "all", "fwd" or "rev" +# <strandedness> must be "F" or "R" +# It can be "U" (see http://sailfish.readthedocs.io/en/master/library_type.html), +# but anything else than "R" will be treated as "F". +# <normalization> can be either a file or a number +# If it is a file, it should contain at least two columns, +# the first one with library names and the second with size factors. +# If it is a value, it shoud be the size factor. +# Requires: +# * bedops (for bedmap) +# * python3 -> module load Python/3.6.0 +# * samtools -> module load samtools/1.9 +# * bedtools -> module load bedtools/2.25.0 +# * mawk -> use alias if absent +# * parallel (for niceload) -> module load parallel/20170122 +# * UCSC-tools (for bedGraphToBigWig) -> module load UCSC-tools/v373 + +workdir=$(mktemp -d) + +cleanup() +{ + rm -rf ${workdir} +} + +# http://linuxcommand.org/wss0150.php +PROGNAME=$(basename $0) + +error_exit () +{ +# ---------------------------------------------------------------- +# Function for exit due to fatal program error +# Accepts 1 argument: +# string containing descriptive error message +# ---------------------------------------------------------------- + cleanup + echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2 + exit 1 +} + +mawk --version 1> /dev/null 2> /dev/null || alias mawk='awk' + +bam=${1} +bin_file=${2} +# Example of how to generate the bin_file from iGenome data: +# cd /Genomes/C_elegans +# mawk '$1 == "@SQ" {print $2"\t0\t"$3}' Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.dict \ +# | sed 's/SN://g' | sed 's/LN://' \ +# | bedops --chop 10 - \ +# > Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed +lib_name=${3} +orient=${4} +# "F" is library type is normally stranded, "R" if it is reverse-stranded +strandedness=${5} + + +if [ $# = 7 ] +then + norm=${6} + bigwig=${7} +else + norm="" + bigwig=${6} +fi + + +bam_name=$(basename ${bam}) +mapping_name=${bam_name%_sorted.bam} +mapping_dir=$(dirname ${bam}) + +case ${orient} in + "all") + orient_filter="" + ;; + "fwd") + if [ "${strandedness}" = "R" ] + then + orient_filter="-strand -" + else + orient_filter="-strand +" + fi + ;; + "rev") + if [ "${strandedness}" = "R" ] + then + orient_filter="-strand +" + else + orient_filter="-strand -" + fi + ;; + *) + error_exit "unrecognised orientation ${orient}" + ;; +esac + +bigwig_dir=$(dirname ${bigwig}) +echo "working in ${workdir}" +mkdir -p ${bigwig_dir} + +if [ "${norm}" ] +then + if [ -f ${norm} ] + then + echo "reading scaling factor for ${lib_name} in ${norm}" + size=$(grep -w "^${lib_name}" ${norm} | cut -f2) + if [ ! ${size} ] + then + error_exit "size could not be found on column 2, line ${lib_name} of ${norm}" + #cleanup && error_exit "size could not be found on column 2, line ${lib_name} of ${norm}" + else + echo "found ${size} for ${lib_name}" + fi + else + # Not a file + echo "assuming ${norm} is the size factor" + size="${norm}" + fi + scale=$(python3 -c "print(1.0 / ${size})") + scaling="-scale ${scale}" + bedgraph="${workdir}/${mapping_name}_norm_${orient}_genomecov.bg" +else + scaling="" + bedgraph="${workdir}/${mapping_name}_${orient}_genomecov.bg" +fi + +echo "getting chromosome bounds" +genome_file="${workdir}/chrom_sizes.txt" +samtools view -H ${bam} \ + | mawk '$1 == "@SQ" {print $2"\t"$3}' \ + | sed 's/SN://g' | sed 's/LN://' \ + > ${genome_file} || error_exit "making ${genome_file} failed" + #> ${genome_file} || cleanup && error_exit "making ${genome_file} failed" + +compute_coverage() +{ + cmd="niceload --noswap bedtools genomecov -bg -split ${orient_filter} ${scaling} -ibam ${bam}" + eval ${cmd} \ + | mawk '{ print $1"\t"$2"\t"$3"\tid-"NR"\t"$4; }' | sort-bed - \ + || error_exit "compute_coverage failed" + #|| cleanup && error_exit "compute_coverage failed" +} + +# TODO: find how to run this with niceload (there may be an issue with the quoting) +make_bins() +{ + cmd="bedmap --faster --echo --mean --delim \"\t\" --skip-unmapped ${bin_file} -" + eval ${cmd} || error_exit "make_bins failed" + #eval ${cmd} || cleanup && error_exit "make_bins failed" +} + +compute_coverage | make_bins \ + > ${bedgraph} || error_exit "generating bedgraph failed" + #> ${bedgraph} || cleanup && error_exit "generating bedgraph failed" + +echo "making bigwig" +niceload --noswap bedGraphToBigWig ${bedgraph} ${genome_file} ${bigwig} || error_exit "bedGraphToBigWig failed" +#bedGraphToBigWig ${bedgraph} ${genome_file} ${bigwig} || cleanup && error_exit "bedGraphToBigWig failed" + +echo "removing ${bedgraph}" +rm -f ${bedgraph} + +echo "removing ${genome_file}" +rm -f ${genome_file} + +cleanup +# rm -rf ${workdir} + +exit 0 diff --git a/setup.py b/setup.py index d66bee8..5e771f4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ setup( author_email="blaise.li@normalesup.org", license="MIT", packages=find_packages(), - scripts=["scripts/sam2indexedbam.sh"], + scripts=["scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh"], install_requires=[ #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git@744dd79b579577cb6e131653260d7990946be3ad#egg=libworkflows-0.1", #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git#egg=libworkflows-0.1", -- GitLab