From 53b52ea70ac60ec11d6faeb5c3a503a11f0489b5 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 7 Jun 2019 17:22:12 +0200
Subject: [PATCH] Added bam2bigwig.sh to scripts.

The script is taken from bioinfo_utils.
---
 scripts/bam2bigwig.sh | 170 ++++++++++++++++++++++++++++++++++++++++++
 setup.py              |   2 +-
 2 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100755 scripts/bam2bigwig.sh

diff --git a/scripts/bam2bigwig.sh b/scripts/bam2bigwig.sh
new file mode 100755
index 0000000..8a96ad5
--- /dev/null
+++ b/scripts/bam2bigwig.sh
@@ -0,0 +1,170 @@
+#!/bin/dash
+# Usage: bam2bigwig.sh <sorted.bam> <bin_file> <library_name> <orientation> <strandedness> [<normalization>] <bigwig>
+# <bin_file> should be a bed file representing 10 bp bins along the genome
+# <orientation> must be "all", "fwd" or "rev"
+# <strandedness> must be "F" or "R"
+# It can be "U" (see http://sailfish.readthedocs.io/en/master/library_type.html),
+# but anything else than "R" will be treated as "F".
+# <normalization> can be either a file or a number
+# If it is a file, it should contain at least two columns,
+# the first one with library names and the second with size factors.
+# If it is a value, it shoud be the size factor.
+# Requires:
+# * bedops (for bedmap)
+# * python3 -> module load Python/3.6.0
+# * samtools -> module load samtools/1.9
+# * bedtools -> module load bedtools/2.25.0
+# * mawk -> use alias if absent
+# * parallel (for niceload) -> module load parallel/20170122
+# * UCSC-tools (for bedGraphToBigWig) -> module load UCSC-tools/v373
+
+workdir=$(mktemp -d)
+
+cleanup()
+{
+    rm -rf ${workdir}
+}
+
+# http://linuxcommand.org/wss0150.php
+PROGNAME=$(basename $0)
+
+error_exit ()
+{
+#	----------------------------------------------------------------
+#	Function for exit due to fatal program error
+#		Accepts 1 argument:
+#			string containing descriptive error message
+#	----------------------------------------------------------------
+    cleanup
+    echo "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
+    exit 1
+}
+
+mawk --version 1> /dev/null 2> /dev/null || alias mawk='awk'
+
+bam=${1}
+bin_file=${2}
+# Example of how to generate the bin_file from iGenome data:
+#    cd /Genomes/C_elegans
+#    mawk '$1 == "@SQ" {print $2"\t0\t"$3}' Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.dict \
+#        | sed 's/SN://g' | sed 's/LN://' \
+#        | bedops --chop 10 - \
+#        > Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/genome_binned_10.bed
+lib_name=${3}
+orient=${4}
+# "F" is library type is normally stranded, "R" if it is reverse-stranded
+strandedness=${5}
+
+
+if [ $# = 7 ]
+then
+    norm=${6}
+    bigwig=${7}
+else
+    norm=""
+    bigwig=${6}
+fi
+
+
+bam_name=$(basename ${bam})
+mapping_name=${bam_name%_sorted.bam}
+mapping_dir=$(dirname ${bam})
+
+case ${orient} in
+    "all")
+        orient_filter=""
+        ;;
+    "fwd")
+        if [ "${strandedness}" = "R" ]
+        then
+            orient_filter="-strand -"
+        else
+            orient_filter="-strand +"
+        fi
+        ;;
+    "rev")
+        if [ "${strandedness}" = "R" ]
+        then
+            orient_filter="-strand +"
+        else
+            orient_filter="-strand -"
+        fi
+        ;;
+    *)
+        error_exit "unrecognised orientation ${orient}"
+        ;;
+esac
+
+bigwig_dir=$(dirname ${bigwig})
+echo "working in ${workdir}"
+mkdir -p ${bigwig_dir}
+
+if [ "${norm}" ]
+then
+    if [ -f ${norm} ]
+    then
+        echo "reading scaling factor for ${lib_name} in ${norm}"
+        size=$(grep -w "^${lib_name}" ${norm} | cut -f2)
+        if [ ! ${size} ]
+        then
+            error_exit "size could not be found on column 2, line ${lib_name} of ${norm}"
+            #cleanup && error_exit "size could not be found on column 2, line ${lib_name} of ${norm}"
+        else
+            echo "found ${size} for ${lib_name}"
+        fi
+    else
+        # Not a file
+        echo "assuming ${norm} is the size factor"
+        size="${norm}"
+    fi
+    scale=$(python3 -c "print(1.0 / ${size})")
+    scaling="-scale ${scale}"
+    bedgraph="${workdir}/${mapping_name}_norm_${orient}_genomecov.bg"
+else
+    scaling=""
+    bedgraph="${workdir}/${mapping_name}_${orient}_genomecov.bg"
+fi
+
+echo "getting chromosome bounds"
+genome_file="${workdir}/chrom_sizes.txt"
+samtools view -H ${bam} \
+    | mawk '$1 == "@SQ" {print $2"\t"$3}' \
+    | sed 's/SN://g' | sed 's/LN://' \
+    > ${genome_file} || error_exit "making ${genome_file} failed"
+    #> ${genome_file} || cleanup && error_exit "making ${genome_file} failed"
+
+compute_coverage()
+{
+    cmd="niceload --noswap bedtools genomecov -bg -split ${orient_filter} ${scaling} -ibam ${bam}"
+    eval ${cmd} \
+        | mawk '{ print $1"\t"$2"\t"$3"\tid-"NR"\t"$4; }' | sort-bed - \
+        || error_exit "compute_coverage failed"
+        #|| cleanup && error_exit "compute_coverage failed"
+}
+
+# TODO: find how to run this with niceload (there may be an issue with the quoting)
+make_bins()
+{
+    cmd="bedmap --faster --echo --mean --delim \"\t\" --skip-unmapped ${bin_file} -"
+    eval ${cmd} || error_exit "make_bins failed"
+    #eval ${cmd} || cleanup && error_exit "make_bins failed"
+}
+
+compute_coverage | make_bins \
+    > ${bedgraph} || error_exit "generating bedgraph failed"
+    #> ${bedgraph} || cleanup && error_exit "generating bedgraph failed"
+
+echo "making bigwig"
+niceload --noswap bedGraphToBigWig ${bedgraph} ${genome_file} ${bigwig} || error_exit "bedGraphToBigWig failed"
+#bedGraphToBigWig ${bedgraph} ${genome_file} ${bigwig} || cleanup && error_exit "bedGraphToBigWig failed"
+
+echo "removing ${bedgraph}"
+rm -f ${bedgraph}
+
+echo "removing ${genome_file}"
+rm -f ${genome_file}
+
+cleanup
+# rm -rf ${workdir}
+
+exit 0
diff --git a/setup.py b/setup.py
index d66bee8..5e771f4 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@ setup(
     author_email="blaise.li@normalesup.org",
     license="MIT",
     packages=find_packages(),
-    scripts=["scripts/sam2indexedbam.sh"],
+    scripts=["scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh"],
     install_requires=[
         #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git@744dd79b579577cb6e131653260d7990946be3ad#egg=libworkflows-0.1",
         #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git#egg=libworkflows-0.1",
-- 
GitLab