Skip to content
Snippets Groups Projects
Commit 9dd243ee authored by Blaise Li's avatar Blaise Li
Browse files

Updates in iCLIP, with different mapping params.

Read size classes have been assigned bowtie2 mapping settings based on
the results of mapping benchmarks.
parent 3b3ae2b0
No related branches found
No related tags found
No related merge requests found
...@@ -23,8 +23,10 @@ if major < 3 or (major == 3 and minor < 6): ...@@ -23,8 +23,10 @@ if major < 3 or (major == 3 and minor < 6):
import os import os
OPJ = os.path.join OPJ = os.path.join
from distutils.util import strtobool
from glob import glob from glob import glob
from subprocess import CalledProcessError from subprocess import CalledProcessError
from sys import stderr
from yaml import safe_load as yload from yaml import safe_load as yload
from collections import defaultdict from collections import defaultdict
...@@ -42,6 +44,7 @@ mpl.rcParams["font.family"] = "sans-serif" ...@@ -42,6 +44,7 @@ mpl.rcParams["font.family"] = "sans-serif"
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from idconvert import gene_ids_data_dir
from libhts import make_empty_bigwig, median_ratio_to_pseudo_ref_size_factors, plot_histo from libhts import make_empty_bigwig, median_ratio_to_pseudo_ref_size_factors, plot_histo
from libworkflows import get_chrom_sizes, cleanup_and_backup from libworkflows import get_chrom_sizes, cleanup_and_backup
from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context from libworkflows import last_lines, ensure_relative, SHELL_FUNCTIONS, warn_context
...@@ -59,7 +62,8 @@ aligner = config["aligner"] ...@@ -59,7 +62,8 @@ aligner = config["aligner"]
######################## ########################
# config["genome_dict"] can be either the path to a genome configuration file # config["genome_dict"] can be either the path to a genome configuration file
# or a dict # or a dict
if isinstance(config["genome_dict"], str): if isinstance(config["genome_dict"], (str, bytes)):
print(f"loading {config['genome_dict']}", file=stderr)
with open(config["genome_dict"]) as fh: with open(config["genome_dict"]) as fh:
genome_dict = yload(fh) genome_dict = yload(fh)
else: else:
...@@ -73,7 +77,7 @@ genome_db = genome_dict["db"][aligner] ...@@ -73,7 +77,7 @@ genome_db = genome_dict["db"][aligner]
genome_binned = genome_dict["binned"] genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"] annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"] # TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"] convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
gene_lists_dir = genome_dict["gene_lists_dir"] gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt"))) avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
...@@ -81,6 +85,8 @@ merged_fastq = config["merged_fastq"] ...@@ -81,6 +85,8 @@ merged_fastq = config["merged_fastq"]
barcode_dict = config["barcode_dict"] barcode_dict = config["barcode_dict"]
BARCODES = list(barcode_dict.keys()) BARCODES = list(barcode_dict.keys())
MAX_DIFF = config["max_diff"] MAX_DIFF = config["max_diff"]
upload_on_err = strtobool(str(config.get("upload_on_err", "True")))
#output_dir = config["output_dir"] #output_dir = config["output_dir"]
#workdir: config["output_dir"] #workdir: config["output_dir"]
output_dir = os.path.abspath(".") output_dir = os.path.abspath(".")
...@@ -116,14 +122,27 @@ SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in prod ...@@ -116,14 +122,27 @@ SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in prod
# TODO: Have different settings for different size ranges # TODO: Have different settings for different size ranges
# recommended k-mer length for D. melanogaster is 20 # recommended k-mer length for D. melanogaster is 20
# However, reads shorter thant the k-mer length will be ignored. # However, reads shorter than the k-mer length will be ignored.
# http://crac.gforge.inria.fr/documentation/crac/#sec-2 # http://crac.gforge.inria.fr/documentation/crac/#sec-2
alignment_settings = { alignment_settings = {
#"bowtie2": "-L 6 -i S,1,0.8 -N 0", #"bowtie2": "-L 6 -i S,1,0.8 -N 0",
"bowtie2": "-L 6 -i S,1,0.8 -N 1", "bowtie2": {
"12-18": "--end-to-end -L 12 -i S,1,0.5 -N 1",
"21-24": "--local -L 14 -i S,1,0.5 -N 1",
"26-40": "--local -L 15 -i S,1,0.5 -N 1",
"48-52": "--local -L 16 -i S,1,0.5 -N 1",
"default": "-L 6 -i S,1,0.8 -N 0",
},
# Small RNA-seq parameters may not be compatible with --local # Small RNA-seq parameters may not be compatible with --local
#"bowtie2": "--local -L 6 -i S,1,0.8 -N 0", #"bowtie2": "--local -L 6 -i S,1,0.8 -N 0",
"crac": "-k 20 --stranded --use-x-in-cigar"} "crac": {
"12-18": "-k 20 --stranded --use-x-in-cigar",
"21-24": "-k 20 --stranded --use-x-in-cigar",
"26-40": "-k 20 --stranded --use-x-in-cigar",
"48-52": "-k 20 --stranded --use-x-in-cigar",
"default": "-k 20 --stranded --use-x-in-cigar",
}
}
# Lower stringency settings, to remap the unmapped # Lower stringency settings, to remap the unmapped
realignment_settings = { realignment_settings = {
# Try with almost-default settings # Try with almost-default settings
...@@ -420,8 +439,20 @@ rule plot_size_distribution: ...@@ -420,8 +439,20 @@ rule plot_size_distribution:
plot_histo(output[0], data, title) plot_histo(output[0], data, title)
#WITH_ADAPT = ["adapt_deduped", "adapt_nodedup"]
#POST_TRIMMING = ["noadapt_deduped"] + WITH_ADAPT
#SIZE_RANGES = ["12-18", "21-24", "26-40", "48-52"]
#SIZE_SELECTED = [f"{read_type}_{size_range}" for (read_type, size_range) in product(WITH_ADAPT, SIZE_RANGES)]
size_range_re = re.compile(".*_(\d+-\d+)")
def set_alignment_settings(wildcards): def set_alignment_settings(wildcards):
return alignment_settings[aligner] # Determine whether there is a size range, extract it or use a default one for the alignment settings
# size_range = wildcards.read_type.split("_")[-1]
m = size_range_re.match(wildcards.read_type)
if m:
(size_range,) = m.groups()
else:
size_range = "default"
return alignment_settings[aligner][size_range]
def set_realignment_settings(wildcards): def set_realignment_settings(wildcards):
...@@ -563,8 +594,8 @@ rule feature_count_reads: ...@@ -563,8 +594,8 @@ rule feature_count_reads:
message: message:
"Counting {wildcards.orientation} {wildcards.biotype} {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep} with featureCounts." "Counting {wildcards.orientation} {wildcards.biotype} {wildcards.read_type} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
log: log:
log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.log"), log = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.log"),
err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}.err") err = OPJ(log_dir, "{trimmer}", "feature_count_reads", "{lib}_{rep}_{read_type}_{biotype}_{orientation}.err")
shell: shell:
""" """
# converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle" # converter="/pasteur/entites/Mhe/Genomes/C_elegans/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes_id2name.pickle"
...@@ -756,6 +787,7 @@ onsuccess: ...@@ -756,6 +787,7 @@ onsuccess:
onerror: onerror:
shell(f"rm -rf {output_dir}_err") shell(f"rm -rf {output_dir}_err")
shell(f"cp -rp {output_dir} {output_dir}_err") shell(f"cp -rp {output_dir} {output_dir}_err")
cleanup_and_backup(output_dir + "_err", config) if upload_on_err:
cleanup_and_backup(output_dir + "_err", config)
print("iCLIP data analysis failed.") print("iCLIP data analysis failed.")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment