Skip to content
Snippets Groups Projects
build_file_dicts.py 11.74 KiB
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""This script pickles python dictionaries that associate raw data files,
bigwig files and library names."""
import sys


major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")


import os
import warnings
import re

OPJ = os.path.join
OPR = os.path.realpath


def formatwarning(message, category, filename, lineno, line=None):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


# https://stackoverflow.com/a/22661838/1878788
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning

from pickle import dump, HIGHEST_PROTOCOL
from pathlib import Path
from yaml import load as yload
from collections import defaultdict
from cytoolz import valmap
from snakemake import Workflow


def dump_dicts(path, dicts):
    with open(path, "wb") as pickle_file:
        for d in dicts:
            dump(d, pickle_file, HIGHEST_PROTOCOL)


# https://stackoverflow.com/a/1319675/1878788
class TooManyValues(ValueError):
    def __init__(self, message="Too many values.", value=None):
        super().__init__(message)
        self.value = value


def unlist(l):
    try:
        (elem,) = set(l)
    except ValueError as err:
        warnings.warn(f"There are more than one element in {l}.\n")
        warnings.warn("Selecting an arbitrary one.\n")
        (elem, *_) = set(l)
        # raise TooManyValues(value=elem) from err
    return elem


def main():
    config = yload(open(sys.argv[1], "r"))

    ref_info = config["ref"]
    data_dir = ref_info["paper"]

    data_info = config["data"]
    LIBTYPES = list(data_info.keys())

    # libtype_info = {
    #     "RNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "mapped_type": "on_{genome}",
    #             "norm_type": "protein_coding"
    #         }
    #     },
    #     "sRNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "sRNA-IP-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "GRO-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/GRO-seq/GRO-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "trimmer": "cutadapt",
    #             "orientation": "all",
    #             "norm_type": "protein_coding"
    #         }
    #     }
    # }
    libtype_info = yload(open(
        OPJ(os.path.dirname(OPR(sys.argv[0])), "libtype_info.yaml"),
        "r"))
    assert set(LIBTYPES) <= set(libtype_info.keys())

    # We use lists so that we can check that lists have only one element
    # instead of overwriting values.
    # dict of dicts {rawdat: [library,]}
    rawdat2lib = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts {library: [rawdat,]}
    lib2rawdat = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts {rawdat: [(cond, rep),]}
    rawdat2condrep = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts {(cond, rep): [rawdat,]}
    condrep2rawdat = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts {(cond, rep): [bw,]}
    condrep2bw = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts {bw: [(cond, rep),]}
    bw2condrep = {
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }

    vars_from_analyses = set()
    vars_from_loading_sf = set()
    for (libtype, analyses) in data_info.items():
        # snakefile for this libtype
        analysis_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            ###
            # determine pattern of the bigwig file path
            ###
            from_folder = OPR(analysis_info["from_folder"])
            # configuration for this analysis
            analysis_config = yload(open(analysis_info["config"], "r"))
            common_vars = set(analysis_config.keys()) & (set(dir()) - vars_from_analyses)
            assert not common_vars, f"Variable overwriting hazard!\n{common_vars}"
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with configurable genome
            genome = analysis_config.get(
                "genome_dict",
                # default genome name will be "C_elegans"
                {"name": "C_elegans"})["name"]
            # Load the snakefile to get information
            # about the rule making bigwig files
            # Based on snakemake/__init__.py and snakemake/workflow.py
            #before_loading_vars = list(recdir())
            #before_loading_vars = dir()
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            # TODO: find out other ways to inspect rules from another workflow
            # or how to undefine rules
            # This overwrites a lib2raw dict!!
            # Probably other variables are set, likely from analysis_config
            sf.include(
                analysis_snakefile,
                overwrite_first_rule=True)
            sf.check()
            #after_loading_vars = dir()
            #after_loading_vars = list(recdir())
            # Use pattern matching so that it fails if we have not exactly
            # one bigwig file in the output of the rule
            [bw_pattern] = sf._rules[bw_rulename].output
            #for varname in set(after_loading_vars) - set(before_loading_vars):
            #    # https://stackoverflow.com/a/26545111/1878788
            #    del globals()[varname]
            #for varname in dir():
            #    if varname.startswith("__rule"):
            #        print(varname)
            #        del globals()[varname]
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            for (library, rawdat) in analysis_info["libraries"].items():
                assert Path(rawdat).exists()
                # if os.path.islink(rawdat):
                #     #warnings.warn(f"{rawdat} is a link.\n")
                #     rawdat = os.readlink(rawdat)
                rawdat = OPR(rawdat)
                rawdat2lib[libtype][rawdat].append(library)
                lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated to a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     #warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    rawdat = OPR(rawdat)
                    #if rawdat in rawdat2lib:
                    if True:
                        rawdat2condrep[libtype][rawdat].append((cond, rep))
                        condrep2rawdat[libtype][(cond, rep)].append(rawdat)
                    bw = OPJ(from_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    try:
                        assert Path(bw).exists()
                    except AssertionError as err:
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise
                    condrep2bw[libtype][(cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((cond, rep))
        # print(libtype)
        for (k, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(k)
                print(v)
                if k in rawdat2lib:
                    warnings.warn(f"Multiple {libtype} libraries for a submitted one.")
                else:
                    warnings.warn(f"Not sure we should worry about the above multiple {libtype} libraries: They do not seem to match submitted raw data files.")

    ##
    # Transform lists into unique values
    ##
    # for (k, d) in rawdat2lib.items():
    #     try:
    #         rawdat2lib[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"rawdat2lib[{k}]\n{str(err)}")
    #         rawdat2lib[k] = err.value
    # for (k, d) in lib2rawdat.items():
    #     try:
    #         lib2rawdat[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"lib2rawdat[{k}]\n{str(err)}")
    #         lib2rawdat[k] = err.value
    # for (k, d) in rawdat2condrep.items():
    #     try:
    #         rawdat2condrep[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"rawdat2condrep[{k}]\n{str(err)}")
    #         rawdat2condrep[k] = err.value
    # for (k, d) in condrep2rawdat.items():
    #     try:
    #         condrep2rawdat[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"condrep2rawdat[{k}]\n{str(err)}")
    #         condrep2rawdat[k] = err.value
    # for (k, d) in condrep2bw.items():
    #     try:
    #         condrep2bw[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"condrep2bw[{k}]\n{str(err)}")
    #         condrep2bw[k] = err.value
    # for (k, d) in bw2condrep.items():
    #     try:
    #         bw2condrep[k] = valmap(unlist, d)
    #     except TooManyValues as err:
    #         warnings.warn(f"bw2condrep[{k}]\n{str(err)}")
    #         bw2condrep[k] = err.value
    for (k, d) in rawdat2lib.items():
        rawdat2lib[k] = valmap(unlist, d)
    for (k, d) in lib2rawdat.items():
        lib2rawdat[k] = valmap(unlist, d)
    for (k, d) in rawdat2condrep.items():
        rawdat2condrep[k] = valmap(unlist, d)
    for (k, d) in condrep2rawdat.items():
        condrep2rawdat[k] = valmap(unlist, d)
    for (k, d) in condrep2bw.items():
        condrep2bw[k] = valmap(unlist, d)
    for (k, d) in bw2condrep.items():
        bw2condrep[k] = valmap(unlist, d)

    os.makedirs(data_dir, exist_ok=True)
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat,
         rawdat2condrep, condrep2rawdat,
         condrep2bw, bw2condrep])

    return 0

sys.exit(main())