#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""This script pickles python dictionaries that associate raw data files,
bigwig files and library names."""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")
import os
import warnings
import re
OPJ = os.path.join
OPR = os.path.realpath
def formatwarning(message, category, filename, lineno, line=None):
    """Render a warning as '<filename>:<lineno>: <Category>: <message>\\n'."""
    return f"{filename}:{lineno}: {category.__name__}: {message}\n"
# https://stackoverflow.com/a/22661838/1878788
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning
from glob import glob
from pickle import dump, HIGHEST_PROTOCOL
from pathlib import Path
from yaml import safe_load as yload
from collections import defaultdict
from cytoolz import valmap
from snakemake import Workflow
def default_to_regular(d):
    """Recursively turn (default)dicts into plain dicts.

    See https://stackoverflow.com/a/26496899/1878788
    """
    if not isinstance(d, dict):
        return d
    return {key: default_to_regular(val) for (key, val) in d.items()}
def dump_dicts(path, dicts):
    """Pickle each dictionary of *dicts*, in order, into the file *path*.

    Each dictionary is first converted to a plain dict (recursively)
    so that no defaultdict factory ends up in the pickle.
    """
    with open(path, "wb") as out_fh:
        for one_dict in dicts:
            dump(default_to_regular(one_dict), out_fh, HIGHEST_PROTOCOL)
# https://stackoverflow.com/a/1319675/1878788
class TooManyValues(ValueError):
    """ValueError raised when a single value was expected but several
    were found; carries one of them in ``value``.

    See https://stackoverflow.com/a/1319675/1878788
    """

    def __init__(self, message="Too many values.", value=None):
        super().__init__(message)
        # One of the (too many) values, kept for the caller's inspection.
        self.value = value
# def listdd():
#     return defaultdict(list)
def unlist(l):
    """Return the single unique element of iterable *l*.

    If *l* contains several distinct values, warn and return an
    arbitrary one of them (set iteration order).

    Raises:
        ValueError: if *l* is empty. (The original code fell through to
        a second failing unpack after emitting a misleading "more than
        one element" warning.)
    """
    elems = set(l)
    if not elems:
        raise ValueError("Cannot unlist an empty iterable.\n")
    if len(elems) > 1:
        elems_repr = "\n".join(map(str, l))
        warnings.warn(f"There are more than one element in l:\n{elems_repr}\n")
        warnings.warn("Selecting an arbitrary one.\n")
    # Single construction of the set; unpack the first element.
    (elem, *_) = elems
    return elem
def do_unlist(msg):
    """Build a wrapper around unlist() that first prints *msg* to stderr.

    The wrapper also rejects plain strings (which a bare unlist() would
    silently iterate character by character).
    """
    def unlist_with_msg(seq):
        print(msg, file=sys.stderr)
        if isinstance(seq, str):
            raise ValueError(f"l is a string: {seq}!\n")
        return unlist(seq)
    return unlist_with_msg
def main():
    """Collect, for each library type, the correspondences between raw
    data files, library names, (library, cond, rep) triplets and bigwig
    files, then pickle the resulting dictionaries.

    The YAML configuration file path is taken from sys.argv[1].
    Returns 0 on success; exceptions propagate otherwise.
    """
    config = yload(open(sys.argv[1], "r"))
    ref_info = config["ref"]
    # Destination directory for the pickled dictionaries.
    data_dir = ref_info["paper"]
    # {libtype: {analysis_name: analysis_info}}
    data_info = config["data"]
    LIBTYPES = list(data_info.keys())
    # (A hard-coded libtype_info dict formerly lived here; it is now
    # loaded from a libtype_info.yaml file located next to this script.)
    libtype_info = yload(open(
        OPJ(os.path.dirname(OPR(sys.argv[0])), "libtype_info.yaml"),
        "r"))
    # Every configured library type must be described in the yaml file.
    assert set(LIBTYPES) <= set(libtype_info.keys())
    # TODO: make per-analysis subdicts?
    # We use lists so that we can check that lists have only one element
    # instead of overwriting values.
    # dict of dicts {rawdat: [library,]}
    rawdat2lib = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {library: [rawdat,]}
    lib2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {rawdat: [(library, cond, rep),]}
    rawdat2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {(library, cond, rep): [rawdat,]}
    condrep2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {(library, cond, rep): [bw,]}
    condrep2bw = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {bw: [(library, cond, rep),]}
    bw2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # Keys already seen in previously processed analysis configs; they are
    # excluded from the variable-shadowing check below.
    vars_from_analyses = set()
    # NOTE(review): vars_from_loading_sf appears to be unused.
    vars_from_loading_sf = set()
    # {libtype: [tuples describing non one-to-one mappings]}
    issues = defaultdict(list)
    for (libtype, analyses) in data_info.items():
        # TODO: use snakefile copied in the analysis folder, if present,
        # this one otherwise
        # snakefile for this libtype
        libtype_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            # results folder ("results_folder" preferred, "from_folder"
            # accepted as an alternative key)
            try:
                results_folder = OPR(analysis_info["results_folder"])
            except KeyError:
                results_folder = OPR(analysis_info["from_folder"])
            ###
            # determine pattern of the bigwig file path
            ###
            # Can we unambiguously get the configuration file
            # that has been used for this analysis?
            try:
                [analysis_config_file] = glob(OPJ(results_folder, "*.yaml"))
            except ValueError:
                warnings.warn(
                    "Either no or more than one .yaml file "
                    f"found in {results_folder}\n"
                    "Looking for one explicitly set.\n")
                analysis_config_file = analysis_info["config"]
            analysis_config = yload(open(analysis_config_file, "r"))
            # Can we unambiguously get the snakefile
            # that has been used for this analysis?
            try:
                [analysis_snakefile] = glob(OPJ(results_folder, "*.snakefile"))
            except ValueError:
                warnings.warn(
                    "Either no or more than one .snakefile file "
                    f"found in {results_folder}\n"
                    f"Using default {libtype_snakefile}\n")
                analysis_snakefile = libtype_snakefile
            # Fail loudly if a key of this analysis config collides with one
            # of our local names: the snakefile inclusion below may set
            # variables from the config (see the lib2raw note further down).
            common_vars = set(analysis_config.keys()) & (set(dir()) - vars_from_analyses)
            assert not common_vars, f"Variable overwriting hazard!\n{common_vars}"
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with configurable genome
            genome_dict = analysis_config.get(
                "genome_dict",
                # default genome name will be "C_elegans"
                {"name": "C_elegans"})
            # "genome_dict" may also be a path to a yaml file describing it.
            if isinstance(genome_dict, (str, bytes)):
                with open(analysis_config["genome_dict"]) as gendict_fh:
                    genome_dict = yload(gendict_fh)
            genome = genome_dict["name"]
            # Load the snakefile to get information
            # about the rule making bigwig files
            # Based on snakemake/__init__.py and snakemake/workflow.py
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            # TODO: find out other ways to inspect rules from another workflow
            # or how to undefine rules
            # This overwrites a lib2raw dict!!
            # Probably other variables are set, likely from analysis_config
            sf.include(
                analysis_snakefile,
                overwrite_first_rule=True)
            sf.check()
            # Use pattern matching so that it fails if we have not exactly
            # one bigwig file in the output of the rule
            try:
                [bw_pattern] = sf._rules[bw_rulename].output
            except ValueError:
                try:
                    # Sometimes, a rule may also output a "methods" file
                    [bw_pattern, methods_pattern] = sf._rules[bw_rulename].output
                    assert methods_pattern.endswith("_methods.txt")
                except ValueError:
                    print(analysis_snakefile, bw_rulename, file=sys.stderr)
                    raise
            # Strip everything between a comma and the closing brace
            # ("{name,regex}" -> "{name}") — presumably snakemake wildcard
            # constraints — so that str.format can be applied below.
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            raws_wanted = set()
            # If empty, not read as a dict and doesn't have items
            if analysis_info["libraries"]:
                # library: sample name for submission
                for (library, rawdat) in analysis_info["libraries"].items():
                    assert Path(rawdat).exists()
                    # Resolve symlinks so the same raw file compares equal
                    # across analyses.
                    rawdat = OPR(rawdat)
                    raws_wanted.add(rawdat)
                    rawdat2lib[libtype][rawdat].append(library)
                    lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated to a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    assert Path(rawdat).exists()
                    rawdat = OPR(rawdat)
                    # Only keep raw files declared in this analysis'
                    # "libraries" section.
                    if rawdat not in raws_wanted:
                        continue
                    # library (i.e. sumbission sample name)
                    # is used to desambiguate
                    [library, ] = rawdat2lib[libtype][rawdat]
                    # NOTE(review): "if True" looks like the leftover of a
                    # disabled "if rawdat in rawdat2lib" condition.
                    if True:
                        # Record (but tolerate) mappings that are not
                        # one-to-one; they are reported after the loop.
                        if len(rawdat2condrep[libtype][rawdat]):
                            issues[libtype].append(
                                (
                                    analysis,
                                    f"rawdat2condrep[{libtype}][{rawdat}]",
                                    library, cond, rep,
                                    "\n".join(map(str, rawdat2condrep[libtype][rawdat]))))
                        if len(condrep2rawdat[libtype][(library, cond, rep)]):
                            issues[libtype].append(
                                (
                                    analysis,
                                    f"condrep2rawdat[{libtype}][{(library, cond, rep)}]",
                                    rawdat,
                                    "\n".join(map(str, condrep2rawdat[libtype][(library, cond, rep)]))))
                        rawdat2condrep[libtype][rawdat].append((library, cond, rep))
                        condrep2rawdat[libtype][(library, cond, rep)].append(rawdat)
                    # Two-stage formatting: the default wildcards may
                    # themselves contain "{genome}" (e.g. "on_{genome}"),
                    # hence the second .format call.
                    bw = OPJ(results_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    try:
                        assert Path(bw).exists()
                    except AssertionError as err:
                        # Debugging output before propagating the failure.
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise
                    condrep2bw[libtype][(library, cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((library, cond, rep))
        # Report raw files associated with several (library, cond, rep)
        # triplets for this library type.
        for (rawdat, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(rawdat, file=sys.stderr)
                print(v, file=sys.stderr)
                if rawdat in rawdat2lib:
                    warnings.warn(f"Multiple {libtype} libraries for a submitted one.")
                else:
                    warnings.warn(f"Not sure we should worry about the above multiple {libtype} libraries: They do not seem to match submitted raw data files.")
        if issues[libtype]:
            print(f"Some issues for {libtype}:", file=sys.stderr)
            for t in issues[libtype]:
                print(t, file=sys.stderr)
    ##
    # Transform lists into unique values
    ##
    # do_unlist warns (and picks an arbitrary element) when a list has
    # more than one distinct value; the messages identify the dict.
    for (libtype, d) in rawdat2lib.items():
        rawdat2lib[libtype] = valmap(do_unlist(f"vm rawdat2lib[{libtype}]"), d)
    for (libtype, d) in lib2rawdat.items():
        lib2rawdat[libtype] = valmap(do_unlist(f"vm lib2rawdat[{libtype}]"), d)
    for (libtype, d) in rawdat2condrep.items():
        rawdat2condrep[libtype] = valmap(do_unlist(f"vm rawdat2condrep[{libtype}]"), d)
    for (libtype, d) in condrep2rawdat.items():
        condrep2rawdat[libtype] = valmap(do_unlist(f"vm condrep2rawdat[{libtype}]"), d)
    for (libtype, d) in condrep2bw.items():
        condrep2bw[libtype] = valmap(do_unlist(f"vm condrep2bw[{libtype}]"), d)
    for (libtype, d) in bw2condrep.items():
        bw2condrep[libtype] = valmap(do_unlist(f"vm bw2condrep[{libtype}]"), d)
    os.makedirs(data_dir, exist_ok=True)
    # Pickle all six dictionaries sequentially into a single file.
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat,
         rawdat2condrep, condrep2rawdat,
         condrep2bw, bw2condrep])
    return 0
sys.exit(main())