Skip to content
Snippets Groups Projects
Select Git revision
  • 1adbd411338c861673b638216c6853b162c7fbee
  • master default protected
2 results

build_file_dicts.py

Blame
  • build_file_dicts.py 13.40 KiB
    #!/usr/bin/env python3
    # vim: set fileencoding=<utf-8> :
    """This script pickles python dictionaries that associate raw data files,
    bigwig files and library names."""
    import sys
    
    
    major, minor = sys.version_info[:2]
    if major < 3 or (major == 3 and minor < 6):
        sys.exit("Need at least python 3.6\n")
    
    
import os
import warnings
import re

# Short aliases for the two path helpers used throughout this script.
OPJ = os.path.join
OPR = os.path.realpath
    
    
    def formatwarning(message, category, filename, lineno, line=None):
        """Used to format warning messages."""
        return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
    
    
# https://stackoverflow.com/a/22661838/1878788
# Always show UserWarnings (even repeated ones) and render them with the
# compact formatwarning() defined above.
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning
    
    from pickle import dump, HIGHEST_PROTOCOL
    from pathlib import Path
    from yaml import safe_load as yload
    from collections import defaultdict
    from cytoolz import valmap
    from snakemake import Workflow
    
    
    def default_to_regular(d):
        """See https://stackoverflow.com/a/26496899/1878788"""
        #if isinstance(d, defaultdict):
        if isinstance(d, dict):
            d = {k: default_to_regular(v) for k, v in d.items()}
        return d
    
    
    def dump_dicts(path, dicts):
        with open(path, "wb") as pickle_file:
            for d in dicts:
                dump(default_to_regular(d), pickle_file, HIGHEST_PROTOCOL)
    
    
    # https://stackoverflow.com/a/1319675/1878788
    class TooManyValues(ValueError):
        def __init__(self, message="Too many values.", value=None):
            super().__init__(message)
            self.value = value
    
    
    # def listdd():
    #     return defaultdict(list)
    
    
    def unlist(l):
        try:
            (elem,) = set(l)
        except ValueError as err:
            elems_repr = "\n".join(map(str, l))
            warnings.warn(f"There are more than one element in l:\n{elems_repr}\n")
            warnings.warn("Selecting an arbitrary one.\n")
            (elem, *_) = set(l)
            # raise TooManyValues(value=elem) from err
        return elem
    
    def do_unlist(msg):
        def unlist_with_msg(l):
            print(msg, file=sys.stderr)
            if isinstance(l, str):
                raise ValueError(f"l is a string: {l}!\n")
            return unlist(l)
        return unlist_with_msg
    
    
def main() -> int:
    """Build and pickle the file-association dictionaries.

    Usage: build_file_dicts.py <config.yaml>

    Reads the YAML configuration given as first command-line argument,
    loads the Snakemake workflow of each library type to extract the
    output pattern of its bigwig-producing rule, then pickles six
    dictionaries linking raw data files, libraries,
    (library, cond, rep) triplets and bigwig files.
    """
    config = yload(open(sys.argv[1], "r"))

    ref_info = config["ref"]
    # Destination directory for the pickled dictionaries.
    data_dir = ref_info["paper"]

    data_info = config["data"]
    LIBTYPES = list(data_info.keys())

    # libtype_info = {
    #     "RNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "mapped_type": "on_{genome}",
    #             "norm_type": "protein_coding"
    #         }
    #     },
    #     "sRNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "sRNA-IP-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "GRO-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/GRO-seq/GRO-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "trimmer": "cutadapt",
    #             "orientation": "all",
    #             "norm_type": "protein_coding"
    #         }
    #     }
    # }
    # Per-libtype information (snakefile path, bigwig-rule name, default
    # wildcard values) is read from a YAML file next to this script; the
    # commented-out dict above shows the expected structure.
    libtype_info = yload(open(
        OPJ(os.path.dirname(OPR(sys.argv[0])), "libtype_info.yaml"),
        "r"))
    # Every libtype present in the data section must be described.
    assert set(LIBTYPES) <= set(libtype_info.keys())

    # TODO: make per-analysis subdicts?
    # We use lists so that we can check that lists have only one element
    # instead of overwriting values.
    # dict of dicts {rawdat: [library,]}
    rawdat2lib = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {library: [rawdat,]}
    lib2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {rawdat: [(library, cond, rep),]}
    rawdat2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {(library, cond, rep): [rawdat,]}
    condrep2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {(library, cond, rep): [bw,]}
    condrep2bw = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # dict of dicts of dicts {bw: [(library, cond, rep),]}
    bw2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }

    # Track configuration keys already seen, to detect analyses whose
    # configs would shadow local variables of this function.
    vars_from_analyses = set()
    vars_from_loading_sf = set()
    # {libtype: [tuples describing duplicate associations]}
    issues = defaultdict(list)
    for (libtype, analyses) in data_info.items():
        # snakefile for this libtype
        analysis_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            ###
            # determine pattern of the bigwig file path
            ###
            from_folder = OPR(analysis_info["from_folder"])
            # configuration for this analysis
            analysis_config = yload(open(analysis_info["config"], "r"))
            # Guard against an analysis config key colliding with a local
            # variable name of this function (dir() lists the locals).
            common_vars = set(analysis_config.keys()) & (set(dir()) - vars_from_analyses)
            assert not common_vars, f"Variable overwriting hazard!\n{common_vars}"
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with configurable genome
            genome = analysis_config.get(
                "genome_dict",
                # default genome name will be "C_elegans"
                {"name": "C_elegans"})["name"]
            # Load the snakefile to get information
            # about the rule making bigwig files
            # Based on snakemake/__init__.py and snakemake/workflow.py
            #before_loading_vars = list(recdir())
            #before_loading_vars = dir()
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            # TODO: find out other ways to inspect rules from another workflow
            # or how to undefine rules
            # This overwrites a lib2raw dict!!
            # Probably other variables are set, likely from analysis_config
            sf.include(
                analysis_snakefile,
                overwrite_first_rule=True)
            sf.check()
            #after_loading_vars = dir()
            #after_loading_vars = list(recdir())
            # Use pattern matching so that it fails if we have not exactly
            # one bigwig file in the output of the rule
            [bw_pattern] = sf._rules[bw_rulename].output
            #for varname in set(after_loading_vars) - set(before_loading_vars):
            #    # https://stackoverflow.com/a/26545111/1878788
            #    del globals()[varname]
            #for varname in dir():
            #    if varname.startswith("__rule"):
            #        print(varname)
            #        del globals()[varname]
            # Strip wildcard constraints: "{name,regex}" becomes "{name}",
            # so the pattern can be filled with str.format below.
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            raws_wanted = set()
            # If empty, not read as a dict and doesn't have items
            if analysis_info["libraries"]:
                # library: sample name for submission
                for (library, rawdat) in analysis_info["libraries"].items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     #warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    # Normalize so that links and direct paths compare equal.
                    rawdat = OPR(rawdat)
                    raws_wanted.add(rawdat)
                    rawdat2lib[libtype][rawdat].append(library)
                    lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated to a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     #warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    rawdat = OPR(rawdat)
                    # Only keep raw files declared in this analysis'
                    # "libraries" section.
                    if rawdat not in raws_wanted:
                        continue
                    #if rawdat in rawdat2lib:
                    # library (i.e. sumbission sample name)
                    # is used to desambiguate
                    [library, ] = rawdat2lib[libtype][rawdat]
                    # NOTE(review): "if True" looks like a leftover from the
                    # disabled "if rawdat in rawdat2lib" check just above.
                    if True:
                        # Record (but do not fail on) duplicate associations;
                        # they are reported per libtype after the loop.
                        if len(rawdat2condrep[libtype][rawdat]):
                            issues[libtype].append(
                                (
                                    analysis,
                                    f"rawdat2condrep[{libtype}][{rawdat}]",
                                    library, cond, rep,
                                    "\n".join(map(str, rawdat2condrep[libtype][rawdat]))))
                        if len(condrep2rawdat[libtype][(library, cond, rep)]):
                            issues[libtype].append(
                                (
                                    analysis,
                                    f"condrep2rawdat[{libtype}][{(library, cond, rep)}]",
                                    rawdat,
                                    "\n".join(map(str, condrep2rawdat[libtype][(library, cond, rep)]))))
                        rawdat2condrep[libtype][rawdat].append((library, cond, rep))
                        condrep2rawdat[libtype][(library, cond, rep)].append(rawdat)
                    # Two formatting passes: the default wildcards may
                    # themselves contain a "{genome}" placeholder
                    # (e.g. "mapped_type": "on_{genome}").
                    bw = OPJ(from_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    try:
                        assert Path(bw).exists()
                    except AssertionError as err:
                        # Show enough context to debug a wrong pattern.
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise
                    condrep2bw[libtype][(library, cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((library, cond, rep))
        # print(libtype)
        # Report raw files associated with several (library, cond, rep).
        for (rawdat, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(rawdat, file=sys.stderr)
                print(v, file=sys.stderr)
                if rawdat in rawdat2lib:
                    warnings.warn(f"Multiple {libtype} libraries for a submitted one.")
                else:
                    warnings.warn(f"Not sure we should worry about the above multiple {libtype} libraries: They do not seem to match submitted raw data files.")
        if issues[libtype]:
            print(f"Some issues for {libtype}:", file=sys.stderr)
            for t in issues[libtype]:
                print(t, file=sys.stderr)

    ##
    # Transform lists into unique values
    ##
    for (libtype, d) in rawdat2lib.items():
        #rawdat2lib[libtype] = valmap(unlist, d)
        rawdat2lib[libtype] = valmap(do_unlist(f"vm rawdat2lib[{libtype}]"), d)
    for (libtype, d) in lib2rawdat.items():
        #lib2rawdat[libtype] = valmap(unlist, d)
        lib2rawdat[libtype] = valmap(do_unlist(f"vm lib2rawdat[{libtype}]"), d)
    for (libtype, d) in rawdat2condrep.items():
        #rawdat2condrep[libtype] = valmap(unlist, d)
        rawdat2condrep[libtype] = valmap(do_unlist(f"vm rawdat2condrep[{libtype}]"), d)
    for (libtype, d) in condrep2rawdat.items():
        #condrep2rawdat[libtype] = valmap(unlist, d)
        condrep2rawdat[libtype] = valmap(do_unlist(f"vm condrep2rawdat[{libtype}]"), d)
    for (libtype, d) in condrep2bw.items():
        #condrep2bw[libtype] = valmap(unlist, d)
        condrep2bw[libtype] = valmap(do_unlist(f"vm condrep2bw[{libtype}]"), d)
    for (libtype, d) in bw2condrep.items():
        #bw2condrep[libtype] = valmap(unlist, d)
        bw2condrep[libtype] = valmap(do_unlist(f"vm bw2condrep[{libtype}]"), d)

    os.makedirs(data_dir, exist_ok=True)
    # Pickle all six dictionaries, in this fixed order, into one file.
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat,
         rawdat2condrep, condrep2rawdat,
         condrep2bw, bw2condrep])

    return 0
    
    sys.exit(main())