#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
"""This script pickles python dictionaries that associate raw data files,
bigwig files and library names."""

import sys

major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least python 3.6\n")

import os
import re
import warnings
from collections import defaultdict
from pathlib import Path
from pickle import HIGHEST_PROTOCOL, dump

from cytoolz import valmap
from snakemake import Workflow
# safe_load refuses arbitrary YAML tags and, unlike a bare yaml.load,
# still works under PyYAML >= 6.0 (where load() requires a Loader).
from yaml import safe_load as yload

OPJ = os.path.join
OPR = os.path.realpath


def formatwarning(message, category, filename, lineno, line=None):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


# https://stackoverflow.com/a/22661838/1878788
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning


def dump_dicts(path, dicts):
    """Serially pickle every dictionary in *dicts* into the file *path*."""
    with open(path, "wb") as pickle_file:
        for d in dicts:
            dump(d, pickle_file, HIGHEST_PROTOCOL)


# https://stackoverflow.com/a/1319675/1878788
class TooManyValues(ValueError):
    """A collection unexpectedly held more than one distinct value.

    *value* carries one (arbitrary) element so a caller catching this
    exception can still recover a usable value.
    """

    def __init__(self, message="Too many values.", value=None):
        super().__init__(message)
        self.value = value


def unlist(values):
    """Return the single distinct element of *values*.

    If *values* contains several distinct elements, warn and return an
    arbitrary one (historical best-effort behaviour; a stricter variant
    would raise TooManyValues instead).
    """
    try:
        (elem,) = set(values)
    except ValueError:
        warnings.warn(f"There are more than one element in {values}.\n")
        warnings.warn("Selecting an arbitrary one.\n")
        (elem, *_) = set(values)
    return elem


# The four library types for which associations are recorded.
LIBTYPE_KEYS = ("RNA-seq", "sRNA-seq", "sRNA-IP-seq", "GRO-seq")


def _empty_libtype_dicts():
    """Return one defaultdict(list) per known library type.

    Lists are used so that multiple registrations for the same key can be
    detected later (see unlist) instead of silently overwriting values.
    """
    return {libtype: defaultdict(list) for libtype in LIBTYPE_KEYS}


def _load_yaml(path):
    """Load a YAML file, making sure the file handle is closed."""
    with open(path, "r") as yaml_file:
        return yload(yaml_file)


def main():
    """Build and pickle the file-association dictionaries.

    Usage: make_file_dicts.py CONFIG_YAML

    The configuration provides, per library type and per analysis, a
    snakemake configuration, a folder of results, and the mapping from
    libraries to raw data files.  For each (condition, replicate) pair,
    the corresponding bigwig file path is reconstructed from the output
    pattern of the bigwig-producing rule of the analysis' snakefile.
    """
    config = _load_yaml(sys.argv[1])
    data_dir = config["ref"]["paper"]
    data_info = config["data"]
    libtypes = list(data_info.keys())
    # Per-libtype snakefile path, bigwig rule name and default wildcards
    # live in a YAML file next to this script.
    libtype_info = _load_yaml(
        OPJ(os.path.dirname(OPR(sys.argv[0])), "libtype_info.yaml"))
    unknown = set(libtypes) - set(libtype_info.keys())
    if unknown:
        raise ValueError(f"Unknown library types in config: {unknown}")
    rawdat2lib = _empty_libtype_dicts()      # {rawdat: [library,]}
    lib2rawdat = _empty_libtype_dicts()      # {library: [rawdat,]}
    rawdat2condrep = _empty_libtype_dicts()  # {rawdat: [(cond, rep),]}
    condrep2rawdat = _empty_libtype_dicts()  # {(cond, rep): [rawdat,]}
    condrep2bw = _empty_libtype_dicts()      # {(cond, rep): [bw,]}
    bw2condrep = _empty_libtype_dicts()      # {bw: [(cond, rep),]}
    vars_from_analyses = set()
    for (libtype, analyses) in data_info.items():
        # snakefile for this libtype
        analysis_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            ###
            # determine pattern of the bigwig file path
            ###
            from_folder = OPR(analysis_info["from_folder"])
            # configuration for this analysis
            analysis_config = _load_yaml(analysis_info["config"])
            # Loading a snakefile may inject names into this scope; make
            # sure the analysis configuration cannot clobber our locals.
            # (Explicit raise instead of assert: asserts vanish under -O.)
            common_vars = (set(analysis_config.keys())
                           & (set(dir()) - vars_from_analyses))
            if common_vars:
                raise RuntimeError(
                    f"Variable overwriting hazard!\n{common_vars}")
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with configurable genome;
            # default genome name will be "C_elegans".
            genome = analysis_config.get(
                "genome_dict", {"name": "C_elegans"})["name"]
            # Load the snakefile to get information about the rule making
            # bigwig files (based on snakemake/__init__.py and workflow.py).
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            sf.include(analysis_snakefile, overwrite_first_rule=True)
            sf.check()
            # Unpacking fails loudly if the rule does not have exactly
            # one bigwig file in its output.
            [bw_pattern] = sf._rules[bw_rulename].output
            # Strip snakemake wildcard constraints: "{wc,regex}" -> "{wc}".
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            for (library, rawdat) in analysis_info["libraries"].items():
                if not Path(rawdat).exists():
                    raise FileNotFoundError(rawdat)
                rawdat = OPR(rawdat)
                rawdat2lib[libtype][rawdat].append(library)
                lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated to a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    if not Path(rawdat).exists():
                        raise FileNotFoundError(rawdat)
                    rawdat = OPR(rawdat)
                    rawdat2condrep[libtype][rawdat].append((cond, rep))
                    condrep2rawdat[libtype][(cond, rep)].append(rawdat)
                    # Two-stage formatting: first the library wildcards,
                    # then the genome (default wildcards may contain
                    # "{genome}" placeholders, e.g. "on_{genome}").
                    bw = OPJ(from_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    if not Path(bw).exists():
                        # Print debugging context before failing.
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise FileNotFoundError(bw)
                    condrep2bw[libtype][(cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((cond, rep))
        for (k, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(k)
                print(v)
                # Bug fix: the original tested "k in rawdat2lib", comparing
                # a raw-data path against the libtype-name keys of the outer
                # dict, which could never match; the per-libtype mapping was
                # intended.
                if k in rawdat2lib[libtype]:
                    warnings.warn(
                        f"Multiple {libtype} libraries for a submitted one.")
                else:
                    warnings.warn(
                        f"Not sure we should worry about the above multiple "
                        f"{libtype} libraries: They do not seem to match "
                        f"submitted raw data files.")
    ##
    # Transform lists into unique values
    ##
    # unlist warns and picks an arbitrary element when a list has more
    # than one distinct value.
    for libtype_dict in (rawdat2lib, lib2rawdat, rawdat2condrep,
                         condrep2rawdat, condrep2bw, bw2condrep):
        for k in libtype_dict:
            libtype_dict[k] = valmap(unlist, libtype_dict[k])
    os.makedirs(data_dir, exist_ok=True)
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat, rawdat2condrep,
         condrep2rawdat, condrep2bw, bw2condrep])
    return 0


if __name__ == "__main__":
    sys.exit(main())