#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
"""This script pickles Python dictionaries that associate raw data files,
bigwig files, and library names, based on a YAML configuration given as its
first command-line argument."""
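
# The configuration read in main() is expected to look roughly like the
# illustrative sketch below (hypothetical paths and names, inferred from the
# way the keys are accessed; real files will differ):
#
#     ref:
#       paper: /path/to/output_dir        # where file_dicts.pickle is written
#     data:
#       RNA-seq:                          # must be a key of libtype_info.yaml
#         some_analysis:
#           from_folder: /path/to/analysis_results
#           config: /path/to/analysis_config.yaml
#           libraries:
#             submitted_sample_name: /path/to/raw_data.fastq.gz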
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
    sys.exit("Need at least Python 3.6\n")
import os
import warnings
import re
OPJ = os.path.join
OPR = os.path.realpath


def formatwarning(message, category, filename, lineno, line=None):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
# https://stackoverflow.com/a/22661838/1878788
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning
from pickle import dump, HIGHEST_PROTOCOL
from pathlib import Path
from yaml import safe_load as yload
from collections import defaultdict
from cytoolz import valmap
from snakemake import Workflow


def default_to_regular(d):
    """Recursively convert a (possibly nested) defaultdict into a regular dict.

    See https://stackoverflow.com/a/26496899/1878788
    """
    # if isinstance(d, defaultdict):
    if isinstance(d, dict):
        d = {k: default_to_regular(v) for k, v in d.items()}
    return d
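
# For illustration (hypothetical values): a nested defaultdict such as
#     d = defaultdict(lambda: defaultdict(list)); d["RNA-seq"]["lib1"].append("raw.fastq.gz")
# comes out of default_to_regular(d) as plain dicts:
#     {"RNA-seq": {"lib1": ["raw.fastq.gz"]}}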


def dump_dicts(path, dicts):
    """Pickle each dictionary from dicts, in order, into the file at path."""
    with open(path, "wb") as pickle_file:
        for d in dicts:
            dump(default_to_regular(d), pickle_file, HIGHEST_PROTOCOL)
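
# The pickles written by dump_dicts are read back in the same order with
# repeated load() calls, for instance (sketch, assuming the six dictionaries
# dumped at the end of main()):
#     from pickle import load
#     with open("file_dicts.pickle", "rb") as fh:
#         (rawdat2lib, lib2rawdat, rawdat2condrep,
#          condrep2rawdat, condrep2bw, bw2condrep) = (load(fh) for _ in range(6))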
# https://stackoverflow.com/a/1319675/1878788
class TooManyValues(ValueError):
    """Raised when a collection expected to hold one value holds several."""

    def __init__(self, message="Too many values.", value=None):
        super().__init__(message)
        self.value = value
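
# TooManyValues is kept for the (currently commented-out) raise in unlist();
# if that raise were re-enabled, a caller could still recover a value, e.g.
# (sketch):
#     try:
#         elem = unlist(candidates)
#     except TooManyValues as err:
#         elem = err.value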
# def listdd():
#     return defaultdict(list)


def unlist(l):
    """Return the single element of l.

    If l holds more than one distinct element, warn and return an
    arbitrary one."""
    try:
        (elem,) = set(l)
    except ValueError as err:
        elems_repr = "\n".join(map(str, l))
        warnings.warn(f"There is more than one element in l:\n{elems_repr}\n")
        warnings.warn("Selecting an arbitrary one.\n")
        (elem, *_) = set(l)
        # raise TooManyValues(value=elem) from err
    return elem
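
# Behaviour sketch (hypothetical values):
#     unlist(["libA", "libA"])  ->  "libA"
#     unlist(["libA", "libB"])  ->  warns, then returns one of the two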


def do_unlist(msg):
    """Return a variant of unlist that first prints msg on stderr."""
    def unlist_with_msg(l):
        print(msg, file=sys.stderr)
        if isinstance(l, str):
            raise ValueError(f"l is a string: {l}!\n")
        return unlist(l)
    return unlist_with_msg
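
# do_unlist is meant to be mapped over dictionary values, as done in main(),
# e.g. (hypothetical values):
#     valmap(do_unlist("vm rawdat2lib[RNA-seq]"),
#            {"/data/raw.fastq.gz": ["libA"]})
# prints the message on stderr and returns {"/data/raw.fastq.gz": "libA"}.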


def main():
    config = yload(open(sys.argv[1], "r"))
    ref_info = config["ref"]
    data_dir = ref_info["paper"]
    data_info = config["data"]
    LIBTYPES = list(data_info.keys())
    # Former hard-coded version of the information now loaded from libtype_info.yaml:
    # libtype_info = {
    #     "RNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "mapped_type": "on_{genome}",
    #             "norm_type": "protein_coding"
    #         }
    #     },
    #     "sRNA-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "sRNA-IP-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "orientation": "all",
    #             "read_type": "all_siRNA",
    #             "norm": "non_structural"
    #         }
    #     },
    #     "GRO-seq": {
    #         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/GRO-seq/GRO-seq.snakefile",
    #         "rule": "make_normalized_bigwig",
    #         "default_wildcards": {
    #             "trimmer": "cutadapt",
    #             "orientation": "all",
    #             "norm_type": "protein_coding"
    #         }
    #     }
    # }
    libtype_info = yload(open(
        OPJ(os.path.dirname(OPR(sys.argv[0])), "libtype_info.yaml"),
        "r"))
    assert set(LIBTYPES) <= set(libtype_info.keys())
    # TODO: make per-analysis subdicts?
    # Values are lists so that conflicting associations can be detected
    # instead of being silently overwritten.
    # {libtype: {rawdat: [library,]}}
    rawdat2lib = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # {libtype: {library: [rawdat,]}}
    lib2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # {libtype: {rawdat: [(library, cond, rep),]}}
    rawdat2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # {libtype: {(library, cond, rep): [rawdat,]}}
    condrep2rawdat = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # {libtype: {(library, cond, rep): [bw,]}}
    condrep2bw = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    # {libtype: {bw: [(library, cond, rep),]}}
    bw2condrep = {
        "Ribo-seq": defaultdict(list),
        "RNA-seq": defaultdict(list),
        "sRNA-seq": defaultdict(list),
        "sRNA-IP-seq": defaultdict(list),
        "GRO-seq": defaultdict(list),
    }
    vars_from_analyses = set()
    vars_from_loading_sf = set()
    issues = defaultdict(list)
    for (libtype, analyses) in data_info.items():
        # snakefile for this libtype
        analysis_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            ###
            # determine the pattern of the bigwig file path
            ###
            from_folder = OPR(analysis_info["from_folder"])
            # configuration for this analysis
            analysis_config = yload(open(analysis_info["config"], "r"))
            common_vars = set(analysis_config.keys()) & (
                set(dir()) - vars_from_analyses)
            assert not common_vars, f"Variable overwriting hazard!\n{common_vars}"
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with a configurable genome
            genome = analysis_config.get(
                "genome_dict",
                # default genome name will be "C_elegans"
                {"name": "C_elegans"})["name"]
            # Load the snakefile to get information
            # about the rule making bigwig files
            # Based on snakemake/__init__.py and snakemake/workflow.py
            # before_loading_vars = list(recdir())
            # before_loading_vars = dir()
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            # TODO: find out other ways to inspect rules from another workflow
            # or how to undefine rules
            # This overwrites a lib2raw dict!!
            # Probably other variables are set, likely from analysis_config
            sf.include(
                analysis_snakefile,
                overwrite_first_rule=True)
            sf.check()
            # after_loading_vars = dir()
            # after_loading_vars = list(recdir())
            # Use pattern matching so that this fails if the rule's output
            # does not contain exactly one bigwig file
            [bw_pattern] = sf._rules[bw_rulename].output
            # for varname in set(after_loading_vars) - set(before_loading_vars):
            #     # https://stackoverflow.com/a/26545111/1878788
            #     del globals()[varname]
            # for varname in dir():
            #     if varname.startswith("__rule"):
            #         print(varname)
            #         del globals()[varname]
            # Strip wildcard constraints from the pattern: "{wc,regex}" -> "{wc}"
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            raws_wanted = set()
            # An empty "libraries" entry is parsed as None,
            # which is not a dict and has no items to iterate.
            if analysis_info["libraries"]:
                # library: sample name used for submission
                for (library, rawdat) in analysis_info["libraries"].items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     # warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    rawdat = OPR(rawdat)
                    raws_wanted.add(rawdat)
                    rawdat2lib[libtype][rawdat].append(library)
                    lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated with a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     # warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    rawdat = OPR(rawdat)
                    if rawdat not in raws_wanted:
                        continue
                    # if rawdat in rawdat2lib:
                    # library (i.e. submission sample name)
                    # is used to disambiguate
                    [library, ] = rawdat2lib[libtype][rawdat]
                    # Record potential conflicts before appending
                    if len(rawdat2condrep[libtype][rawdat]):
                        issues[libtype].append(
                            (
                                analysis,
                                f"rawdat2condrep[{libtype}][{rawdat}]",
                                library, cond, rep,
                                "\n".join(map(str, rawdat2condrep[libtype][rawdat]))))
                    if len(condrep2rawdat[libtype][(library, cond, rep)]):
                        issues[libtype].append(
                            (
                                analysis,
                                f"condrep2rawdat[{libtype}][{(library, cond, rep)}]",
                                rawdat,
                                "\n".join(map(str, condrep2rawdat[libtype][(library, cond, rep)]))))
                    rawdat2condrep[libtype][rawdat].append((library, cond, rep))
                    condrep2rawdat[libtype][(library, cond, rep)].append(rawdat)
                    # Two-stage formatting: the first format fills lib, rep and
                    # the default wildcards (whose values may themselves
                    # contain "{genome}"), the second fills in the genome name.
                    bw = OPJ(from_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    try:
                        assert Path(bw).exists()
                    except AssertionError:
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise
                    condrep2bw[libtype][(library, cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((library, cond, rep))
        # print(libtype)
        for (rawdat, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(rawdat, file=sys.stderr)
                print(v, file=sys.stderr)
                if rawdat in rawdat2lib[libtype]:
                    warnings.warn(
                        f"Multiple {libtype} libraries for a single submitted raw data file.")
                else:
                    warnings.warn(
                        "Not sure we should worry about the above multiple "
                        f"{libtype} libraries: they do not seem to match "
                        "submitted raw data files.")
        if issues[libtype]:
            print(f"Some issues for {libtype}:", file=sys.stderr)
            for t in issues[libtype]:
                print(t, file=sys.stderr)
    ##
    # Transform lists into unique values
    ##
    for (libtype, d) in rawdat2lib.items():
        # rawdat2lib[libtype] = valmap(unlist, d)
        rawdat2lib[libtype] = valmap(do_unlist(f"vm rawdat2lib[{libtype}]"), d)
    for (libtype, d) in lib2rawdat.items():
        # lib2rawdat[libtype] = valmap(unlist, d)
        lib2rawdat[libtype] = valmap(do_unlist(f"vm lib2rawdat[{libtype}]"), d)
    for (libtype, d) in rawdat2condrep.items():
        # rawdat2condrep[libtype] = valmap(unlist, d)
        rawdat2condrep[libtype] = valmap(do_unlist(f"vm rawdat2condrep[{libtype}]"), d)
    for (libtype, d) in condrep2rawdat.items():
        # condrep2rawdat[libtype] = valmap(unlist, d)
        condrep2rawdat[libtype] = valmap(do_unlist(f"vm condrep2rawdat[{libtype}]"), d)
    for (libtype, d) in condrep2bw.items():
        # condrep2bw[libtype] = valmap(unlist, d)
        condrep2bw[libtype] = valmap(do_unlist(f"vm condrep2bw[{libtype}]"), d)
    for (libtype, d) in bw2condrep.items():
        # bw2condrep[libtype] = valmap(unlist, d)
        bw2condrep[libtype] = valmap(do_unlist(f"vm bw2condrep[{libtype}]"), d)
    os.makedirs(data_dir, exist_ok=True)
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat,
         rawdat2condrep, condrep2rawdat,
         condrep2bw, bw2condrep])
    return 0


if __name__ == "__main__":
    sys.exit(main())