# build_file_dicts.py
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""This script pickles python dictionaries that associate raw data files,
bigwig files and library names."""
import sys

# Bail out early on interpreters that lack the features used below
# (f-strings require at least Python 3.6).
major, minor = sys.version_info[:2]
if (major, minor) < (3, 6):
    sys.exit("Need at least python 3.6\n")
import os
import warnings
import re
# Short aliases for the path helpers used throughout this script.
OPJ = os.path.join
OPR = os.path.realpath
def formatwarning(message, category, filename, lineno, line=None):
    """Render a warning as a single ``file:line: Category: message`` line."""
    return f"{filename}:{lineno}: {category.__name__}: {message}\n"
# https://stackoverflow.com/a/22661838/1878788
# Always emit UserWarnings (no deduplication of repeats), and render all
# warnings with the one-line formatter defined above.
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning
from pickle import dump, HIGHEST_PROTOCOL
from pathlib import Path
from yaml import load as yload
from collections import defaultdict
from cytoolz import valmap
from snakemake import Workflow
def dump_dicts(path, dicts):
    """Serially pickle each dictionary of *dicts* into the file at *path*.

    The dictionaries are written one after the other with the highest
    available pickle protocol; they can be read back with repeated
    ``pickle.load`` calls on the same file object.
    """
    with open(path, "wb") as pickle_file:
        for mapping in dicts:
            dump(mapping, pickle_file, HIGHEST_PROTOCOL)
# https://stackoverflow.com/a/1319675/1878788
class TooManyValues(ValueError):
    """Raised when a collection expected to hold one value holds several.

    The arbitrarily selected value is kept in ``self.value`` so callers
    can recover it from the exception.
    """

    def __init__(self, message="Too many values.", value=None):
        self.value = value
        super().__init__(message)
def unlist(l):
    """Return the single distinct element of iterable *l*.

    If *l* contains several distinct elements, warn and return an
    arbitrary one of them.

    Raises:
        ValueError: if *l* is empty.
    """
    # Build the set once (the original computed it twice).
    unique = set(l)
    if not unique:
        raise ValueError(f"Cannot unlist an empty collection: {l}")
    if len(unique) > 1:
        warnings.warn(f"There are more than one element in {l}.\n")
        warnings.warn("Selecting an arbitrary one.\n")
    # Sets are unordered, so the element picked here is arbitrary.
    return next(iter(unique))
def main():
    """Build and pickle the file-association dictionaries.

    Reads the YAML configuration given as first command-line argument,
    loads each analysis' snakemake workflow to find the bigwig output
    pattern of its bigwig-making rule, then pickles six dictionaries
    linking raw data files, library names, (condition, replicate) pairs
    and bigwig files into ``<data_dir>/file_dicts.pickle``.

    Returns 0 on success; assertion failures abort with a traceback.
    """
    # NOTE(review): yload is PyYAML's load without an explicit Loader;
    # it is deprecated and unsafe on untrusted input -- consider
    # yaml.safe_load if the configuration files allow it.
    with open(sys.argv[1], "r") as config_file:
        config = yload(config_file)
    ref_info = config["ref"]
    # Destination directory for the pickled dictionaries.
    data_dir = ref_info["paper"]
    data_info = config["data"]
    LIBTYPES = list(data_info.keys())
    # Static per-libtype information (snakefile path, name of the rule
    # producing bigwig files, default wildcard values), looked up in a
    # YAML file stored next to this script.
    with open(OPJ(os.path.dirname(OPR(sys.argv[0])),
                  "libtype_info.yaml"), "r") as libtype_file:
        libtype_info = yload(libtype_file)
    assert set(LIBTYPES) <= set(libtype_info.keys())

    def new_libtype_dict():
        """Fresh {libtype: defaultdict(list)} mapping for one relation."""
        return {libtype: defaultdict(list) for libtype in (
            "RNA-seq", "sRNA-seq", "sRNA-IP-seq", "GRO-seq")}

    # We use lists so that we can check that lists have only one element
    # instead of overwriting values.
    # dict of dicts {rawdat: [library,]}
    rawdat2lib = new_libtype_dict()
    # dict of dicts {library: [rawdat,]}
    lib2rawdat = new_libtype_dict()
    # dict of dicts {rawdat: [(cond, rep),]}
    rawdat2condrep = new_libtype_dict()
    # dict of dicts {(cond, rep): [rawdat,]}
    condrep2rawdat = new_libtype_dict()
    # dict of dicts {(cond, rep): [bw,]}
    condrep2bw = new_libtype_dict()
    # dict of dicts {bw: [(cond, rep),]}
    bw2condrep = new_libtype_dict()
    # Names introduced by analysis configurations so far; used to detect
    # collisions between configuration keys and this function's locals.
    vars_from_analyses = set()
    for (libtype, analyses) in data_info.items():
        # snakefile for this libtype
        analysis_snakefile = libtype_info[libtype]["snakefile"]
        # name of the rule that produces bigwig files
        bw_rulename = libtype_info[libtype]["rule"]
        for (analysis, analysis_info) in analyses.items():
            ###
            # determine pattern of the bigwig file path
            ###
            from_folder = OPR(analysis_info["from_folder"])
            # configuration for this analysis
            with open(analysis_info["config"], "r") as analysis_config_file:
                analysis_config = yload(analysis_config_file)
            common_vars = set(analysis_config.keys()) & (set(dir()) - vars_from_analyses)
            assert not common_vars, f"Variable overwriting hazard!\n{common_vars}"
            vars_from_analyses |= set(analysis_config.keys())
            # This may be needed for analysis types with configurable genome
            genome = analysis_config.get(
                "genome_dict",
                # default genome name will be "C_elegans"
                {"name": "C_elegans"})["name"]
            # Load the snakefile to get information
            # about the rule making bigwig files.
            # Based on snakemake/__init__.py and snakemake/workflow.py
            sf = Workflow(
                analysis_snakefile,
                overwrite_config=analysis_config)
            # TODO: find out other ways to inspect rules from another
            # workflow, or how to undefine rules.
            # Including the snakefile has side effects: it overwrites a
            # lib2raw dict, and probably sets other variables, likely
            # from analysis_config.
            sf.include(
                analysis_snakefile,
                overwrite_first_rule=True)
            sf.check()
            # Use pattern matching so that it fails if we have not exactly
            # one bigwig file in the output of the rule
            [bw_pattern] = sf._rules[bw_rulename].output
            # Strip snakemake wildcard constraints ("{name,regex}" ->
            # "{name}") so that the pattern can be filled with str.format.
            bw_pattern = re.sub(",[^}]*}", "}", bw_pattern)
            ###
            # determine raw data files for this analysis
            ###
            for (library, rawdat) in analysis_info["libraries"].items():
                assert Path(rawdat).exists()
                # Resolve symlinks so a given file is always keyed by its
                # canonical path.
                rawdat = OPR(rawdat)
                rawdat2lib[libtype][rawdat].append(library)
                lib2rawdat[libtype][library].append(rawdat)
            ###
            # determine (cond, rep) pairs associated to a given raw file
            ###
            for (cond, reps) in analysis_config["lib2raw"].items():
                for (rep, rawdat) in reps.items():
                    assert Path(rawdat).exists()
                    rawdat = OPR(rawdat)
                    rawdat2condrep[libtype][rawdat].append((cond, rep))
                    condrep2rawdat[libtype][(cond, rep)].append(rawdat)
                    # Fill the bigwig pattern in two passes: first the
                    # library wildcards, then the genome name, which may
                    # itself occur inside a default wildcard value (e.g.
                    # mapped_type "on_{genome}").
                    bw = OPJ(from_folder, bw_pattern.format(
                        lib=cond, rep=rep,
                        **libtype_info[libtype]["default_wildcards"]).format(
                            genome=genome))
                    try:
                        assert Path(bw).exists()
                    except AssertionError:
                        # Show enough context to debug a missing bigwig.
                        print(bw_pattern)
                        print(bw)
                        print(OPR(bw))
                        raise
                    condrep2bw[libtype][(cond, rep)].append(bw)
                    bw2condrep[libtype][bw].append((cond, rep))
        # Warn about raw files associated with several (cond, rep) pairs.
        for (k, v) in rawdat2condrep[libtype].items():
            if len(v) > 1:
                print(k)
                print(v)
                # Bug fix: the original tested ``k in rawdat2lib``, whose
                # keys are library types, so the test was always False and
                # the first warning could never fire. The intent is to
                # check whether this raw file is among the submitted ones
                # for this library type.
                if k in rawdat2lib[libtype]:
                    warnings.warn(f"Multiple {libtype} libraries for a submitted one.")
                else:
                    warnings.warn(f"Not sure we should worry about the above multiple {libtype} libraries: They do not seem to match submitted raw data files.")
    ##
    # Transform lists into unique values
    ##
    # unlist warns and picks an arbitrary element when a list holds more
    # than one distinct value.
    for file_dict in (rawdat2lib, lib2rawdat,
                      rawdat2condrep, condrep2rawdat,
                      condrep2bw, bw2condrep):
        for (k, d) in file_dict.items():
            file_dict[k] = valmap(unlist, d)
    os.makedirs(data_dir, exist_ok=True)
    dump_dicts(
        OPJ(data_dir, "file_dicts.pickle"),
        [rawdat2lib, lib2rawdat,
         rawdat2condrep, condrep2rawdat,
         condrep2bw, bw2condrep])
    return 0


if __name__ == "__main__":
    sys.exit(main())