"""
This snakefile helps prepare data for submission to GEO.

It will gather raw data and bigWig files in one submission directory for each
library type defined in the configuration file, and prepare tsv files with
information about those files, including their md5sums.

Some information about the submission process (may not be up-to-date):

<https://www.ncbi.nlm.nih.gov/geo/info/submissionftp.html>
Overview

This document contains details about using FTP to transfer your files to GEO.

    1. You must be logged in to your GEO account to see the GEO FTP server credentials below.
    2. Gather all required submission files prepared according to the Hints and tips below. Your transfer should include all required components (raw data files, processed data files and metadata spreadsheet). Start at the Submitting data page for full submission requirements.
    3. On your computer, create a folder named using your GEO username (/cecerelab). Put all required submission files into this folder.
    4. Transfer the folder to the GEO FTP server using the credentials below. Do not transfer files unless you are confident that you have a complete submission that includes all required components (raw data files, processed data files and metadata spreadsheet).
    5. After the FTP transfer is complete, you must notify GEO using the Submit to GEO web form. We cannot start processing your submission until the transfer is complete and we have received all required components.

Hints and tips

    - Please contact us in advance if your submission exceeds 1 terabyte in size. Do not initiate transfer until you hear back from GEO.
    - Your upload should include three components: (1) raw data files, (2) processed data files, and (3) completed Metadata Template. Start at the Submitting data page for full submission requirements.
    - DO NOT bundle files into archives (.zip/.rar/.tar/etc) before transferring (OK for smaller microarray submissions). For next-generation sequencing submissions, and larger microarray submissions (exceeding 4GB), we recommend that you place all submission files into a single directory named according to your GEO username, and then recursively transfer the directory to our FTP server (see below examples).
    - Avoid whitespace and special characters in file names. Use only alphanumerals [A-Z, a-z, 0-9], underscores [_] and dots [.].
    - File names should be unique and specific. Generic names (e.g. ‘S_1_1.fq’) should be avoided to prevent overwriting identically-named files.
    - For large, non-binary raw data files (e.g. FASTQ) we recommend (but do not require) gzip (*.gz) or bzip2 (*.bz2) compression to speed transfer.
    - DO NOT use gz- or bzip2-compression on binary files (.BigWig, .bw, .bigBed, .bb, .h5, .bam, .tdf etc).
    - For high-throughput sequencing submissions, we recommend providing the MD5 checksums for the files that you are uploading (details below).
    - Please use 'passive mode' when transferring files. Optimal buffer size is ~32 MB.
    - File names on the server are visible to other submitters, but content is accessible only to GEO curators.
    - The FTP server is a temporary storage space. Files will be moved by curators to an internal location for processing and assigning of accessions.
    - You will not be able to rename or remove files after uploading. Please contact us if you need assistance.
    - Un-announced files will be automatically deleted from the server after two weeks.
    - Files deposited on the FTP site are not displayed under 'My Submissions' on the web interface. The web interface only displays accessioned submissions.

FTP server credentials
host:     ftp-private.ncbi.nlm.nih.gov
username: geo
password: 33%9uyj_fCh?M16H
url:      ftp://geo:33%259uyj_fCh%3FM16H@ftp-private.ncbi.nlm.nih.gov

Note: all submitters use the same login to access the FTP server (you cannot use your GEO username/password to connect to the FTP server).
"""
import sys


if sys.version_info[:2] < (3, 6):
    sys.exit("Need at least python 3.6\n")


import os
import warnings

OPB = os.path.basename
OPR = os.path.realpath
OPJ = os.path.join


def formatwarning(message, category, filename, lineno, line=None):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


# https://stackoverflow.com/a/22661838/1878788
warnings.simplefilter('always', UserWarning)
warnings.formatwarning = formatwarning

from collections import defaultdict
from pickle import load as pload
from gzip import open as gzopen
from yaml import safe_load as yload
from pathlib import Path
from cytoolz import valmap
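

# The docstring above describes a manual FTP transfer. As an optional,
# untested sketch (nothing in this workflow calls it), the upload could also
# be scripted with the standard-library ftplib. The credentials are the
# shared ones from the docstring; the flat-directory handling and the
# 32 MB block size (GEO's recommended buffer size) are assumptions.
from ftplib import FTP


def upload_submission(local_dir,
                      host="ftp-private.ncbi.nlm.nih.gov",
                      user="geo", password="33%9uyj_fCh?M16H"):
    """Upload every file in *local_dir* to a same-named directory
    on the GEO FTP server."""
    dirname = os.path.basename(os.path.normpath(local_dir))
    with FTP(host) as ftp:
        ftp.login(user=user, passwd=password)
        ftp.set_pasv(True)  # GEO asks for 'passive mode' (ftplib's default)
        ftp.mkd(dirname)
        ftp.cwd(dirname)
        for name in sorted(os.listdir(local_dir)):
            path = os.path.join(local_dir, name)
            if os.path.isfile(path):
                with open(path, "rb") as fh:
                    ftp.storbinary(
                        f"STOR {name}", fh, blocksize=32 * 1024 * 1024)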


def unlist(l):
    """Return the single distinct element of *l*.

    If *l* contains more than one distinct value, warn and return an
    arbitrary one."""
    try:
        (elem,) = set(l)
    except ValueError as err:
        warnings.warn(f"There is more than one element in {l}.\n")
        warnings.warn("Selecting an arbitrary one.\n")
        (elem, *_) = set(l)
        # raise TooManyValues(value=elem) from err
    return elem
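
# For instance, unlist(["a", "a"]) returns "a", while unlist(["a", "b"])
# warns and returns one of the two values (which one is unspecified,
# since the choice comes from a set).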



# If we wanted to have some rules executed using a singularity container:
# simg = config.get("simg", "/opt/bioinfo_utils/bin/run_pipeline")
ref_info = config["ref"]
paper = ref_info["paper"]
submission_dir = ref_info["NCBI_submitter"]

gather_processed_data = config.get("gather_processed_tables", False)
if not gather_processed_data:
    print("Only raw files will be gathered.")
data_info = config["data"]
LIBTYPES = list(data_info.keys())

# libtype_info = {
#     "RNA-seq": {
#         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/RNA-seq/RNA-seq.snakefile",
#         "rule": "make_normalized_bigwig",
#         "default_wildcards": {
#             "orientation": "all",
#             "mapped_type": "on_{genome}",
#             "norm_type": "protein_coding"
#         }
#     },
#     "sRNA-seq": {
#         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
#         "rule": "make_normalized_bigwig",
#         "default_wildcards": {
#             "orientation": "all",
#             "read_type": "all_siRNA",
#             "norm": "non_structural"
#         }
#     },
#     "sRNA-IP-seq": {
#         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/sRNA-seq/sRNA-seq.snakefile",
#         "rule": "make_normalized_bigwig",
#         "default_wildcards": {
#             "orientation": "all",
#             "read_type": "all_siRNA",
#             "norm": "non_structural"
#         }
#     },
#     "GRO-seq": {
#         "snakefile": "/pasteur/homes/bli/src/bioinfo_utils/GRO-seq/GRO-seq.snakefile",
#         "rule": "make_normalized_bigwig",
#         "default_wildcards": {
#             "trimmer": "cutadapt",
#             "orientation": "all",
#             "norm_type": "protein_coding"
#         }
#     }
# }
libtype_info = yload(open(
    OPJ(os.path.dirname(OPR(workflow.snakefile)), "libtype_info.yaml"),
    "r"))

msg = f"LIBTYPES: {LIBTYPES}\nlibtype_info.keys(): {libtype_info.keys()}\n"
assert set(LIBTYPES) <= set(libtype_info.keys()), msg

if gather_processed_data:
    # Should be generated outside of the python code of the workflow
    # because loading other snakefiles seems to cause interferences.
    # shell("build_file_dicts.py config/Barucci_et_al_2018.yaml")
    #with open(OPJ(paper, "file_dicts.pickle"), "rb") as pickle_file:
    with open("file_dicts.pickle", "rb") as pickle_file:
        [
            # Not used
            # {libtype: {analysis: {rawdat: library}}}
            rawdat2lib,
            # {libtype: {analysis: {library: rawdat}}}
            lib2rawdat,
            # Several (cond, rep) pairs may use the same raw file
            # An arbitrary pair is used
            # Now a triple (library, cond, rep) is used, so that
            # there should be no more ambiguity
            # {libtype: {analysis: {rawdat: (library, cond, rep)}}}
            rawdat2condrep,
            # Not used
            # {libtype: {analysis: {(library, cond, rep): rawdat}}}
            condrep2rawdat,
            # {libtype: {analysis: {(library, cond, rep): bw}}}
            condrep2bw,
            # Not used
            # {libtype: {analysis: {bw: (library, cond, rep)}}}
            bw2condrep] = [pload(pickle_file) for _ in range(6)]
    def source_bw(wildcards):
        try:
            rawdat = lib2rawdat[wildcards.libtype][wildcards.library]
        except KeyError as err:
            print(*lib2rawdat[wildcards.libtype].keys(), sep="\n")
            raise
        (library, cond, rep) = rawdat2condrep[wildcards.libtype][rawdat]
        return os.path.abspath(condrep2bw[wildcards.libtype][(library, cond, rep)])
else:
    # dict of dicts {libtype: {library: [rawdat,]}}
    # Built from the configured library types so that it cannot fall out
    # of sync with the configuration file.
    lib2rawdat = {libtype: defaultdict(list) for libtype in LIBTYPES}
    rawdat2lib = None
    rawdat2condrep = None
    condrep2rawdat = None
    condrep2bw = None
    bw2condrep = None
    for (libtype, analyses) in data_info.items():
        for (analysis, analysis_info) in analyses.items():
            if analysis_info["libraries"]:
                for (library, rawdat) in analysis_info["libraries"].items():
                    assert Path(rawdat).exists()
                    # if os.path.islink(rawdat):
                    #     #warnings.warn(f"{rawdat} is a link.\n")
                    #     rawdat = os.readlink(rawdat)
                    rawdat = OPR(rawdat)
                    lib2rawdat[libtype][library].append(rawdat)
    for (k, d) in lib2rawdat.items():
        # Collapse each library's list of raw files to a single path
        # (valmap applies unlist to every value of d).
        lib2rawdat[k] = valmap(unlist, d)
    def source_bw(wildcards):
        return []


def determine_links():
    """Yield, for each library, the per-libtype symlinks and md5sum files
    that rule all should request."""
    links_in_lib_type = set()
    for (libtype, analyses) in data_info.items():
        wc_dict = libtype_info[libtype]["default_wildcards"]
        norm_type = wc_dict.get("norm_type", wc_dict.get("norm", wc_dict.get("normalizer")))
        aligned_type = wc_dict.get("read_type", "reads")
        for (analysis, analysis_info) in analyses.items():
            if analysis_info["libraries"]:
                for (library, rawdat) in analysis_info["libraries"].items():
                    assert Path(rawdat).exists()
                    link_in_lib_type = OPJ(f"{paper}_{libtype}", f"{library}.fastq.gz")
                    assert link_in_lib_type not in links_in_lib_type, f"Name conflict for {link_in_lib_type}."
                    links_in_lib_type.add(link_in_lib_type)
                    if gather_processed_data:
                        link_bw_in_lib_type = OPJ(f"{paper}_{libtype}", f"{library}_{aligned_type}_by_{norm_type}.bw")
                        yield (
                            link_in_lib_type,
                            link_bw_in_lib_type,
                            # link_raw_data
                            #OPJ(paper, libtype, analysis, f"{library}.fastq.gz"),
                            # compute_md5sum
                            OPJ(paper, libtype, analysis, f"{library}.fastq.gz.md5"),
                            # link_bigwig
                            #OPJ(paper, libtype, analysis, "processed_files", f"{library}_{aligned_type}_by_{norm_type}.bw"),
                            # compute_bw_md5sum
                            OPJ(paper, libtype, analysis, "processed_files", f"{library}_{aligned_type}_by_{norm_type}.bw.md5"))
                    else:
                        yield (
                            link_in_lib_type,
                            # compute_md5sum
                            OPJ(paper, libtype, analysis, f"{library}.fastq.gz.md5"))


wildcard_constraints:
    libtype = "|".join(LIBTYPES),
    library = "|".join([k for d in lib2rawdat.values() for k in d.keys()]),

raw_tables = expand(OPJ(paper, "{libtype}", "raw.tsv"), libtype=LIBTYPES)
if gather_processed_data:
    processed_tables = expand(OPJ(paper, "{libtype}", "processed.tsv"), libtype=LIBTYPES)
else:
    processed_tables = []

rule all:
    input:
        # tsv files with content to copy-paste in the "RAW FILES" section of the submission spreadsheet
        raw_tables,
        processed_tables,
        list(zip(*determine_links())),


def lib2data(wildcards):
    return os.path.abspath(lib2rawdat[wildcards.libtype][wildcards.library])
    # return data_info[wildcards.libtype][wildcards.analysis]["libraries"][wildcards.library]


rule link_raw_data:
    """This rule installs the raw data in a local directory using symlinks.
    The location of the original files is taken from the configuration."""
    input:
        raw = lib2data,
    output:
        link = OPJ(paper, "{libtype}", "{analysis}", "{library}.fastq.gz")
    #wildcard_constraints:
    #    analysis = f"(?!{submission_dir}).*"
    message:
        "Making link {output.link} to raw data {input.raw}."
    run:
        os.symlink(input.raw, output.link)


rule compute_md5sum:
    """This rule installs the raw data in a local directory using symlinks.
    The location of the original files is taken from the configuration."""
    input:
        link = rules.link_raw_data.output.link,
    output:
        md5 = OPJ(paper, "{libtype}", "{analysis}", "{library}.fastq.gz.md5")
    message:
        "Computing md5sum for {input.link}."
    shell:
        """
        md5sum {input.link} > {output.md5}
        """


# TODO: Complete with new instruments or do some guess work based on
# https://en.wikipedia.org/wiki/FASTQ_format#Illumina_sequence_identifiers
# and
# https://www.biostars.org/p/198143/#198150
# and other source of information (such as https://github.com/10XGenomics/supernova/blob/master/tenkit/lib/python/tenkit/illumina_instrument.py#L12-L45).
instrument2model = {
    "@NS500199": "Illumina NextSeq 500",
    "@MN00481": "Illumina MiniSeq",
    "@NB501291": "Illumina NextSeq 500"}


def fq_info(fqgz):
    """Determines information about the source and content of a fastq file.
    Based on the header of the first read in the file.
    Currently only works with some specific Illumina-generated files."""
    with gzopen(fqgz) as fq_file:
        # Some headers have no description after the sequence ID,
        # hence *_ rather than unpacking exactly two fields.
        (seq_id, *_) = fq_file.readline().strip().decode("utf-8").split(" ")
        # read_len = len(fq_file.readline().strip().decode("utf-8"))
    try:
        [instrument, run, flowcell, lane, tile, x, y, mate] = seq_id.split(":")
    except ValueError:
        [instrument, run, flowcell, lane, tile, x, y] = seq_id.split(":")
        mate = None
    model = instrument2model[instrument]
    if mate is None:
        #return (model, str(read_len), "single")
        return (model, "single")
    else:
        #return (model, str(read_len), "paired-end")
        return (model, "paired-end")


def lib_type2md5s(wildcards):
    """Find the md5sum files corresponding to raw data defined by *wildcards*."""
    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
        if analysis_info["libraries"]:
            for library in analysis_info["libraries"]:
                yield OPJ(
                    paper,
                    f"{wildcards.libtype}",
                    f"{analysis}",
                    f"{library}.fastq.gz.md5")


def lib_type2read_lens(wildcards):
    """Find the read lengths (number of sequencing cycles) corresponding to raw data defined by *wildcards*."""
    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
        if analysis_info["libraries"]:
            # Loop to have as many read lengths as there are md5 files (from lib_type2md5s)
            for _ in analysis_info["libraries"]:
                yield analysis_info["read_len"]


rule prepare_raw_data_lines:
    input:
        md5s = lib_type2md5s,
    output:
        tsv = OPJ(paper, "{libtype}", "raw.tsv")
    params:
        read_lens = lib_type2read_lens,
    message:
        """Preparing info to copy-paste in the RAW FILES section of the submission spreadsheet in:
        {paper}/{wildcards.libtype}/raw.tsv"""
    run:
        with open(output.tsv, "w") as tsv_file:
            for (md5_filename, read_len) in zip(input.md5s, params.read_lens):
                with open(md5_filename, "r") as md5_file:
                    for line in md5_file:
                        try:
                            (md5sum, raw_path) = line.strip().split()
                        except ValueError:
                            warnings.warn(
                                f"There is an issue with the following line from {md5_filename}:\n")
                            print(line)
                            raise
                        # (model, read_len, pairness) = fq_info(raw_path)
                        (model, pairness) = fq_info(raw_path)
                        tsv_file.write(
                            f"{OPB(raw_path)}\tfastq\t{md5sum}\t{model}\t{read_len}\t{pairness}\n")


def get_link_for_analysis(wildcards):
    """Determine source for the the create_link_for_lib_type rule."""
    potential_links = set()
    source_links = set()
    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
        if analysis_info["libraries"] and wildcards.library in analysis_info["libraries"]:
            source_link = OPJ(
                paper,
                f"{wildcards.libtype}",
                f"{analysis}",
                f"{wildcards.library}.fastq.gz")
            potential_link = OPJ(
                f"{paper}_{wildcards.libtype}",
                f"{wildcards.library}.fastq.gz")
            assert potential_link not in potential_links, f"Name conflict for {wildcards.library}."
            potential_links.add(potential_link)
            source_links.add(source_link)
    (source_link, ) = source_links
    "Cornes_et_al_2021/RNA-seq/1/N2_YA_rep1_RNA-seq.fastq.gz"
    return source_link


rule create_link_for_lib_type:
    input:
        link = get_link_for_analysis,
    output:
        link_in_lib_type = OPJ(f"{paper}_{{libtype}}", "{library}.fastq.gz"),
    run:
        os.symlink(os.path.abspath(input.link), output.link_in_lib_type)


rule link_bigwig:
    input:
        bw = source_bw,
    output:
        link = OPJ(paper, "{libtype}", "{analysis}", "processed_files", "{library}_{aligned_type}_by_{norm_type}.bw")
    run:
        os.symlink(os.path.abspath(input.bw), output.link)

rule compute_bw_md5sum:
    """This rule installs the raw data in a local directory using symlinks.
    The location of the original files is taken from the configuration."""
    input:
        link = rules.link_bigwig.output.link,
    output:
        md5 = OPJ(paper, "{libtype}", "{analysis}", "processed_files", "{library}_{aligned_type}_by_{norm_type}.bw.md5")
    message:
        "Computing md5sum for {input.link}."
    shell:
        """
        md5sum {input.link} > {output.md5}
        """


def get_link_bw_for_analysis(wildcards):
    """Determine source for the the create_link_bw_for_lib_type rule."""
    potential_links = set()
    source_links = set()
    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
        if analysis_info["libraries"] and wildcards.library in analysis_info["libraries"]:
            source_link = OPJ(
                paper,
                f"{wildcards.libtype}",
                f"{analysis}",
                "processed_files",
                f"{wildcards.library}_{wildcards.aligned_type}_by_{wildcards.norm_type}.bw")
            potential_link = OPJ(
                f"{paper}_{wildcards.libtype}",
                f"{wildcards.library}_{wildcards.aligned_type}_by_{wildcards.norm_type}.bw")
            assert potential_link not in potential_links, f"Name conflict for {wildcards.library}."
            potential_links.add(potential_link)
            source_links.add(source_link)
    (source_link, ) = source_links
    return source_link


rule create_link_bw_for_lib_type:
    input:
        link = get_link_bw_for_analysis,
    output:
        link_bw_in_lib_type = OPJ(f"{paper}_{{libtype}}", "{library}_{aligned_type}_by_{norm_type}.bw"),
    run:
        os.symlink(os.path.abspath(input.link), output.link_bw_in_lib_type)


def lib_type2bw_md5s(wildcards):
    """Find the md5sum files corresponding to raw data defined by *wildcards*."""
    wc_dict = libtype_info[wildcards.libtype]["default_wildcards"]
    norm_type = wc_dict.get("norm_type", wc_dict.get("norm", wc_dict.get("normalizer")))
    aligned_type = wc_dict.get("read_type", "reads")
    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
        if analysis_info["libraries"]:
            for library in analysis_info["libraries"]:
                yield OPJ(
                    paper,
                    f"{wildcards.libtype}",
                    f"{analysis}",
                    "processed_files",
                    f"{library}_{aligned_type}_by_{norm_type}.bw.md5")


rule prepare_processed_data_lines:
    input:
        md5s = lib_type2bw_md5s,
    output:
        tsv = OPJ(paper, "{libtype}", "processed.tsv")
    message:
        """Preparing info to copy-paste in the PROCESSED DATA FILES section of the submission spreadsheet in:
        {paper}/{wildcards.libtype}/raw.tsv"""
    run:
        with open(output.tsv, "w") as tsv_file:
            for md5 in input.md5s:
                with open(md5, "r") as md5_file:
                    for line in md5_file:
                        try:
                            (md5sum, raw_path) = line.strip().split()
                        except ValueError:
                            print(line)
                            raise
                        tsv_file.write(f"{OPB(raw_path)}\tbigWig\t{md5sum}\n")