Skip to content
Snippets Groups Projects
Select Git revision
  • 4fb8a6f1ddeea0ffc18d610b869b46d9fcba2d7b
  • master default protected
2 results

Data_submission.snakefile

Blame
  • Data_submission.snakefile 9.12 KiB
    """
    <https://www.ncbi.nlm.nih.gov/geo/info/submissionftp.html>
    Overview
    Back to top
    
    This document contains details about using FTP to transfer your files to GEO.
    
        You must be logged in to your GEO account to see the GEO FTP server credentials below.
        Gather all required submission files prepared according to the Hints and tips below. Your transfer should include all required components (raw data files, processed data files and metadata spreadsheet). Start at the Submitting data page for full submission requirements.
        On your computer, create a folder named using your GEO username (/cecerelab). Put all required submission files into this folder.
        Transfer the folder to the GEO FTP server using the credentials below. Do not transfer files unless you are confident that you have a complete submission that includes all required components (raw data files, processed data files and metadata spreadsheet).
        After the FTP transfer is complete, you must notify GEO using the Submit to GEO web form. We cannot start processing your submission until the transfer is complete and we have received all required components.
    
    Hints and tips
    Back to top
    
        Please contact us in advance if your submission exceeds 1 terabyte in size. Do not initiate transfer until you hear back from GEO.
        Your upload should include three components: (1) raw data files, (2) processed data files, and (3) completed Metadata Template. Start at the Submitting data page for full submission requirements.
        DO NOT bundle files into archives (.zip/.rar/.tar/etc) before transferring (OK for smaller microarray submissions). For next-generation sequencing submissions, and larger microarray submissions (exceeding 4GB), we recommend that you place all submission files into a single directory named according to your GEO username, and then recursively transfer the directory to our FTP server (see below examples).
        Avoid whitespace and special characters in file names. Use only alphanumerals [A-Z, a-z, 0-9], underscores [_] and dots [.].
        File names should be unique and specific. Generic names (e.g. ‘S_1_1.fq’) should be avoided to prevent overwriting identically-named files.
        For large, non-binary raw data files (e.g. FASTQ) we recommend (but do not require) gzip (*.gz) or bzip2 (*.bz2) compression to speed transfer.
        DO NOT use gz- or bzip2-compression on binary files (.BigWig, .bw, .bigBed, .bb, .h5, .bam, .tdf etc).
        For high-throughput sequencing submissions, we recommend providing the MD5 checksums for the files that you are uploading (details below).
        Please use 'passive mode' when transferring files. Optimal buffer size is ~32 MB.
        File names on the server are visible to other submitters, but content is accessible only to GEO curators.
        The FTP server is a temporary storage space. Files will be moved by curators to an internal location for processing and assigning of accessions.
        You will not be able to rename or remove files after uploading. Please contact us if you need assistance.
        Files deposited on the FTP site are not displayed under 'My Submissions' on the web interface. The web interface only displays accessioned submissions.
        Un-announced files will be automatically deleted from the server after two weeks.
    
    FTP server credentials
    Back to top
    host	ftp-private.ncbi.nlm.nih.gov
    username	geo
    password	33%9uyj_fCh?M16H
    url	ftp://geo:33%259uyj_fCh%3FM16H@ftp-private.ncbi.nlm.nih.gov
    
    Note: all submitters use the same login to access the FTP server (you cannot use your GEO username/password to connect to the FTP server).
    SECURITY NOTE (review): the shared FTP credentials above are committed to version control; consider rotating them and moving them to an untracked configuration file.
    """
    import sys


    # Abort early on unsupported interpreters: the f-strings used throughout
    # this file require Python >= 3.6. `major` and `minor` stay bound at
    # module level for compatibility with any later use.
    major, minor = sys.version_info[:2]
    if (major, minor) < (3, 6):
        sys.exit("Need at least python 3.6\n")
    
    
    import os
    import warnings

    # Short aliases for the path helpers used throughout this file.
    OPB = os.path.basename
    OPJ = os.path.join
    
    
    def formatwarning(message, category, filename, lineno, line=None):
        """Format a warning as a single "<file>:<line>: <Category>: <message>" line.

        The signature matches `warnings.formatwarning` so this function can be
        assigned as a drop-in replacement; `line` is accepted but ignored.
        """
        return f"{filename}:{lineno}: {category.__name__}: {message}\n"
    
    
    # Use the terse one-line formatter defined above for all warnings
    # emitted by this workflow.
    warnings.formatwarning = formatwarning

    from pathlib import Path
    from yaml import load as yload
    from collections import defaultdict
    
    
    # `config` is presumably injected by snakemake (configfile/--configfile).
    ref_info = config["ref"]
    # Base directory under which submission data is assembled.
    data_dir = ref_info["paper"]
    # Name of the per-lib_type sub-directory holding the links to upload.
    submission_dir = ref_info["NCBI_submitter"]

    # lib_type -> analysis -> {"libraries": {library: raw_data_path}, ...}
    data_info = config["data"]
    LIB_TYPES = list(data_info.keys())
    
    
    def determine_fqgz_and_md5file():
        """Yield (submission_link, fastq_gz, md5) path triples for every library.

        For each library declared in the config, yields:
          - the symlink path to create under the per-lib_type submission dir,
          - the per-analysis fastq.gz link path,
          - the path of its .md5 checksum file.

        Raises:
            FileNotFoundError: if a configured raw data file does not exist.
            ValueError: if two libraries would produce the same submission link.
        """
        links_in_lib_type = set()
        for (lib_type, analyses) in data_info.items():
            for (analysis, analysis_info) in analyses.items():
                for (library, raw_data) in analysis_info["libraries"].items():
                    # Validate eagerly so a missing raw file aborts before any
                    # target is scheduled (a bare assert would vanish under -O).
                    if not Path(raw_data).exists():
                        raise FileNotFoundError(f"Raw data not found: {raw_data}")
                    link_in_lib_type = OPJ(
                        data_dir, lib_type, submission_dir, f"{library}.fastq.gz")
                    if link_in_lib_type in links_in_lib_type:
                        raise ValueError(f"Name conflict for {link_in_lib_type}.")
                    links_in_lib_type.add(link_in_lib_type)
                    yield (
                        link_in_lib_type,
                        OPJ(data_dir, lib_type, analysis, f"{library}.fastq.gz"),
                        OPJ(data_dir, lib_type, analysis, f"{library}.fastq.gz.md5"))
    
    
    rule all:
        input:
            # tsv files with content to copy-paste in the "RAW FILES" section of the submission spreadsheet
            expand(OPJ(data_dir, "{lib_type}", "raw.tsv"), lib_type=LIB_TYPES),
            # Transpose the generator's (submission_link, fqgz, md5) triples
            # into three tuples so every submission link, fastq.gz link and
            # md5 file is requested as a final target.
            list(zip(*determine_fqgz_and_md5file())),
    
    
    def lib2data(wildcards):
        """Input function: look up a library's raw data path in the config."""
        lib_type = wildcards.lib_type
        analysis = wildcards.analysis
        libraries = data_info[lib_type][analysis]["libraries"]
        return libraries[wildcards.library]
    
    
    rule link_raw_data:
        """This rule installs the raw data in a local directory using symlinks.
        The location of the original files is taken from the configuration."""
        input:
            raw = lib2data,
        output:
            link = OPJ(data_dir, "{lib_type}", "{analysis}", "{library}.fastq.gz")
        wildcard_constraints:
            # Exclude the submission directory: links there are created by
            # create_link_for_lib_type, not by this rule.
            analysis = f"(?!{submission_dir}).*"
        message:
            "Making link {output.link} to raw data {input.raw}."
        run:
            # Ensure links are resolved in order to be able to match
            #if os.path.islink(input.raw):
            #    warnings.warn(f"{input.raw} is a link.\n")
            #    raw = os.readlink(input.raw)
            #else:
            #    raw = input.raw
            # NOTE(review): os.symlink fails with FileExistsError if the link
            # already exists; snakemake normally removes stale outputs first.
            os.symlink(os.path.abspath(input.raw), output.link)
    
    
    rule compute_md5sum:
        """Compute the md5 checksum of a raw-data link, for inclusion in the
        submission (GEO recommends providing MD5 checksums for uploads)."""
        input:
            link = rules.link_raw_data.output.link,
        output:
            md5 = OPJ(data_dir, "{lib_type}", "{analysis}", "{library}.fastq.gz.md5")
        message:
            "Computing md5sum for {input.link}."
        shell:
            """
            md5sum {input.link} > {output.md5}
            """
    
    
    def lib_type2md5s(wildcards):
        """Input function for `prepare_raw_data_lines`.

        Yields the path of every md5 checksum file belonging to the
        wildcard's lib_type, across all of its analyses.
        """
        for (analysis, analysis_info) in config["data"][wildcards.lib_type].items():
            for library in analysis_info["libraries"]:
                yield OPJ(
                    data_dir,
                    wildcards.lib_type,
                    analysis,
                    f"{library}.fastq.gz.md5")
    
    
    rule prepare_raw_data_lines:
        """Gather md5 checksums into one tsv per lib_type, formatted for the
        "RAW FILES" section of the submission spreadsheet."""
        input:
            md5s = lib_type2md5s,
        output:
            tsv = OPJ(data_dir, "{lib_type}", "raw.tsv")
        run:
            with open(output.tsv, "w") as tsv_file:
                for md5 in input.md5s:
                    with open(md5, "r") as md5_file:
                        # md5sum output: "<checksum>  <path>", one per line.
                        for line in md5_file:
                            try:
                                (md5sum, raw_path) = line.strip().split()
                            except ValueError:
                                # More than two fields — most likely a path
                                # containing whitespace; show the offending
                                # line before aborting.
                                print(line)
                                raise
                            tsv_file.write(f"{OPB(raw_path)}\tfastq\t{md5sum}\n")
    
    
    def get_link_for_analysis(wildcards):
        """Return the unique per-analysis fastq.gz link path for a library.

        Scans every analysis of the wildcard's lib_type and collects the link
        path of each analysis that declares the library. The paths embed the
        analysis name and dict keys are unique, so duplicates cannot arise
        within the scan; the meaningful check is that exactly one analysis
        owns the library.

        Raises:
            ValueError: if the library is declared in zero or in several
                analyses of this lib_type.
        """
        candidates = {
            OPJ(data_dir,
                wildcards.lib_type,
                analysis,
                f"{wildcards.library}.fastq.gz")
            for (analysis, analysis_info) in config["data"][wildcards.lib_type].items()
            if wildcards.library in analysis_info["libraries"]}
        if len(candidates) != 1:
            raise ValueError(
                f"Expected exactly one analysis declaring library "
                f"{wildcards.library}, found {len(candidates)}.")
        (link,) = candidates
        return link
    
    
    # TODO: check that no identical links can be produced once analysis names are not present
    rule create_link_for_lib_type:
        """Install, under the per-lib_type submission directory, a symlink to
        the per-analysis fastq.gz link selected by get_link_for_analysis."""
        input:
            link = get_link_for_analysis,
        output:
            link_in_lib_type = OPJ(data_dir, "{lib_type}", f"{submission_dir}", "{library}.fastq.gz"),
        run:
            os.symlink(os.path.abspath(input.link), output.link_in_lib_type)
    
    
    # TODO: use config["data"][lib_type][analysis]["config"] to match raw data and find corresponding bigwig file.
    # TODO: use this to detect fake replicates
    def find_libnames(wildcards):
        link = OPJ(data_dir, f"{wildcards.lib_type}", f"{submission_dir}", f"{wildcards.library}.fastq.gz")
        real_raw = os.readlink(link)
        analysis_config = yload(open(config["data"][wildcards.lib_type][wildcards.analysis]["config"], "r"))
        raw2libnames = defaultdict(list)
        for (cond, reps) in analysis_config["lib2raw"].items():
            for (rep, raw) in reps.items():
                if os.path.islink(raw):
                    warnings.warn(f"{input.raw} is a link.\n")
                    raw = os.readlink(raw)
                raw2libnames[raw].append(f"{cond}_{rep}")