Skip to content
Snippets Groups Projects
Select Git revision
  • 4fb8a6f1ddeea0ffc18d610b869b46d9fcba2d7b
  • master default protected
2 results

Data_submission.snakefile

Blame
  • Data_submission.snakefile 9.12 KiB
    """
    <https://www.ncbi.nlm.nih.gov/geo/info/submissionftp.html>
    Overview
    Back to top
    
    This document contains details about using FTP to transfer your files to GEO.
    
        You must be logged in to your GEO account to see the GEO FTP server credentials below.
        Gather all required submission files prepared according to the Hints and tips below. Your transfer should include all required components (raw data files, processed data files and metadata spreadsheet). Start at the Submitting data page for full submission requirements.
        On your computer, create a folder named using your GEO username (/cecerelab). Put all required submission files into this folder.
        Transfer the folder to the GEO FTP server using the credentials below. Do not transfer files unless you are confident that you have a complete submission that includes all required components (raw data files, processed data files and metadata spreadsheet).
        After the FTP transfer is complete, you must notify GEO using the Submit to GEO web form. We cannot start processing your submission until the transfer is complete and we have received all required components.
    
    Hints and tips
    Back to top
    
        Please contact us in advance if your submission exceeds 1 terabyte in size. Do not initiate transfer until you hear back from GEO.
        Your upload should include three components: (1) raw data files, (2) processed data files, and (3) completed Metadata Template. Start at the Submitting data page for full submission requirements.
        DO NOT bundle files into archives (.zip/.rar/.tar/etc) before transferring (OK for smaller microarray submissions). For next-generation sequencing submissions, and larger microarray submissions (exceeding 4GB), we recommend that you place all submission files into a single directory named according to your GEO username, and then recursively transfer the directory to our FTP server (see below examples).
        Avoid whitespace and special characters in file names. Use only alphanumerals [A-Z, a-z, 0-9], underscores [_] and dots [.].
        File names should be unique and specific. Generic names (e.g. ‘S_1_1.fq’) should be avoided to prevent overwriting identically-named files.
        For large, non-binary raw data files (e.g. FASTQ) we recommend (but do not require) gzip (*.gz) or bzip2 (*.bz2) compression to speed transfer.
        DO NOT use gz- or bzip2-compression on binary files (.BigWig, .bw, .bigBed, .bb, .h5, .bam, .tdf etc).
        For high-throughput sequencing submissions, we recommend providing the MD5 checksums for the files that you are uploading (details below).
        Please use 'passive mode' when transferring files. Optimal buffer size is ~32 MB.
        File names on the server are visible to other submitters, but content is accessible only to GEO curators.
        The FTP server is a temporary storage space. Files will be moved by curators to an internal location for processing and assigning of accessions.
        You will not be able to rename or remove files after uploading. Please contact us if you need assistance.
        Files deposited on the FTP site are not displayed under 'My Submissions' on the web interface. The web interface only displays accessioned submissions.
        Un-announced files will be automatically deleted from the server after two weeks.
    
    FTP server credentials
    Back to top
    host	ftp-private.ncbi.nlm.nih.gov
    username	geo
    password	33%9uyj_fCh?M16H
    url	ftp://geo:33%259uyj_fCh%3FM16H@ftp-private.ncbi.nlm.nih.gov
    
    Note: all submitters use the same login to access the FTP server (you cannot use your GEO username/password to connect to the FTP server).
    SECURITY NOTE (review): the shared FTP credentials above are committed to version control; consider rotating them and moving them to an untracked configuration file.
    """
    import sys


    # Abort early on unsupported interpreters: the f-strings used throughout
    # this file require Python >= 3.6. `major` and `minor` stay bound at
    # module level for compatibility with any later use.
    major, minor = sys.version_info[:2]
    if (major, minor) < (3, 6):
        sys.exit("Need at least python 3.6\n")
    
    
    import os
    import warnings

    # Short aliases for the path helpers used throughout this file.
    OPB = os.path.basename
    OPJ = os.path.join
    
    
    def formatwarning(message, category, filename, lineno, line=None):
        """Format a warning as a single "<file>:<line>: <Category>: <message>" line.

        The signature matches `warnings.formatwarning` so this function can be
        assigned as a drop-in replacement; `line` is accepted but ignored.
        """
        return f"{filename}:{lineno}: {category.__name__}: {message}\n"
    
    
    # Use the terse one-line formatter defined above for all warnings
    # emitted by this workflow.
    warnings.formatwarning = formatwarning

    from pathlib import Path
    from yaml import load as yload
    from collections import defaultdict
    
    
    # `config` is presumably injected by snakemake (configfile/--configfile).
    ref_info = config["ref"]
    # Base directory under which submission data is assembled.
    data_dir = ref_info["paper"]
    # Name of the per-lib_type sub-directory holding the links to upload.
    submission_dir = ref_info["NCBI_submitter"]

    # lib_type -> analysis -> {"libraries": {library: raw_data_path}, ...}
    data_info = config["data"]
    LIB_TYPES = list(data_info.keys())
    
    
    def determine_fqgz_and_md5file():
        """Yield (submission_link, fastq_gz, md5) path triples for every library.

        For each library declared in the config, yields:
          - the symlink path to create under the per-lib_type submission dir,
          - the per-analysis fastq.gz link path,
          - the path of its .md5 checksum file.

        Raises:
            FileNotFoundError: if a configured raw data file does not exist.
            ValueError: if two libraries would produce the same submission link.
        """
        links_in_lib_type = set()
        for (lib_type, analyses) in data_info.items():
            for (analysis, analysis_info) in analyses.items():
                for (library, raw_data) in analysis_info["libraries"].items():
                    # Validate eagerly so a missing raw file aborts before any
                    # target is scheduled (a bare assert would vanish under -O).
                    if not Path(raw_data).exists():
                        raise FileNotFoundError(f"Raw data not found: {raw_data}")
                    link_in_lib_type = OPJ(
                        data_dir, lib_type, submission_dir, f"{library}.fastq.gz")
                    if link_in_lib_type in links_in_lib_type:
                        raise ValueError(f"Name conflict for {link_in_lib_type}.")
                    links_in_lib_type.add(link_in_lib_type)
                    yield (
                        link_in_lib_type,
                        OPJ(data_dir, lib_type, analysis, f"{library}.fastq.gz"),
                        OPJ(data_dir, lib_type, analysis, f"{library}.fastq.gz.md5"))
    
    
    rule all:
        input:
            # tsv files with content to copy-paste in the "RAW FILES" section of the submission spreadsheet
            expand(OPJ(data_dir, "{lib_type}", "raw.tsv"), lib_type=LIB_TYPES),
            # Transpose the generator's (submission_link, fqgz, md5) triples
            # into three tuples so every submission link, fastq.gz link and
            # md5 file is requested as a final target.
            list(zip(*determine_fqgz_and_md5file())),
    
    
    def lib2data(wildcards):
        """Input function: look up a library's raw data path in the config."""
        lib_type = wildcards.lib_type
        analysis = wildcards.analysis
        libraries = data_info[lib_type][analysis]["libraries"]
        return libraries[wildcards.library]
    
    
    rule link_raw_data:
        """This rule installs the raw data in a local directory using symlinks.
        The location of the original files is taken from the configuration."""
        input:
            raw = lib2data,
        output:
            link = OPJ(data_dir, "{lib_type}", "{analysis}", "{library}.fastq.gz")
        wildcard_constraints:
            # Exclude the submission directory: links there are created by
            # create_link_for_lib_type, not by this rule.
            analysis = f"(?!{submission_dir}).*"
        message:
            "Making link {output.link} to raw data {input.raw}."
        run:
            # Ensure links are resolved in order to be able to match
            #if os.path.islink(input.raw):
            #    warnings.warn(f"{input.raw} is a link.\n")
            #    raw = os.readlink(input.raw)
            #else:
            #    raw = input.raw
            # NOTE(review): os.symlink fails with FileExistsError if the link
            # already exists; snakemake normally removes stale outputs first.
            os.symlink(os.path.abspath(input.raw), output.link)
    
    
    rule compute_md5sum:
        """Compute the md5 checksum of a raw-data link, for inclusion in the
        submission (GEO recommends providing MD5 checksums for uploads)."""
        input:
            link = rules.link_raw_data.output.link,
        output:
            md5 = OPJ(data_dir, "{lib_type}", "{analysis}", "{library}.fastq.gz.md5")
        message:
            "Computing md5sum for {input.link}."
        shell:
            """
            md5sum {input.link} > {output.md5}
            """
    
    
    def lib_type2md5s(wildcards):
        """Input function for `prepare_raw_data_lines`.

        Yields the path of every md5 checksum file belonging to the
        wildcard's lib_type, across all of its analyses.
        """
        for (analysis, analysis_info) in config["data"][wildcards.lib_type].items():
            for library in analysis_info["libraries"]:
                yield OPJ(
                    data_dir,
                    wildcards.lib_type,
                    analysis,
                    f"{library}.fastq.gz.md5")
    
    
    rule prepare_raw_data_lines:
        """Gather md5 checksums into one tsv per lib_type, formatted for the
        "RAW FILES" section of the submission spreadsheet."""
        input:
            md5s = lib_type2md5s,
        output:
            tsv = OPJ(data_dir, "{lib_type}", "raw.tsv")
        run:
            with open(output.tsv, "w") as tsv_file:
                for md5 in input.md5s:
                    with open(md5, "r") as md5_file:
                        # md5sum output: "<checksum>  <path>", one per line.
                        for line in md5_file:
                            try:
                                (md5sum, raw_path) = line.strip().split()
                            except ValueError:
                                # More than two fields — most likely a path
                                # containing whitespace; show the offending
                                # line before aborting.
                                print(line)
                                raise
                            tsv_file.write(f"{OPB(raw_path)}\tfastq\t{md5sum}\n")
    
    
    def get_link_for_analysis(wildcards):
        """Return the unique per-analysis fastq.gz link path for a library.

        Scans every analysis of the wildcard's lib_type and collects the link
        path of each analysis that declares the library. The paths embed the
        analysis name and dict keys are unique, so duplicates cannot arise
        within the scan; the meaningful check is that exactly one analysis
        owns the library.

        Raises:
            ValueError: if the library is declared in zero or in several
                analyses of this lib_type.
        """
        candidates = {
            OPJ(data_dir,
                wildcards.lib_type,
                analysis,
                f"{wildcards.library}.fastq.gz")
            for (analysis, analysis_info) in config["data"][wildcards.lib_type].items()
            if wildcards.library in analysis_info["libraries"]}
        if len(candidates) != 1:
            raise ValueError(
                f"Expected exactly one analysis declaring library "
                f"{wildcards.library}, found {len(candidates)}.")
        (link,) = candidates
        return link
    
    
    # TODO: check that no identical links can be produced once analysis names are not present
    rule create_link_for_lib_type:
        """Install, under the per-lib_type submission directory, a symlink to
        the per-analysis fastq.gz link selected by get_link_for_analysis."""
        input:
            link = get_link_for_analysis,
        output:
            link_in_lib_type = OPJ(data_dir, "{lib_type}", f"{submission_dir}", "{library}.fastq.gz"),
        run:
            os.symlink(os.path.abspath(input.link), output.link_in_lib_type)
    
    
    # TODO: use config["data"][lib_type][analysis]["config"] to match raw data and find corresponding bigwig file.
    # TODO: use this to detect fake replicates
    def find_libnames(wildcards):
        link = OPJ(data_dir, f"{wildcards.lib_type}", f"{submission_dir}", f"{wildcards.library}.fastq.gz")
        real_raw = os.readlink(link)
        analysis_config = yload(open(config["data"][wildcards.lib_type][wildcards.analysis]["config"], "r"))
        raw2libnames = defaultdict(list)
        for (cond, reps) in analysis_config["lib2raw"].items():
            for (rep, raw) in reps.items():
                if os.path.islink(raw):
                    warnings.warn(f"{input.raw} is a link.\n")
                    raw = os.readlink(raw)
                raw2libnames[raw].append(f"{cond}_{rep}")