Read lengths should not come from the first read.

The read length reported in the data submission spreadsheet should be the number of sequencing cycles, not the length of the first read in the file: some reads may be shorter than the number of cycles.

Read lengths should not come from the first read.
06e5b121 · Blaise Li · 16b41b3f · 06e5b121
Commit 06e5b121 authored 6 years ago by Blaise Li
--- a/Data_submission/Data_submission.snakefile
+++ b/Data_submission/Data_submission.snakefile
@@ -234,10 +234,11 @@ instrument2model = {
 def fq_info(fqgz):
    """Determines information about the source and content of a fastq file.
+    Based on the header of the first read in the file.
    Currently only works with some specific Illumina-generated files."""
    with gzopen(fqgz) as fq_file:
        [seq_id, descr] = fq_file.readline().strip().decode("utf-8").split(" ")
-        read_len = len(fq_file.readline().strip().decode("utf-8"))
+        # read_len = len(fq_file.readline().strip().decode("utf-8"))
    try:
        [instrument, run, flowcell, lane, tile, x, y, mate] = seq_id.split(":")
    except ValueError as err:
@@ -245,9 +246,11 @@ def fq_info(fqgz):
        mate = None
    model = instrument2model[instrument]
    if mate is None:
-        return (model, str(read_len), "single")
+        #return (model, str(read_len), "single")
+        return (model, "single")
    else:
-        return (model, str(read_len), "paired-end")
+        #return (model, str(read_len), "paired-end")
+        return (model, "paired-end")
 def lib_type2md5s(wildcards):
@@ -261,17 +264,27 @@ def lib_type2md5s(wildcards):
                f"{library}.fastq.gz.md5")
+def lib_type2read_lens(wildcards):
+    """Find the read lengths (number of sequencing cycles) corresponding to raw data defined by *wildcards*."""
+    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
+        # Loop to have as many read lengths as there are md5 files (from lib_type2md5s)
+        for _ in analysis_info["libraries"]:
+            yield analysis_info["read_len"]
 rule prepare_raw_data_lines:
    input:
        md5s = lib_type2md5s,
    output:
        tsv = OPJ(data_dir, "{libtype}", "raw.tsv")
+    params:
+        read_lens = lib_type2read_lens,
    message:
        """Preparing info to copy-paste in the RAW FILES section of the submission spreadsheet in:
        {data_dir}/{wildcards.libtype}/raw.tsv"""
    run:
        with open(output.tsv, "w") as tsv_file:
-            for md5_filename in input.md5s:
+            for (md5_filename, read_len) in zip(input.md5s, params.read_lens):
                with open(md5_filename, "r") as md5_file:
                    for line in md5_file:
                        try:
@@ -281,7 +294,8 @@ rule prepare_raw_data_lines:
                                "There is an issue with the following line from {md5_filename}:\n")
                            print(line)
                            raise
-                        (model, read_len, pairness) = fq_info(raw_path)
+                        # (model, read_len, pairness) = fq_info(raw_path)
+                        (model, pairness) = fq_info(raw_path)
                        tsv_file.write(
                            f"{OPB(raw_path)}\tfastq\t{md5sum}\t{model}\t{read_len}\t{pairness}\n")