From 06e5b121b88ab131176f8e65ecb85e79905f6361 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 14 Dec 2018 18:45:57 +0100
Subject: [PATCH] Read lengths should not come from first read.

The read length to indicate in the data submission spreadsheet should be
the number of sequencing cycles. Some reads may be shorter than this.
---
 Data_submission/Data_submission.snakefile | 24 ++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/Data_submission/Data_submission.snakefile b/Data_submission/Data_submission.snakefile
index b8bd633..832d91a 100644
--- a/Data_submission/Data_submission.snakefile
+++ b/Data_submission/Data_submission.snakefile
@@ -234,10 +234,11 @@ instrument2model = {
 
 def fq_info(fqgz):
     """Determines information about the source and content of a fastq file.
+    Based on the header of the first read in the file.
     Currently only works with some specific Illumina-generated files."""
     with gzopen(fqgz) as fq_file:
         [seq_id, descr] = fq_file.readline().strip().decode("utf-8").split(" ")
-        read_len = len(fq_file.readline().strip().decode("utf-8"))
+        # read_len = len(fq_file.readline().strip().decode("utf-8"))
     try:
         [instrument, run, flowcell, lane, tile, x, y, mate] = seq_id.split(":")
     except ValueError as err:
@@ -245,9 +246,11 @@ def fq_info(fqgz):
         mate = None
     model = instrument2model[instrument]
     if mate is None:
-        return (model, str(read_len), "single")
+        #return (model, str(read_len), "single")
+        return (model, "single")
     else:
-        return (model, str(read_len), "paired-end")
+        #return (model, str(read_len), "paired-end")
+        return (model, "paired-end")
 
 
 def lib_type2md5s(wildcards):
@@ -261,17 +264,27 @@ def lib_type2md5s(wildcards):
                 f"{library}.fastq.gz.md5")
 
 
+def lib_type2read_lens(wildcards):
+    """Find the read lengths (number of sequencing cycles) corresponding to raw data defined by *wildcards*."""
+    for (analysis, analysis_info) in data_info[wildcards.libtype].items():
+        # Loop to have as many read lengths as there are md5 files (from lib_type2md5s)
+        for _ in analysis_info["libraries"]:
+            yield analysis_info["read_len"]
+
+
 rule prepare_raw_data_lines:
     input:
         md5s = lib_type2md5s,
     output:
         tsv = OPJ(data_dir, "{libtype}", "raw.tsv")
+    params:
+        read_lens = lib_type2read_lens,
     message:
         """Preparing info to copy-paste in the RAW FILES section of the submission spreadsheet in:
         {data_dir}/{wildcards.libtype}/raw.tsv"""
     run:
         with open(output.tsv, "w") as tsv_file:
-            for md5_filename in input.md5s:
+            for (md5_filename, read_len) in zip(input.md5s, params.read_lens):
                 with open(md5_filename, "r") as md5_file:
                     for line in md5_file:
                         try:
@@ -281,7 +294,8 @@ rule prepare_raw_data_lines:
                                 "There is an issue with the following line from {md5_filename}:\n")
                             print(line)
                             raise
-                        (model, read_len, pairness) = fq_info(raw_path)
+                        # (model, read_len, pairness) = fq_info(raw_path)
+                        (model, pairness) = fq_info(raw_path)
                         tsv_file.write(
                             f"{OPB(raw_path)}\tfastq\t{md5sum}\t{model}\t{read_len}\t{pairness}\n")
 
-- 
GitLab