Skip to content
Snippets Groups Projects
Commit 06e5b121 authored by Blaise Li's avatar Blaise Li
Browse files

Read lengths should not come from first read.

The read length to indicate in the data submission spreadsheet should be
the number of sequencing cycles. Some reads may be shorter than this.
parent 16b41b3f
No related branches found
No related tags found
No related merge requests found
...@@ -234,10 +234,11 @@ instrument2model = { ...@@ -234,10 +234,11 @@ instrument2model = {
def fq_info(fqgz): def fq_info(fqgz):
"""Determines information about the source and content of a fastq file. """Determines information about the source and content of a fastq file.
Based on the header of the first read in the file.
Currently only works with some specific Illumina-generated files.""" Currently only works with some specific Illumina-generated files."""
with gzopen(fqgz) as fq_file: with gzopen(fqgz) as fq_file:
[seq_id, descr] = fq_file.readline().strip().decode("utf-8").split(" ") [seq_id, descr] = fq_file.readline().strip().decode("utf-8").split(" ")
read_len = len(fq_file.readline().strip().decode("utf-8")) # read_len = len(fq_file.readline().strip().decode("utf-8"))
try: try:
[instrument, run, flowcell, lane, tile, x, y, mate] = seq_id.split(":") [instrument, run, flowcell, lane, tile, x, y, mate] = seq_id.split(":")
except ValueError as err: except ValueError as err:
...@@ -245,9 +246,11 @@ def fq_info(fqgz): ...@@ -245,9 +246,11 @@ def fq_info(fqgz):
mate = None mate = None
model = instrument2model[instrument] model = instrument2model[instrument]
if mate is None: if mate is None:
return (model, str(read_len), "single") #return (model, str(read_len), "single")
return (model, "single")
else: else:
return (model, str(read_len), "paired-end") #return (model, str(read_len), "paired-end")
return (model, "paired-end")
def lib_type2md5s(wildcards): def lib_type2md5s(wildcards):
...@@ -261,17 +264,27 @@ def lib_type2md5s(wildcards): ...@@ -261,17 +264,27 @@ def lib_type2md5s(wildcards):
f"{library}.fastq.gz.md5") f"{library}.fastq.gz.md5")
def lib_type2read_lens(wildcards):
"""Find the read lengths (number of sequencing cycles) corresponding to raw data defined by *wildcards*."""
for (analysis, analysis_info) in data_info[wildcards.libtype].items():
# Loop to have as many read lengths as there are md5 files (from lib_type2md5s)
for _ in analysis_info["libraries"]:
yield analysis_info["read_len"]
rule prepare_raw_data_lines: rule prepare_raw_data_lines:
input: input:
md5s = lib_type2md5s, md5s = lib_type2md5s,
output: output:
tsv = OPJ(data_dir, "{libtype}", "raw.tsv") tsv = OPJ(data_dir, "{libtype}", "raw.tsv")
params:
read_lens = lib_type2read_lens,
message: message:
"""Preparing info to copy-paste in the RAW FILES section of the submission spreadsheet in: """Preparing info to copy-paste in the RAW FILES section of the submission spreadsheet in:
{data_dir}/{wildcards.libtype}/raw.tsv""" {data_dir}/{wildcards.libtype}/raw.tsv"""
run: run:
with open(output.tsv, "w") as tsv_file: with open(output.tsv, "w") as tsv_file:
for md5_filename in input.md5s: for (md5_filename, read_len) in zip(input.md5s, params.read_lens):
with open(md5_filename, "r") as md5_file: with open(md5_filename, "r") as md5_file:
for line in md5_file: for line in md5_file:
try: try:
...@@ -281,7 +294,8 @@ rule prepare_raw_data_lines: ...@@ -281,7 +294,8 @@ rule prepare_raw_data_lines:
"There is an issue with the following line from {md5_filename}:\n") "There is an issue with the following line from {md5_filename}:\n")
print(line) print(line)
raise raise
(model, read_len, pairness) = fq_info(raw_path) # (model, read_len, pairness) = fq_info(raw_path)
(model, pairness) = fq_info(raw_path)
tsv_file.write( tsv_file.write(
f"{OPB(raw_path)}\tfastq\t{md5sum}\t{model}\t{read_len}\t{pairness}\n") f"{OPB(raw_path)}\tfastq\t{md5sum}\t{model}\t{read_len}\t{pairness}\n")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment