From 93b919a864424b58b3fdb2cd93d8fbd015c6b3d2 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Tue, 9 Jan 2018 19:51:31 +0100
Subject: [PATCH] Script to demultiplex fastq using qualities.
Could probably be made faster using Cython, or an other language.
---
fastq-demultiplexing/Python/fq_demux.py | 126 ++++++++++++++++++++++++
1 file changed, 126 insertions(+)
create mode 100755 fastq-demultiplexing/Python/fq_demux.py
diff --git a/fastq-demultiplexing/Python/fq_demux.py b/fastq-demultiplexing/Python/fq_demux.py
new file mode 100755
index 0000000..f5a253e
--- /dev/null
+++ b/fastq-demultiplexing/Python/fq_demux.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+"""This program demultiplexes fastq data based on a given list of
+barcodes and a barcode start position. It tries to assign each fastq
+record to the most likely barcode, taking into account the sequence
+qualities (interpreted as being sanger-encoded)."""
+
+import argparse
+import fileinput
+import os
+from itertools import islice
+
+
+OPD = os.path.dirname
+OPJ = os.path.join
+
+
+def parse_fastq(input_stream):
+ """Generates fastq triplets for text *input_stream*."""
+ for line in input_stream:
+ name = line.strip().decode("utf-8")
+ seq = input_stream.readline().strip().decode("utf-8")
+ input_stream.readline()
+ qual = input_stream.readline().strip().decode("utf-8")
+ yield (name, seq, qual)
+
+
+def main():
+ """Main function of the program."""
+ parser = argparse.ArgumentParser(
+ description=__doc__,
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument(
+ "-i", "--input_fastq",
+ help="Fastq file to demultiplex.")
+ parser.add_argument(
+ "-o", "--output_dir",
+ help="Directory in which to put the demultiplexed fastq files.")
+ parser.add_argument(
+ "-b", "--barcodes",
+ help="Barcodes to which the reads should be attributed.",
+ nargs="*")
+ parser.add_argument(
+ "-s", "--barcode_start",
+ type=int,
+ required=True,
+ help="Position at which the barcode starts (1-based).")
+ parser.add_argument(
+ "-m", "--max_diff",
+ type=int,
+ default=3,
+ help="Only assign a record to one of the barcodes if it has "
+ "no more than *max_diff* differences in its barcode portion.")
+ args = parser.parse_args()
+ if args.output_dir:
+ out_dir = args.output_dir
+ os.makedirs(out_dir, exist_ok=True)
+ else:
+ if args.input_fastq is None:
+ out_dir = "."
+ else:
+ out_dir = OPD(args.input_fastq)
+ outfiles = {}
+ bc_len = None
+ for barcode in args.barcodes:
+ if bc_len is None:
+ bc_len = len(barcode)
+ else:
+ assert(len(barcode) == bc_len), f"len({barcode}) != {bc_len}"
+ outfiles[barcode] = open(OPJ(out_dir, f"{barcode}.fastq"), "w")
+ if not outfiles:
+ print("No barcode given.")
+ exit(0)
+ outfiles["undetermined"] = open(OPJ(
+ out_dir, f"undetermined.fastq"), "w")
+ bc_start = args.barcode_start - 1
+
+ def bc_prob_profile(seq, qual):
+ # return [(letter, 10 ** (-quality / 10))
+ return [(letter, 10 ** (-ord(quality) / 10))
+ for letter, quality
+ in islice(zip(seq, qual), bc_start, bc_start + bc_len)]
+
+ def barcode_prob_computer(seq, qual):
+ bpp = bc_prob_profile(seq, qual)
+
+ def prob(barcode):
+ bc_prob = 1
+ nb_diff = 0
+ for ((seq_letter, prob_err), bc_letter) in zip(bpp, barcode):
+ if seq_letter == bc_letter:
+ bc_prob *= (1 - prob_err)
+ else:
+ nb_diff += 1
+ bc_prob *= (prob_err / 3)
+ return bc_prob, nb_diff
+ return prob
+
+ if args.input_fastq is None:
+ fq_files = ("-",)
+ else:
+ fq_files = (args.input_fastq,)
+ with fileinput.input(files=fq_files,
+ openhook=fileinput.hook_compressed) as fq_in:
+ for (name, seq, qual) in parse_fastq(fq_in):
+ bc_prob = barcode_prob_computer(seq, qual)
+ (best_bc, best_prob) = ("", 0.0)
+ best_diff = bc_len
+ for barcode in args.barcodes:
+ (this_prob, this_diff) = bc_prob(barcode)
+ if this_prob > best_prob:
+ (best_bc,
+ best_prob,
+ best_diff) = (barcode, this_prob, this_diff)
+ bc_comments = f"{best_bc}:{best_prob}:{best_diff}"
+ fq_repr = f"{name} {bc_comments}\n{seq}\n+\n{qual}\n"
+ if best_diff <= args.max_diff:
+ outfiles[best_bc].write(fq_repr)
+ else:
+ outfiles["undetermined"].write(fq_repr)
+ for outfile in outfiles.values():
+ outfile.close()
+
+
+if __name__ == "__main__":
+ exit(main())
--
GitLab