diff --git a/remove-duplicates-from-sorted-fastq/OCaml/dune b/remove-duplicates-from-sorted-fastq/OCaml/dune
index 2a640a4badf6da14404389d8f38e0e922acfb3db..62ea03d0120e475c6e83a2338e4ad749ca9c779f 100644
--- a/remove-duplicates-from-sorted-fastq/OCaml/dune
+++ b/remove-duplicates-from-sorted-fastq/OCaml/dune
@@ -1,3 +1,3 @@
 (executable
  (name remove_duplicates_from_sorted_fastq)
- (libraries batteries))
+ (libraries batteries stdio))
diff --git a/remove-duplicates-from-sorted-fastq/OCaml/remove_duplicates_from_sorted_fastq.ml b/remove-duplicates-from-sorted-fastq/OCaml/remove_duplicates_from_sorted_fastq.ml
index cc805ef280eb120bd7e3b4f6cc5be0fda0c0fda7..53d7566cf9f796ad4ecaa9d00be4b3eb0d4c96cb 100644
--- a/remove-duplicates-from-sorted-fastq/OCaml/remove_duplicates_from_sorted_fastq.ml
+++ b/remove-duplicates-from-sorted-fastq/OCaml/remove_duplicates_from_sorted_fastq.ml
@@ -1,4 +1,5 @@
-(*open Batteries*)
+(* open Stdio *)
+(* open Batteries *)
 (* Function ignoring the "stream count" argument (whatever that means)
    and wrapping lines in option.
 *)
@@ -80,6 +81,7 @@ print_fq {
   "BBBFFFFFFFFFFIIIFFFFFFFFFFFFFFFBFFFFB70''77'''000'")}
 *)
 
+(*
 let main () =
   let lines: string BatEnum.t = BatIO.lines_of BatIO.stdin in
   (*
@@ -127,5 +129,72 @@ let main () =
   let fused_fqs = BatEnum.from fuse_init_fqs in
   BatEnum.iter print_fq fused_fqs
+*)
+
+
+(** Lines gathered so far for one FASTQ record, which spans four lines. *)
+type line_group =
+  | Oneline of string
+  | Twolines of string * string
+  | Threelines of string * string * string
+  | Fourlines of string * string * string * string
+
+(** [make_fq group] turns a complete four-line group into a record, keeping
+    lines 1, 2 and 4 as [name], [seq] and [qual]; line 3 is ignored.
+    @raise Failure if [group] holds fewer than four lines. *)
+let make_fq = function
+  | Fourlines (l1, l2, _, l4) -> {name=l1; seq=l2; qual=l4}
+  | Oneline _ | Twolines _ | Threelines _ ->
+    failwith "Not enough lines for a fastq record"
+
+(* seems to work
+let main () =
+  let four_line_groups =
+    let accumulate_four_lines (acc: line_group list) (line: string) =
+      match acc with
+      | [] -> [Oneline line]
+      | (Oneline l1) :: tl -> (Twolines (l1, line)) :: tl
+      | (Twolines (l1, l2)) :: tl -> (Threelines (l1, l2, line)) :: tl
+      | (Threelines (l1, l2, l3)) :: tl -> (Fourlines (l1, l2, l3, line)) :: tl
+      | (Fourlines _) :: _ -> (Oneline line) :: acc
+    in
+    BatLazyList.of_list (Stdio.In_channel.fold_lines Stdio.In_channel.stdin ~init:[] ~f:accumulate_four_lines)
+  in
+  let fqs = BatLazyList.map make_fq four_line_groups in
+  BatLazyList.iter print_fq fqs
+*)
+
+(** Reads four-line records from standard input, merges each run of adjacent
+    records sharing the same [seq], and prints the result with [print_fq].
+    @raise Failure if the input ends in the middle of a record. *)
+let main () =
+  let four_line_groups =
+    (* Each group is consed onto the front, so this list is in reverse
+       input order. *)
+    let accumulate_four_lines (acc: line_group list) (line: string) =
+      match acc with
+      | [] -> [Oneline line]
+      | (Oneline l1) :: tl -> (Twolines (l1, line)) :: tl
+      | (Twolines (l1, l2)) :: tl -> (Threelines (l1, l2, line)) :: tl
+      | (Threelines (l1, l2, l3)) :: tl -> (Fourlines (l1, l2, l3, line)) :: tl
+      | (Fourlines _) :: _ -> (Oneline line) :: acc
+    in
+    BatLazyList.of_list (Stdio.In_channel.fold_lines Stdio.In_channel.stdin ~init:[] ~f:accumulate_four_lines)
+  in
+  let fqs = BatLazyList.map make_fq four_line_groups in
+  let fused_fqs: fastq BatLazyList.t =
+    (* The fold conses onto the front as well, which restores input order.
+       A merged record keeps the name of [fq], the one met first in the
+       input. *)
+    let fuse_fqs (fq_ll: fastq BatLazyList.t) (fq: fastq): fastq BatLazyList.t =
+      match BatLazyList.get fq_ll with
+      | None -> BatLazyList.cons fq fq_ll
+      | Some (fq2, tl) ->
+        if String.equal fq2.seq fq.seq
+        then BatLazyList.cons {name = fq.name; seq = fq2.seq; qual = best_quals fq2.qual fq.qual} tl
+        else BatLazyList.cons fq fq_ll
+    in BatLazyList.fold_left fuse_fqs BatLazyList.nil fqs
+  in
+  BatLazyList.iter print_fq fused_fqs
 
 let () =
   main ()