From 622e57f27be511aa5db0855b040cebb6f7f6c58b Mon Sep 17 00:00:00 2001
From: criscuol <alexis.criscuolo@pasteur.fr>
Date: Sat, 13 Mar 2021 13:27:05 +0100
Subject: [PATCH] 2.0

---
 README.md      |  49 ++++++++++-
 contig_info.sh | 222 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 201 insertions(+), 70 deletions(-)

diff --git a/README.md b/README.md
index 46531a2..0f73f1a 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,12 @@
 # contig_info
 
 _contig_info_ is a command line program written in [Bash](https://www.gnu.org/software/bash/) for quickly estimating several standard descriptive statistics from FASTA-formatted contig files inferred by _de novo_ genome assembly methods.
-Estimated statistics are sequence number, residue counts, AT- and GC-content, sequence lengths, [auN](https://lh3.github.io/2020/04/08/a-new-metric-on-assembly-contiguity) (also called E-size, Salzberg et al. 2012), [N50](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) (Lander et al. 2001), [NG50](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) (Earl et al. 2011), and the related N75, NG75, N90, NG90, L50, LG50, L75, LG75, L90, LG90.
+Estimated statistics are sequence number, residue counts, AT- and GC-content, sequence lengths, [auN](https://lh3.github.io/2020/04/08/a-new-metric-on-assembly-contiguity) (also called E-size, Salzberg et al. 2012), [N50](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) (Lander et al. 2001), [NG50](https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics) (Earl et al. 2011), and the related N(G)75, N(G)G90, L(G)50, L(G)75, L(G)90.
+_contig_info_ can also estimates nucleotide content statistics for each contig sequence.
 
 ## Installation and execution
 
-Give the execute permission to the file `contig_info.sh` by typing:
+Give the execute permission to the file `contig_info.sh` by typing: 
 
 ```bash
 chmod +x contig_info.sh
@@ -29,6 +30,8 @@ Run _contig_info_ without option to read the following documentation:
                than this cutoff will be discarded (default: 1)
    -g <int>    expected genome size  for computing auNG  and {N,L}G{50,75,90}
                values instead of auN and {N,L}{50,75,90} ones, respectively
+   -r          residue content statistics for each contig sequence instead of 
+               global statistics
    -t          tab-delimited output
 ```
 
@@ -126,7 +129,7 @@ Mucor.WJ11.fasta      28368   24148    12884   5672     429  898  1455
 ```
 
 
-Finally, the option `-g` can be used to set an expected genome size for obtaining auNG and {N,L}G{50,75,90} statistics instead of auN and {N,L}{50,75,90} ones:
+The option `-g` can be used to set an expected genome size for obtaining auNG and {N,L}G{50,75,90} statistics instead of auN and {N,L}{50,75,90} ones:
 
 ```bash
 ./contig_info.sh  -t  -g 36000000  Mucor.*.fasta | cut -f1,22-
@@ -142,6 +145,46 @@ Mucor.WJ11.fasta      26055   21799    9865    2445     493  1092 2146 36000000
 ```
 
 
+Finally, the option `-r` enables to obtain the residue details for each sequence within each input file.
+Option `-r` can be used together with option `-t` to obtain a global view of the residue composition:
+
+```bash
+./contig_info.sh  -r  -t   Mucor.CBS277.49.fasta
+```
+
+```
+#File                  Seq                                                                                           Nres    A       C       G       T       N  %A    %C    %G    %T    %N  %AT   %GC    Pval
+Mucor.CBS277.49.fasta  AMYB01000001.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_01, whole genome shotgun sequence   6050249 1750309 1276313 1271843 1751784 0  28.92 21.09 21.02 28.95 0   57.89 42.11  1.0000
+Mucor.CBS277.49.fasta  AMYB01000002.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_02, whole genome shotgun sequence   5009828 1445835 1059454 1055351 1449188 0  28.85 21.14 21.06 28.92 0   57.79 42.21  0.000
+Mucor.CBS277.49.fasta  AMYB01000003.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_03, whole genome shotgun sequence   4868387 1404688 1027031 1026257 1410411 0  28.85 21.09 21.08 28.97 0   57.83 42.17  0.020
+Mucor.CBS277.49.fasta  AMYB01000004.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_04, whole genome shotgun sequence   4318338 1250156 914240  913373  1240569 0  28.94 21.17 21.15 28.72 0   57.68 42.32  0.000
+Mucor.CBS277.49.fasta  AMYB01000005.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_05, whole genome shotgun sequence   3239665 934794  681540  688359  934972  0  28.85 21.03 21.24 28.86 0   57.72 42.28  0.222
+Mucor.CBS277.49.fasta  AMYB01000006.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_06, whole genome shotgun sequence   3187354 921853  671173  669308  925020  0  28.92 21.05 20.99 29.02 0   57.95 42.05  0.626
+Mucor.CBS277.49.fasta  AMYB01000007.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_07, whole genome shotgun sequence   3096690 894782  653220  654088  894600  0  28.89 21.09 21.12 28.88 0   57.79 42.21  0.444
+Mucor.CBS277.49.fasta  AMYB01000008.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_08, whole genome shotgun sequence   2213752 637973  467830  470692  637257  0  28.81 21.13 21.26 28.78 0   57.61 42.39  0.526
+Mucor.CBS277.49.fasta  AMYB01000009.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_09, whole genome shotgun sequence   1074709 310418  227365  225234  311692  0  28.88 21.15 20.95 29.00 0   57.89 42.11  0.808
+Mucor.CBS277.49.fasta  AMYB01000010.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_10, whole genome shotgun sequence   976311  285831  206369  203542  280569  0  29.27 21.13 20.84 28.73 0   58.02 41.98  0.792
+Mucor.CBS277.49.fasta  AMYB01000011.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_11, whole genome shotgun sequence   934259  273240  195919  194769  270331  0  29.24 20.97 20.84 28.93 0   58.19 41.81  0.378
+Mucor.CBS277.49.fasta  AMYB01000012.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_12, whole genome shotgun sequence   832466  240427  175344  173907  242788  0  28.88 21.06 20.89 29.16 0   58.05 41.95  0.720
+Mucor.CBS277.49.fasta  AMYB01000013.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_13, whole genome shotgun sequence   423239  121227  88619   87831   125562  0  28.64 20.93 20.75 29.66 0   58.31 41.69  0.180
+Mucor.CBS277.49.fasta  AMYB01000014.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_14, whole genome shotgun sequence   155282  45402   31673   31509   46698   0  29.23 20.39 20.29 30.07 0   59.32 40.68  0.000
+Mucor.CBS277.49.fasta  AMYB01000015.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_15, whole genome shotgun sequence   97977   28739   20928   20035   28275   0  29.33 21.36 20.44 28.85 0   58.20 41.80  0.702
+Mucor.CBS277.49.fasta  AMYB01000016.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_16, whole genome shotgun sequence   41542   11324   9228    9406    11584   0  27.25 22.21 22.64 27.88 0   55.15 44.85  0.000
+Mucor.CBS277.49.fasta  AMYB01000017.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_17, whole genome shotgun sequence   17493   5063    3195    3919    5316    0  28.94 18.26 22.40 30.38 0   59.34 40.66  0.504
+Mucor.CBS277.49.fasta  AMYB01000018.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_18, whole genome shotgun sequence   11355   3216    2365    2256    3518    0  28.32 20.82 19.86 30.98 0   59.31 40.69  0.486
+Mucor.CBS277.49.fasta  AMYB01000019.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_19, whole genome shotgun sequence   9869    2972    2278    2160    2459    0  30.11 23.08 21.88 24.91 0   55.04 44.96  0.144
+Mucor.CBS277.49.fasta  AMYB01000020.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_21, whole genome shotgun sequence   4662    1534    815     946     1367    0  32.90 17.48 20.29 29.32 0   62.23 37.77  0.216
+Mucor.CBS277.49.fasta  AMYB01000021.1 Mucor lusitanicus CBS 277.49 MUCCIscaffold_22, whole genome shotgun sequence   4155    1247    1002    1116    790     0  30.01 24.11 26.85 19.01 0   49.03 50.97  0.036
+```
+
+Note that the last column `Pval` assesses the GC-content adequation between each contig and the longest one.
+When _Pval_ is close to 0, then the %GC of the corresponding contig is significantly different to the %GC of the longest one.
+This _p_-value can be used as an indicator when searching for particular replicons (e.g. plasmids, mitochondrion) or artefactual contigs, as such sequences often induce specific nucleotide compositions.
+Indeed, when considering a FASTA file outputted by a _de novo_ assembly program, the longest contig generally does not correspond to such replicon(s), therefore giving a good approximation of the expected GC-content within the hole chromosome.
+
+
+
+
 ## References
 
 Earl D, Bradnam K, St John J, Darling A, Lin D, Fass J, Yu HO, Buffalo V, Zerbino DR, Diekhans M, Nguyen N, Ariyaratne PN, Sung WK, Ning Z, Haimel M, Simpson JT, Fonseca NA, Birol Ä°, Docking TR, Ho IY, Rokhsar DS, Chikhi R, Lavenier D, Chapuis G, Naquin D, Maillet N, Schatz MC, Kelley DR, Phillippy AM, Koren S, Yang SP, Wu W, Chou WC, Srivastava A, Shaw TI, Ruby JG, Skewes-Cox P, Betegon M, Dimon MT, Solovyev V, Seledtsov I, Kosarev P, Vorobyev D, Ramirez-Gonzalez R, Leggett R, MacLean D, Xia F, Luo R, Li Z, Xie Y, Liu B, Gnerre S, MacCallum I, Przybylski D, Ribeiro FJ, Yin S, Sharpe T, Hall G, Kersey PJ, Durbin R, Jackman SD, Chapman JA, Huang X, DeRisi JL, Caccamo M, Li Y, Jaffe DB, Green RE, Haussler D, Korf I, Paten B (2011) _Assemblathon 1: a competitive assessment of de novo short read assembly methods_. **Genome Research**, 21(12):2224-2241. [doi:10.1101/gr.126599.111](https://genome.cshlp.org/content/21/12/2224).
diff --git a/contig_info.sh b/contig_info.sh
index c22e1c1..d279e13 100755
--- a/contig_info.sh
+++ b/contig_info.sh
@@ -4,7 +4,7 @@
 #                                                                                                            #
 #  contig_info: a BASH script to estimate standard statistics from FASTA contig files                        #
 #                                                                                                            #
-#  Copyright (C) 2018-2021  Institut Pasteur                                                                 #
+   COPYRIGHT="Copyright (C) 2018-2021 Institut Pasteur"                                                      #
 #                                                                                                            #
 #  This program  is free software:  you can  redistribute it  and/or modify it  under the terms  of the GNU  #
 #  General Public License as published by the Free Software Foundation, either version 3 of the License, or  #
@@ -33,7 +33,11 @@
 # = VERSIONS =                                                                                               #
 # ============                                                                                               #
 #                                                                                                            #
-  VERSION=1.1.201007ac                                                                                       #
+  VERSION=2.0.210312ac                                                                                       #
+# + adding option -r                                                                                         #
+# + adding trap when interrupting script                                                                     #
+#                                                                                                            #
+# VERSION=1.1.201007ac                                                                                       #
 # + estimating auN (also called E-size)                                                                      #
 #                                                                                                            #
 # VERSION=1.0.190426ac                                                                                       #
@@ -69,10 +73,9 @@
 #                                                                                                           #
 if [ "$1" = "-?" ] || [ $# -lt 1 ]
 then
+  echo -e "\n\033[1m contig_info v$VERSION           $COPYRIGHT\033[0m";
   cat <<EOF
 
- contig_info v.$VERSION         Copyright (C) 2018-2021  Institut Pasteur
-
  USAGE:  contig_info.sh  [options]  <contig_files> 
 
   where 'options' are:
@@ -81,11 +84,13 @@ then
                than this cutoff will be discarded (default: 1)
    -g <int>    expected genome size  for computing auNG  and {N,L}G{50,75,90}
                values instead of auN and {N,L}{50,75,90} ones, respectively
+   -r          residue content statistics for each contig sequence instead of 
+               global statistics
    -t          tab-delimited output
 
 EOF
-  exit                                                                                                      #
-fi                                                                                                          #
+  exit ;
+fi
 #                                                                                                           #
 #############################################################################################################
 
@@ -110,21 +115,29 @@ randomfile() {
 #### INITIALIZING PARAMETERS AND READING OPTIONS                                                         ####
 ####                                                                                                     ####
 #############################################################################################################
+export LC_ALL=C;
+
 MIN_CONTIG_LGT=1;
 GENOME_SIZE=0;
+RES_CONTENT=false;
 TSVOUT=false;
-while getopts m:g:t option
+
+while getopts m:g:tr option
 do
   case $option in
     m)  MIN_CONTIG_LGT=$OPTARG                            ;;
     g)  GENOME_SIZE=$OPTARG                               ;;
+    r)  RES_CONTENT=true                                  ;;
     t)  TSVOUT=true                                       ;;
     :)  echo "option $OPTARG : missing argument" ; exit 1 ;;
     \?) echo "$OPTARG : option invalide" ;         exit 1 ;;
   esac
 done
-if [ $MIN_CONTIG_LGT -lt 1 ]; then echo "   the min contig length threshold must be a positive integer (option -m)" ; exit 1 ; fi
-if [ $GENOME_SIZE -lt 0 ];    then echo "   the expected genome size must be a positive integer (option -g)" ;        exit 1 ; fi
+if ! [[ $MIN_CONTIG_LGT =~ ^[0-9]+$ ]]; then echo " incorrect value (option -m): $MIN_CONTIG_LGT"                           >&2 ; exit 1 ; fi
+if [ $MIN_CONTIG_LGT -lt 1 ];           then echo " the min contig length threshold must be a positive integer (option -m)" >&2 ; exit 1 ; fi
+if ! [[ $GENOME_SIZE =~ ^[0-9]+$ ]];    then echo " incorrect value (option -g): $GENOME_SIZE"                              >&2 ; exit 1 ; fi
+if [ $GENOME_SIZE -lt 0 ];              then echo " the expected genome size must be a positive integer (option -g)"        >&2 ; exit 1 ; fi
+
 
 #############################################################################################################
 ####                                                                                                     ####
@@ -133,81 +146,156 @@ if [ $GENOME_SIZE -lt 0 ];    then echo "   the expected genome size must be a p
 #############################################################################################################
 if $TSVOUT
 then
-  CSVCAPT="#File\tNseq\tNres\tA\tC\tG\tT\tN\t%A\t%C\t%G\t%T\t%N\t%AT\t%GC\tMin\tQ25\tMed\tQ75\tMax\tAvg\tauN\tN50\tN75\tN90\tL50\tL75\tL90";
-  [ $GENOME_SIZE -ne 0 ]&&CSVCAPT="$CSVCAPT\tExpSize";
+  if ! $RES_CONTENT
+  then
+    CSVCAPT="#File\tNseq\tNres\tA\tC\tG\tT\tN\t%A\t%C\t%G\t%T\t%N\t%AT\t%GC\tMin\tQ25\tMed\tQ75\tMax\tAvg\tauN\tN50\tN75\tN90\tL50\tL75\tL90";
+    [ $GENOME_SIZE -ne 0 ] && CSVCAPT="$CSVCAPT\tExpSize";
+  else
+    CSVCAPT="#File\tSeq\tNres\tA\tC\tG\tT\tN\t%A\t%C\t%G\t%T\t%N\t%AT\t%GC\tPval";
+  fi
   echo -e "$CSVCAPT" ;
 fi
+
 SEQS=$(randomfile); ## txt file with one sequence per line
+trap "echo -n interrupting ; rm -f $SEQS &>/dev/null ; echo ; exit " SIGINT
+
 for INFILE in "$@"
 do
   if [ ! -e $INFILE ] || [ ! -f $INFILE ] || [ ! -r $INFILE ]; then continue; fi
     
   tr -d '\15\32' < $INFILE | awk -v thr=$MIN_CONTIG_LGT '/^>/{if(length(s)>=thr)print s;s="";next}{s=s$0}END{if(length(s)>=thr)print s}' | tr '[:lower:]' '[:upper:]' > $SEQS ;
-  R=$(tr -d [:cntrl:] < $SEQS | wc -c); S=$(wc -l < $SEQS); AVG=$(bc -l<<<"scale=2;$R/$S" | sed 's/^\./0./');
-  A=$(tr -cd A < $SEQS | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^\./0./');
-  C=$(tr -cd C < $SEQS | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^\./0./');
-  G=$(tr -cd G < $SEQS | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^\./0./');
-  T=$(tr -cd T < $SEQS | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^\./0./');
-  N=$(tr -cd N < $SEQS | wc -c); fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^\./0./');
-  fGC=$(bc -l <<<"scale=2;100*($C+$G)/($A+$C+$G+$T)" | sed 's/^\./0./'); fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^\./0./');
-  ER=$R; [ $GENOME_SIZE != 0 ] && ER=$GENOME_SIZE;
-  STATS=$(awk '{print length}' $SEQS | sort -rn | awk -v g=$ER '{l[++n]=$0;aun+=$0*$0}END{g50=g/2;g75=3*g/4;g90=9*g/10;i=s=n50=n75=n90=0;while(++i<=n&&n90==0){s+=l[i];n50==0&&s>=g50&&n50=l[i]+(l50=i);n75==0&&s>=g75&&n75=l[i]+(l75=i);n90==0&&s>=g90&&n90=l[i]+(l90=i)}print (n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[int(n/4+1)]"\t"l[int(n/2+1)]"\t"l[int(3*n/4+1)]"\t"l[n]"\t"int(0.5+aun/g)}');
-  N50=$(cut -f1 <<<"$STATS");  N75=$(cut -f2 <<<"$STATS");  N90=$(cut -f3 <<<"$STATS"); 
-  L50=$(cut -f4 <<<"$STATS");  L75=$(cut -f5 <<<"$STATS");  L90=$(cut -f6 <<<"$STATS"); 
-  Q75=$(cut -f8 <<<"$STATS");  Q50=$(cut -f9 <<<"$STATS");  Q25=$(cut -f10 <<<"$STATS"); 
-  MIN=$(cut -f11 <<<"$STATS"); MAX=$(cut -f7 <<<"$STATS");  AUN=$(cut -f12 <<<"$STATS");
-
-  if ! $TSVOUT
+  
+  if ! $RES_CONTENT
   then
-    echo ;
-    echo "File                           $(basename $INFILE)" ;
-    echo ;
-    echo "Number of sequences            $S" ;
-    echo ;
-    echo "Residue counts:" ;
-    echo "  Number of A's                $A  $fA %" ;
-    echo "  Number of C's                $C  $fC %" ;
-    echo "  Number of G's                $G  $fG %" ;
-    echo "  Number of T's                $T  $fT %" ;
-    echo "  Number of N's                $N  $fN %" ;
-    echo "  Total                        $R" ;
-    echo ;
-    echo "  %AT                          $fAT %" ;
-    echo "  %GC                          $fGC %" ;
-    echo ;
-    echo "Sequence lengths:" ;
-    echo "  Minimum                      $MIN" ;
-    echo "  Quartile 25%                 $Q25" ;
-    echo "  Median                       $Q50" ;
-    echo "  Quartile 75%                 $Q75" ;
-    echo "  Maximum                      $MAX" ;
-    echo "  Average                      $AVG" ;
-    echo ;
-    echo "Contiguity statistics:" ;
-    echo "  auN                          $AUN" ;
-    echo "  N50                          $N50" ;
-    echo "  N75                          $N75" ;
-    echo "  N90                          $N90" ;
-    echo "  L50                          $L50" ;
-    echo "  L75                          $L75" ;
-    echo "  L90                          $L90" ;
-    if [ $GENOME_SIZE -ne 0 ]; then echo "  Expected genome size         $GENOME_SIZE"; fi
-    echo ;
-  else
-    CSVLINE="$(basename $INFILE)\t$S\t$R\t$A\t$C\t$G\t$T\t$N\t$fA%\t$fC%\t$fG%\t$fT%\t$fN%\t$fAT%\t$fGC%\t$MIN\t$Q25\t$Q50\t$Q75\t$MAX\t$AVG\t$AUN\t$N50\t$N75\t$N90\t$L50\t$L75\t$L90";
-    [ $GENOME_SIZE -ne 0 ]&&CSVLINE="$CSVLINE\t$ER";
-    echo -e "$CSVLINE" ;
+    R=$(tr -d [:cntrl:] < $SEQS | wc -c); S=$(wc -l < $SEQS); AVG=$(bc -l<<<"scale=2;$R/$S" | sed 's/^\./0./');
+    A=$(tr -cd A < $SEQS | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^\./0./');
+    C=$(tr -cd C < $SEQS | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^\./0./');
+    G=$(tr -cd G < $SEQS | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^\./0./');
+    T=$(tr -cd T < $SEQS | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^\./0./');
+    N=$(tr -cd N < $SEQS | wc -c); fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^\./0./');
+    ACGT=$(( $A + $C + $G + $T ));
+    fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^\./0./'); fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^\./0./');
+    ER=$R; [ $GENOME_SIZE != 0 ] && ER=$GENOME_SIZE;
+    STATS=$(awk '{print length}' $SEQS | sort -rn | awk -v g=$ER '{l[++n]=$0;aun+=$0*$0}
+                                                                  END{OFMT="%f";g50=g/2;g75=3*g/4;g90=9*g/10;i=s=n50=n75=n90=0;
+                                                                      while(++i<=n&&n90==0){s+=l[i];n50==0&&s>=g50&&n50=l[i]+(l50=i);n75==0&&s>=g75&&n75=l[i]+(l75=i);n90==0&&s>=g90&&n90=l[i]+(l90=i)}
+                                                                      n90==0&&n90=l[n]+(l90=n);n75==0&&n75=l[n]+(l75=n);n50==0&&n50=l[n]+(l50=n);
+                                                                      iq1=int(n/4+1);iq2=int(n/2+1);iq3=int(3*n/4+1);
+                                                                      print(n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[iq1]"\t"l[iq2]"\t"l[iq3]"\t"l[n]"\t"int(0.5+aun/g)}');
+    N50=$(cut -f1 <<<"$STATS");  N75=$(cut -f2 <<<"$STATS");  N90=$(cut -f3 <<<"$STATS"); 
+    L50=$(cut -f4 <<<"$STATS");  L75=$(cut -f5 <<<"$STATS");  L90=$(cut -f6 <<<"$STATS"); 
+    Q75=$(cut -f8 <<<"$STATS");  Q50=$(cut -f9 <<<"$STATS");  Q25=$(cut -f10 <<<"$STATS"); 
+    MIN=$(cut -f11 <<<"$STATS"); MAX=$(cut -f7 <<<"$STATS");  AUN=$(cut -f12 <<<"$STATS");
+
+    if ! $TSVOUT
+    then
+      echo ;
+      echo "File                           $(basename $INFILE)" ;
+      echo ;
+      echo "Number of sequences            $S" ;
+      echo ;
+      echo "Residue counts:" ;
+      echo "  Number of A's                $A  $fA %" ;
+      echo "  Number of C's                $C  $fC %" ;
+      echo "  Number of G's                $G  $fG %" ;
+      echo "  Number of T's                $T  $fT %" ;
+      echo "  Number of N's                $N  $fN %" ;
+      echo "  Total                        $R" ;
+      echo ;
+      echo "  %AT                          $fAT %" ;
+      echo "  %GC                          $fGC %" ;
+      echo ;
+      echo "Sequence lengths:" ;
+      echo "  Minimum                      $MIN" ;
+      echo "  Quartile 25%                 $Q25" ;
+      echo "  Median                       $Q50" ;
+      echo "  Quartile 75%                 $Q75" ;
+      echo "  Maximum                      $MAX" ;
+      echo "  Average                      $AVG" ;
+      echo ;
+      echo "Contiguity statistics:" ;
+      echo "  auN                          $AUN" ;
+      echo "  N50                          $N50" ;
+      echo "  N75                          $N75" ;
+      echo "  N90                          $N90" ;
+      echo "  L50                          $L50" ;
+      echo "  L75                          $L75" ;
+      echo "  L90                          $L90" ;
+      if [ $GENOME_SIZE -ne 0 ]; then echo "  Expected genome size         $GENOME_SIZE"; fi
+      echo ;
+    else
+      CSVLINE="$(basename $INFILE)\t$S\t$R\t$A\t$C\t$G\t$T\t$N\t$fA%\t$fC%\t$fG%\t$fT%\t$fN%\t$fAT%\t$fGC%\t$MIN\t$Q25\t$Q50\t$Q75\t$MAX\t$AVG\t$AUN\t$N50\t$N75\t$N90\t$L50\t$L75\t$L90";
+      [ $GENOME_SIZE -ne 0 ]&&CSVLINE="$CSVLINE\t$ER";
+      echo -e "$CSVLINE" ;
+    fi
+
+    continue ;
   fi
-done
 
-rm -f $SEQS ;
+  
+  ## residue details
 
-exit
+  lseq="$(awk '(length()>max){s=$0;max=length()}END{print s}' $SEQS)"; # lseq = the longest contig
+  wslseq="$(tr -cd ACGT <<<"$lseq" | tr ACGT WSSW)"; # the longest contig with only AT/CG replaced by W/S
+  tr -d '\15\32' < $INFILE | awk '!/^>/{s=s$0;next}(s!=""){print toupper(s);s=""}{print}END{print toupper(s)}' | paste - - > $SEQS ; ## tsv file: FASTA header \t contig
 
+  FTMP=$(randomfile); ## $REP GC% observed from substrings of length $R of the longest contig
+  trap "echo -n interrupting ; rm -f $SEQS $FTMP &>/dev/null ; echo ; exit " SIGINT
 
+  while IFS=$'\t' read -r fh seq
+  do
+    R=${#seq};
+    if [ $R -lt $MIN_CONTIG_LGT ]; then continue; fi
 
+    NAME=$(tr -d '>' <<<"$fh");
+    A=$(tr -cd A <<<"$seq" | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^\./0./');
+    C=$(tr -cd C <<<"$seq" | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^\./0./');
+    G=$(tr -cd G <<<"$seq" | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^\./0./');
+    T=$(tr -cd T <<<"$seq" | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^\./0./');
+    N=$(tr -cd N <<<"$seq" | wc -c); fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^\./0./');
+    ACGT=$(( $A + $C + $G + $T ));
+    fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^\./0./'); fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^\./0./');
 
+    REP=111; [ $R -gt 1000000 ] && REP=99; [ $R -gt 10000000 ] && REP=77;
+    [ "$seq" != "$lseq" ] && awk -v l=$R -v r=$REP '{srand(l-r);x=length()-l;while(--r>=0){s=substr($0,1+int(x*rand()),l);printf("%.6f\n",gsub("S","S",s)/length(s))}}' <<<"$wslseq" > $FTMP ;
+    gc=$(awk -v gc=$(( $C + $G )) -v acgt=$ACGT 'BEGIN{printf("%.6f\n",gc/acgt)}');
+    #echo "$gc   $(tr '\n' ' ' <$FTMP)" ;
+    [ -e $FTMP ] && pv=$(awk -v x=$gc '{++n;(x<=$0)&&++p}END{printf("%.3f",p/n)}' $FTMP) || pv=0.500;
+    pv=$(awk '{p=(($0<0.5)?0.5-$0:$0-0.5);printf("%.3f",1-2*p)}' <<<"$pv");
+    [ ! -e $FTMP ] && pv=$pv"0";
+    rm -f $FTMP ;
+    
+    if ! $TSVOUT
+    then
+      echo ;
+      echo "File                           $(basename $INFILE)" ;
+      echo ;
+      echo "Sequence                       $NAME" ;
+      echo ;
+      echo "Residue counts:" ;
+      echo "  Number of A's                $A  $fA %" ;
+      echo "  Number of C's                $C  $fC %" ;
+      echo "  Number of G's                $G  $fG %" ;
+      echo "  Number of T's                $T  $fT %" ;
+      echo "  Number of N's                $N  $fN %" ;
+      echo "  Total                        $R" ;
+      echo ;
+      echo "  %AT                          $fAT %" ;
+      echo "  %GC                          $fGC %" ;
+      echo ;
+      echo "Composition test p-value:      $pv" ;
+      echo ;
+    else
+      CSVLINE="$(basename $INFILE)\t$NAME\t$R\t$A\t$C\t$G\t$T\t$N\t$fA\t$fC\t$fG\t$fT\t$fN\t$fAT\t$fGC\t$pv";
+      echo -e "$CSVLINE" ;
+    fi
+  done < $SEQS
+  
+done
 
+rm -f $SEQS $FTMP ;
 
+exit ;
 
+  
  
-- 
GitLab