Commit bbc48833 authored by Alexis  CRISCUOLO's avatar Alexis CRISCUOLO
Browse files

2.0

parent 677afd62
This diff is collapsed.
......@@ -34,8 +34,8 @@
# #
VERSION=2.0.210401ac #
# + estimates the distribution of the (expected) number of error(s) per HTS read #
# + adding option -r #
# + modified output tables #
# + faster running times with option -v t #
# #
# VERSION=1.0.210325ac #
# #
......@@ -67,8 +67,8 @@
AWK=awk;
GAWK=gawk; POSIX92=false;
[ $(command -v $GAWK) ] && AWK=$GAWK;
if [ ! $(command -v $AWK) ]; then echo " no $AWK detected " >&2 ; exit 1 ; fi
$AWK 'BEGIN{switch(IGNORECASE){case 0:a[0][0]=1;break}}' 2>/dev/null :
if [ ! $(command -v $AWK) ]; then echo " no $AWK detected " >&2 ; exit 1 ; fi
$AWK 'BEGIN{switch(IGNORECASE){case 0:a[0][0]=1;break}}' 2>/dev/null ;
[ $? -ne 0 ] && POSIX92=true; # standard awk
# #
# -- gzip (www.gzip.org) ---------------------------------------------------------------------------------- #
......@@ -153,8 +153,10 @@ mandoc() {
subsampling rate; when set to 1, 2, 3, 4 or 5, results are
based on 100%, ~33%, ~20%, ~15% or ~10% of all FASTQ blocks
(default: 5)
-v <char> reduced (r), full (f) or tab-delimited (t) result output
(default: r)
-v <char> reduced table (r), full table (f), or tab-delimited (t)
output (default: r)
-r residue content per position in output table (default: not
set)
-p <int> Phred quality offset (default: 33)
-d DOS end-of-lines in input file(s) (default: not set)
-t <int> number of thread(s) for decompressing files (default: 1)
......@@ -178,17 +180,19 @@ export LC_ALL=C;
OFFSET=33;
SPEED=5;
RESFREQ=0;
OUT="r"; # r=reduced f=full t=tab-delimited
NTHREADS=1;
CHECK=false;
DOS2UNIX=false;
ALTAWK="N.o.N.e";
while getopts p:s:t:v:a:cdh option
while getopts p:s:t:v:a:rcdh option
do
case $option in
p) OFFSET=$OPTARG ;;
p) OFFSET=$OPTARG ;;
s) SPEED=$OPTARG ;;
r) RESFREQ=1 ;;
v) OUT="$OPTARG" ;;
t) NTHREADS=$OPTARG ;;
c) CHECK=true ;;
......@@ -247,6 +251,7 @@ STEP=$(( 4 * $SPEED - 2 ));
[ $NTHREADS -lt 1 ] && NTHREADS=1;
[ $NTHREADS -gt $(nproc) ] && NTHREADS=$(nproc);
[ $NTHREADS -gt 2 ] && let NTHREADS--;
[ "$OUT" == "t" ] && RESFREQ=0;
TMP1=$(mktemp -t -p ${TMPDIR:-/tmp});
TMP2=$(mktemp -t -p ${TMPDIR:-/tmp});
trap "disown -a ; kill -9 $(jobs -pr) &> /dev/null ; rm -f $TMP1 $TMP2 &>/dev/null ; exit " SIGINT ;
......@@ -305,25 +310,32 @@ do
fi
$DOS2UNIX && READFILE="$READFILE | tr -d '\r'" ;
NEEDGC=1; [ "$OUT" == "t" ] && NEEDGC=0; ## option -v t: no need to estimate GC-content per position
if ! $POSIX92
then ############################################################################################################################ GNU AWK v4+
$READFILE |
$READFILE 2>/dev/null |
tee >( sed -n '4~4p' | LC_ALL=C wc -l -c > $TMP1 ) |
sed -n "2~$STEP p" |
$AWK -v offset=$OFFSET -v ngc=$NEEDGC '
$AWK -v offset=$OFFSET -v rf=$RESFREQ '
BEGIN{
FS="";c=32;while(++c<=126){c2q[sprintf("%c",c)]=(q=c-offset);q2p[q]=10^(-q/10)}
}
(c){ ## (c!=0)?read:phred
if(! ngc){++ld[NF];c=0;next}
if(! rf){++ld[NF];c=0;next}
++ld[c=NF];
while(c){($c~/[ATat]/)&&++at[c]||($c~/[CGcg]/)&&++gc[c];--c}
while(c){
switch($c){
case"A":case"a":++fa[c];break;
case"C":case"c":++fc[c];break;
case"G":case"g":++fg[c];break;
case"T":case"t":++ft[c];break;
case"N":case"n":++fn[c];break;
}
--c;
}
next; ## here, c==0
}
{
++ld[c=NF];sq=e=0;while(c){sq+=(q=c2q[$c]);e+=q2p[q];++qd[c][q];--c}
++ld[c=NF];sq=e=0;while(c){sq+=(q=c2q[$c]);e+=q2p[q];++qd[c][q];--c}
++qrd[int(0.5+sq/NF)];++erd[int(e)]; c=(++np); ## here, c!=0
}
END{
......@@ -336,72 +348,78 @@ do
anrm+=nrm[c];
}
printf"-------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"n Lfreq GCfreq Efreq Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf"----- ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"-----------------------------------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"n Lfreq pA pC pG pT pN Efreq Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf"----- ------ ------ ------ ------ ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
c=0;
printf("%-5s . .", c);
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s", x); ## expected 0 error freq
printf("%-5s . . . . . .", c);
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s", x); ## expected 0 error freq
out=" . . . ";q=-1;while(++q<=qmax)out=out" .";print out;
while(++c<=lmax){
printf("%-5s", c); ## n
x=((c in ld)?sprintf("%.2f", 100*ld[c]/(nr+np)):".");printf(" %6s", x); ## lgt freq
x=((c in gc)?sprintf("%.2f", 100*gc[c]/(at[c]+gc[c])):".");printf(" %6s", x); ## GC% at pos c
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s ", x); ## expected error freq
printf("%-5s", c); ## n
x=((c in ld)?sprintf("%.2f", 100*ld[c]/(nr+np)):".");printf(" %6s", x); ## lgt freq
fac=((c in fa)?fa[c]:0);fcc=((c in fc)?fc[c]:0);fgc=((c in fg)?fg[c]:0);
ftc=((c in ft)?ft[c]:0);fnc=((c in fn)?fn[c]:0);sfc=fac+fcc+fgc+ftc+fnc;
x=((c in fa)?sprintf("%.2f", 100*fac/sfc):".");printf(" %6s", x); ## A% at pos c
x=((c in fc)?sprintf("%.2f", 100*fcc/sfc):".");printf(" %6s", x); ## C% at pos c
x=((c in fg)?sprintf("%.2f", 100*fgc/sfc):".");printf(" %6s", x); ## G% at pos c
x=((c in ft)?sprintf("%.2f", 100*ftc/sfc):".");printf(" %6s", x); ## T% at pos c
x=((c in fn)?sprintf("%.2f", 100*fnc/sfc):".");printf(" %6s", x); ## N% at pos c
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s ", x); ## expected error freq
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
if(q in qd[c]){f=100*qd[c][q]/nrm[c];cdf+=f;x=sprintf("%.1f", f)}else{x="."}
out=out""sprintf(" %5s", x); ## Q freq at pos c
out=out""sprintf(" %5s", x); ## Q freq at pos c
(q1<0&&cdf>=25)&&q1=q;(q2<0&&cdf>=50)&&q2=q;(q3<0&&cdf>=75)&&q3=q;
}
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
}
printf"----- ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf" Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf" -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"Phred.per.base(B) ";
printf"----- ------ ------ ------ ------ ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf" Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf" -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"all.Phred(B) ";
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
if(q in qbd){f=100*qbd[q]/anrm;cdf+=f;x=sprintf("%.1f", f)}else{x="."}
out=out""sprintf(" %5s", x); ## Q freq
out=out""sprintf(" %5s", x); ## Q freq
(q1<0&&cdf>=25)&&q1=q;(q2<0&&cdf>=50)&&q2=q;(q3<0&&cdf>=75)&&q3=q;
}
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"avg.Phred.per.read(R) ";
printf"avg.Phred(R) ";
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
if(q in qrd){f=100*qrd[q]/np;cdf+=f;x=sprintf("%.1f", f)}else{x="."}
out=out""sprintf(" %5s", x); ## read freq per avg Q
out=out""sprintf(" %5s", x); ## read freq per avg Q
(q1<0&&cdf>=25)&&q1=q;(q2<0&&cdf>=50)&&q2=q;(q3<0&&cdf>=75)&&q3=q;
}
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"error.per.read(E) ";
printf"no.Errors(E) ";
q1=q2=q3=c=-1;cdf=0;
while(++c<=lmax)if(c in erd){cdf+=100*erd[c]/np;(q1<0&&cdf>=25)&&q1=c;(q2<0&&cdf>=50)&&q2=c;(q3<0&&cdf>=75)&&q3=c}
out="";q=-1;while(++q<=qmax)out=out" .";
out="";q=-1;while(++q<=qmax)out=out" ."; ## no. error per read
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"-------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"-----------------------------------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
print((nr<np)?np:nr);
}' > $TMP2 ;
else ############################################################################################################################ AWK POSIX 1003.2
$READFILE |
$READFILE 2>/dev/null |
tee >( sed -n '4~4p' | LC_ALL=C wc -l -c > $TMP1 ) |
sed -n "2~$STEP p" |
$AWK -v offset=$OFFSET -v ngc=$NEEDGC '
$AWK -v offset=$OFFSET -v rf=$RESFREQ '
BEGIN{
FS="";c=32;while(++c<=126){c2q[sprintf("%c",c)]=(q=c-offset);q2p[q]=10^(-q/10)}
}
(c){ ## (c!=0)?read:phred
if(! ngc){++ld[NF];c=0;next}
if(! rf){++ld[NF];c=0;next}
++ld[c=NF];
while(c){($c~/[ATat]/)&&++at[c]||($c~/[CGcg]/)&&++gc[c];--c}
while(c){((ch=$c)~/[Aa]/)&&++fa[c]||(ch~/[Cc]/)&&++fc[c]||(ch~/[Gg]/)&&++fg[c]||(ch~/[Tt]/)&&++ft[c]||++fn[c];--c}
next; ## here, c==0
}
{
......@@ -418,17 +436,23 @@ do
anrm+=nrm[c];
}
printf"-------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"n Lfreq GCfreq Efreq Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf"----- ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"-----------------------------------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"n Lfreq pA pC pG pT pN Efreq Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf"----- ------ ------ ------ ------ ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
c=0;
printf("%-5s . .", c);
printf("%-5s . . . . . .", c);
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s", x); ## expected 0 error freq
out=" . . . ";q=-1;while(++q<=qmax)out=out" .";print out;
while(++c<=lmax){
printf("%-5s", c); ## n
x=((c in ld)?sprintf("%.2f", 100*ld[c]/(nr+np)):".");printf(" %6s", x); ## lgt freq
x=((c in gc)?sprintf("%.2f", 100*gc[c]/(at[c]+gc[c])):".");printf(" %6s", x); ## GC% at pos c
fac=((c in fa)?fa[c]:0);fcc=((c in fc)?fc[c]:0);fgc=((c in fg)?fg[c]:0);
ftc=((c in ft)?ft[c]:0);fnc=((c in fn)?fn[c]:0);sfc=fac+fcc+fgc+ftc+fnc;
x=((c in fa)?sprintf("%.2f", 100*fac/sfc):".");printf(" %6s", x); ## A% at pos c
x=((c in fc)?sprintf("%.2f", 100*fcc/sfc):".");printf(" %6s", x); ## C% at pos c
x=((c in fg)?sprintf("%.2f", 100*fgc/sfc):".");printf(" %6s", x); ## G% at pos c
x=((c in ft)?sprintf("%.2f", 100*ftc/sfc):".");printf(" %6s", x); ## T% at pos c
x=((c in fn)?sprintf("%.2f", 100*fnc/sfc):".");printf(" %6s", x); ## N% at pos c
x=((c in erd)?sprintf("%.2f", 100*erd[c]/np):".");printf(" %6s ", x); ## expected error freq
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
......@@ -440,10 +464,10 @@ do
print q1" "q2" "q3" "out;
}
printf"----- ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf" Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf" -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"Phred.per.base(B) ";
printf"----- ------ ------ ------ ------ ------ ------ ------ -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf" Q1 Q2 Q3 Q= 0";q=0;while(++q<=qmax)printf(" %5s", q);print"";
printf" -- -- -- ------";q=0;while(++q<=qmax)printf" -----";print"";
printf"all.Phred(B) ";
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
if(q in qbd){f=100*qbd[q]/anrm;cdf+=f;x=sprintf("%.1f", f)}else{x="."}
......@@ -453,7 +477,7 @@ do
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"avg.Phred.per.read(R) ";
printf"avg.Phred(R) ";
cdf=0;out="";q1=q2=q3=q=-1;
while(++q<=qmax){
if(q in qrd){f=100*qrd[q]/np;cdf+=f;x=sprintf("%.1f", f)}else{x="."}
......@@ -463,14 +487,14 @@ do
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"error.per.read(E) ";
printf"no.Errors(E) ";
q1=q2=q3=c=-1;cdf=0;
while(++c<=lmax)
if(c in erd){cdf+=100*erd[c]/np;if(q1<0&&cdf>=25)q1=c;if(q2<0&&cdf>=50)q2=c;if(q3<0&&cdf>=75)q3=c}
out="";q=-1;while(++q<=qmax)out=out" .";
q1=sprintf("%2s", q1);q2=sprintf("%2s", q2);q3=sprintf("%2s", q3);
print q1" "q2" "q3" "out;
printf"-------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
printf"-----------------------------------------------------------------------";q=0;while(++q<=qmax)printf"------";print"";
print((nr<np)?np:nr);
}' > $TMP2 ;
fi
......@@ -491,17 +515,23 @@ do
echo "#avg.lgt(AL): $AL" ;
if [ "$OUT" == "r" ]
then
sed '$d' $TMP2 | cut -c1-36 ;
if [ $RESFREQ -ne 0 ]
then sed '$d' $TMP2 | cut -c1-64 ;
else sed '$d' $TMP2 | cut -c1-12,48-64 ;
fi
else
sed '$d' $TMP2 ;
if [ $RESFREQ -ne 0 ]
then sed '$d' $TMP2 ;
else sed '$d' $TMP2 | cut -c1-12,48- ;
fi
echo "#subsampling.rate: "$(bc -l <<<"scale=3;$SS/$NR" | sed 's/^0$/0.00/;s/^\./0./') ;
fi
echo ;
elif [ "$OUT" == "t" ]
then
BQ123="$(tail $TMP2 | grep -F "(B)" $TMP2 | cut -c28-36 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
RQ123="$(tail $TMP2 | grep -F "(R)" $TMP2 | cut -c28-36 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
EQ123="$(tail $TMP2 | grep -F "(E)" $TMP2 | cut -c28-36 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
BQ123="$(tail $TMP2 | grep -F "(B)" $TMP2 | cut -c48-64 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
RQ123="$(tail $TMP2 | grep -F "(R)" $TMP2 | cut -c48-64 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
EQ123="$(tail $TMP2 | grep -F "(E)" $TMP2 | cut -c48-64 | tr -s ' ' | sed 's/^ //' | tr ' ' '\t')";
echo -e "$INFILE\t$NR\t$NB\t$AL\t$BQ123\t$RQ123\t$EQ123" ;
fi
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment