Skip to content
Snippets Groups Projects
Commit 42ecf6e7 authored by Alexis  CRISCUOLO's avatar Alexis CRISCUOLO :black_circle:
Browse files

2.0

parent 622e57f2
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
......@@ -33,7 +33,7 @@
# = VERSIONS = #
# ============ #
# #
VERSION=2.0.210312ac #
VERSION=2.0.210315ac #
# + adding option -r #
# + adding trap when interrupting script #
# #
......@@ -157,7 +157,7 @@ then
fi
SEQS=$(randomfile); ## txt file with one sequence per line
trap "echo -n interrupting ; rm -f $SEQS &>/dev/null ; echo ; exit " SIGINT
trap "echo -n interrupting ... ; rm -f $SEQS &>/dev/null ; echo ; exit " SIGINT
for INFILE in "$@"
do
......@@ -165,26 +165,35 @@ do
tr -d '\15\32' < $INFILE | awk -v thr=$MIN_CONTIG_LGT '/^>/{if(length(s)>=thr)print s;s="";next}{s=s$0}END{if(length(s)>=thr)print s}' | tr '[:lower:]' '[:upper:]' > $SEQS ;
A=$(tr -cd A < $SEQS | wc -c);
C=$(tr -cd C < $SEQS | wc -c);
G=$(tr -cd G < $SEQS | wc -c);
T=$(tr -cd T < $SEQS | wc -c);
ACGT=$(( $A + $C + $G + $T ));
if ! $RES_CONTENT
then
R=$(tr -d [:cntrl:] < $SEQS | wc -c); S=$(wc -l < $SEQS); AVG=$(bc -l<<<"scale=2;$R/$S" | sed 's/^\./0./');
A=$(tr -cd A < $SEQS | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^\./0./');
C=$(tr -cd C < $SEQS | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^\./0./');
G=$(tr -cd G < $SEQS | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^\./0./');
T=$(tr -cd T < $SEQS | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^\./0./');
N=$(tr -cd N < $SEQS | wc -c); fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^\./0./');
ACGT=$(( $A + $C + $G + $T ));
fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^\./0./'); fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^\./0./');
R=$(tr -d [:cntrl:] < $SEQS | wc -c);
S=$(wc -l < $SEQS);
AVG=$(bc -l<<<"scale=2;$R/$S" | sed 's/^\./0./');
fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^0$/0.00/;s/^\./0./');
fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^0$/0.00/;s/^\./0./');
fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^0$/0.00/;s/^\./0./');
fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^0$/0.00/;s/^\./0./');
N=0; [ $R -ne $ACGT ] && N=$(tr -cd N < $SEQS | wc -c);
fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^0$/0.00/;s/^\./0./');
fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^0$/0.00/;s/^\./0./');
fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^0$/0.00/;s/^\./0./');
ER=$R; [ $GENOME_SIZE != 0 ] && ER=$GENOME_SIZE;
STATS=$(awk '{print length}' $SEQS | sort -rn | awk -v g=$ER '{l[++n]=$0;aun+=$0*$0}
END{OFMT="%f";g50=g/2;g75=3*g/4;g90=9*g/10;i=s=n50=n75=n90=0;
while(++i<=n&&n90==0){s+=l[i];n50==0&&s>=g50&&n50=l[i]+(l50=i);n75==0&&s>=g75&&n75=l[i]+(l75=i);n90==0&&s>=g90&&n90=l[i]+(l90=i)}
n90==0&&n90=l[n]+(l90=n);n75==0&&n75=l[n]+(l75=n);n50==0&&n50=l[n]+(l50=n);
iq1=int(n/4+1);iq2=int(n/2+1);iq3=int(3*n/4+1);
print(n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[iq1]"\t"l[iq2]"\t"l[iq3]"\t"l[n]"\t"int(0.5+aun/g)}');
N50=$(cut -f1 <<<"$STATS"); N75=$(cut -f2 <<<"$STATS"); N90=$(cut -f3 <<<"$STATS");
L50=$(cut -f4 <<<"$STATS"); L75=$(cut -f5 <<<"$STATS"); L90=$(cut -f6 <<<"$STATS");
Q75=$(cut -f8 <<<"$STATS"); Q50=$(cut -f9 <<<"$STATS"); Q25=$(cut -f10 <<<"$STATS");
iq3=int(n/4+1);iq2=int(n/2+1);iq1=int(3*n/4+1);
print(n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[iq3]"\t"l[iq2]"\t"l[iq1]"\t"l[n]"\t"int(0.5+aun/g)}');
N50=$(cut -f1 <<<"$STATS"); N75=$(cut -f2 <<<"$STATS"); N90=$(cut -f3 <<<"$STATS");
L50=$(cut -f4 <<<"$STATS"); L75=$(cut -f5 <<<"$STATS"); L90=$(cut -f6 <<<"$STATS");
Q75=$(cut -f8 <<<"$STATS"); Q50=$(cut -f9 <<<"$STATS"); Q25=$(cut -f10 <<<"$STATS");
MIN=$(cut -f11 <<<"$STATS"); MAX=$(cut -f7 <<<"$STATS"); AUN=$(cut -f12 <<<"$STATS");
if ! $TSVOUT
......@@ -235,12 +244,16 @@ do
## residue details
lseq="$(awk '(length()>max){s=$0;max=length()}END{print s}' $SEQS)"; # lseq = the longest contig
wslseq="$(tr -cd ACGT <<<"$lseq" | tr ACGT WSSW)"; # the longest contig with only AT/CG replaced by W/S
tr -d '\15\32' < $INFILE | awk '!/^>/{s=s$0;next}(s!=""){print toupper(s);s=""}{print}END{print toupper(s)}' | paste - - > $SEQS ; ## tsv file: FASTA header \t contig
WS=200; # window size
NS=5000; # no. samples from all the contigs
GCALL=$(randomfile); # %GC estimated from $NS segments of length $WS regularly sampled from all the contigs
tr ACGT WSSW < $SEQS | fold -w $WS | tr -cd WS'\n' | awk -v ws=$WS -v step=$(( 1 + $ACGT / ($WS*$NS) )) '(length()!=ws){next}(++n==step){printf("%.6f\n",gsub("S","S")/ws);n=0}' > $GCALL ;
NS=500; # no. samples from each contig
GCSEQ=$(randomfile); # %GC estimated from $NS segments of length $WS regularly sampled from each contig
trap "echo -n interrupting ... ; rm -f $SEQS $GCALL $GCSEQ &>/dev/null ; echo ; exit " SIGINT
FTMP=$(randomfile); ## $REP GC% observed from substrings of length $R of the longest contig
trap "echo -n interrupting ; rm -f $SEQS $FTMP &>/dev/null ; echo ; exit " SIGINT
tr -d '\15\32' < $INFILE | awk '!/^>/{s=s$0;next}(s!=""){print toupper(s);s=""}{print}END{print toupper(s)}' | paste - - > $SEQS ; ## tsv file: FASTA header \t contig
while IFS=$'\t' read -r fh seq
do
......@@ -248,22 +261,34 @@ do
if [ $R -lt $MIN_CONTIG_LGT ]; then continue; fi
NAME=$(tr -d '>' <<<"$fh");
A=$(tr -cd A <<<"$seq" | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^\./0./');
C=$(tr -cd C <<<"$seq" | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^\./0./');
G=$(tr -cd G <<<"$seq" | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^\./0./');
T=$(tr -cd T <<<"$seq" | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^\./0./');
N=$(tr -cd N <<<"$seq" | wc -c); fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^\./0./');
A=$(tr -cd A <<<"$seq" | wc -c); fA=$(bc -l <<<"scale=2;100*$A/$R" | sed 's/^0$/0.00/;s/^\./0./');
C=$(tr -cd C <<<"$seq" | wc -c); fC=$(bc -l <<<"scale=2;100*$C/$R" | sed 's/^0$/0.00/;s/^\./0./');
G=$(tr -cd G <<<"$seq" | wc -c); fG=$(bc -l <<<"scale=2;100*$G/$R" | sed 's/^0$/0.00/;s/^\./0./');
T=$(tr -cd T <<<"$seq" | wc -c); fT=$(bc -l <<<"scale=2;100*$T/$R" | sed 's/^0$/0.00/;s/^\./0./');
ACGT=$(( $A + $C + $G + $T ));
fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^\./0./'); fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^\./0./');
N=0; [ $R -ne $ACGT ] && N=$(tr -cd N <<<"$seq" | wc -c);
fN=$(bc -l <<<"scale=2;100*$N/$R" | sed 's/^0$/0.00/;s/^\./0./');
fGC=$(bc -l <<<"scale=2;100*($C+$G)/$ACGT" | sed 's/^0$/0.00/;s/^\./0./');
fAT=$(bc -l <<<"scale=2;100-$fGC" | sed 's/^0$/0.00/;s/^\./0./');
REP=111; [ $R -gt 1000000 ] && REP=99; [ $R -gt 10000000 ] && REP=77;
[ "$seq" != "$lseq" ] && awk -v l=$R -v r=$REP '{srand(l-r);x=length()-l;while(--r>=0){s=substr($0,1+int(x*rand()),l);printf("%.6f\n",gsub("S","S",s)/length(s))}}' <<<"$wslseq" > $FTMP ;
gc=$(awk -v gc=$(( $C + $G )) -v acgt=$ACGT 'BEGIN{printf("%.6f\n",gc/acgt)}');
#echo "$gc $(tr '\n' ' ' <$FTMP)" ;
[ -e $FTMP ] && pv=$(awk -v x=$gc '{++n;(x<=$0)&&++p}END{printf("%.3f",p/n)}' $FTMP) || pv=0.500;
pv=$(awk '{p=(($0<0.5)?0.5-$0:$0-0.5);printf("%.3f",1-2*p)}' <<<"$pv");
[ ! -e $FTMP ] && pv=$pv"0";
rm -f $FTMP ;
### 1. getting (up to) $NS segments of length $WS regularly sampled from $seq and saving all the corresponding %GC in $GCSEQ
tr ACGT WSSW <<<"$seq" | fold -w $WS | tr -cd WS'\n' | awk -v ws=$WS -v step=$(( 1 + ($R / ($WS*$NS)) )) '(length()!=ws){next}(++n==step){printf("%.6f\n",gsub("S","S")/ws);n=0}' > $GCSEQ ;
### 2. comparing the %GC values sampled from $seq (file $GCSEQ) to the %GC values sampled from all the contigs (file $GCALL)
### the adequation between the two sets of %GC values is assessed by a Mann-Whitney U test
pv=$(awk 'function abs(x){return (x<0)?-x:x}
function erf(x){q=t=1.0/(1+0.47047*x);sum=0.3480242*t;sum-=0.0958798*(t*=q);sum+=0.7478556*(t*=q);return 1.0-(sum*exp(-x*x))}
(FNR==NR){a1[++n1]=$0;next}{a2[++n2]=$0}
END{while(++i2<=n2){v2=a2[i2];i1=0;while(++i1<=n1)(v2>a1[i1])&&++u}
if(n2==0){print"n/a";exit}
if(n2==1){p=u/n1;p=abs(0.5-p);printf("%.4f",1-2*p);exit}
mean=n1*n2/2;(u>mean)&&u=2*mean-u;sd=sqrt(mean*(n1+n2+1)/6);z=abs(u-mean)/sd;p=(1+erf(z/sqrt(2)))/2;p=2*(1-p); printf("%.4f",p)}' $GCALL $GCSEQ);
false && awk 'function abs(x){return (x<0)?-x:x}
function erf(x){q=t=1.0/(1+0.47047*x);sum=0.3480242*t;sum-=0.0958798*(t*=q);sum+=0.7478556*(t*=q);return 1.0-(sum*exp(-x*x))}
(FNR==NR){a1[++n1]=$0;next}{a2[++n2]=$0}
END{while(++i2<=n2){v2=a2[i2];i1=0;while(++i1<=n1)(v2>a1[i1])&&++u}
if(n2==0){print"n/a";exit}
if(n2==1){p=u/n1;p=abs(0.5-p);print n1" "n2" "u" "(u/n1)" "(1-2*p);exit}
mean=n1*n2/2;(u>mean)&&u=2*mean-u;sd=sqrt(mean*(n1+n2+1)/6);z=abs(u-mean)/sd;p=(1+erf(z/sqrt(2)))/2;p=2*(1-p); print n1" "n2" "u" "mean" "sd" "z" "p}' $GCALL $GCSEQ
if ! $TSVOUT
then
......@@ -286,14 +311,14 @@ do
echo "Composition test p-value: $pv" ;
echo ;
else
CSVLINE="$(basename $INFILE)\t$NAME\t$R\t$A\t$C\t$G\t$T\t$N\t$fA\t$fC\t$fG\t$fT\t$fN\t$fAT\t$fGC\t$pv";
CSVLINE="$(basename $INFILE)\t$NAME\t$R\t$A\t$C\t$G\t$T\t$N\t$fA%\t$fC%\t$fG%\t$fT%\t$fN%\t$fAT%\t$fGC%\t$pv";
echo -e "$CSVLINE" ;
fi
done < $SEQS
done
rm -f $SEQS $FTMP ;
rm -f $SEQS $GCALL $GCSEQ ;
exit ;
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment