Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
contig_info
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
GIPhy
contig_info
Commits
42ecf6e7
Commit
42ecf6e7
authored
4 years ago
by
Alexis CRISCUOLO
Browse files
Options
Downloads
Patches
Plain Diff
2.0
parent
622e57f2
No related branches found
No related tags found
No related merge requests found
Changes
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
README.md
+60
-37
60 additions, 37 deletions
README.md
contig_info.sh
+61
-36
61 additions, 36 deletions
contig_info.sh
with
121 additions
and
73 deletions
README.md
+
60
−
37
View file @
42ecf6e7
This diff is collapsed.
Click to expand it.
contig_info.sh
+
61
−
36
View file @
42ecf6e7
...
...
@@ -33,7 +33,7 @@
# = VERSIONS = #
# ============ #
# #
VERSION
=
2.0.21031
2
ac
#
VERSION
=
2.0.21031
5
ac
#
# + adding option -r #
# + adding trap when interrupting script #
# #
...
...
@@ -157,7 +157,7 @@ then
fi
SEQS
=
$(
randomfile
)
;
## txt file with one sequence per line
trap
"echo -n interrupting ; rm -f
$SEQS
&>/dev/null ; echo ; exit "
SIGINT
trap
"echo -n interrupting
...
; rm -f
$SEQS
&>/dev/null ; echo ; exit "
SIGINT
for
INFILE
in
"
$@
"
do
...
...
@@ -165,26 +165,35 @@ do
tr
-d
'\15\32'
<
$INFILE
|
awk
-v
thr
=
$MIN_CONTIG_LGT
'/^>/{if(length(s)>=thr)print s;s="";next}{s=s$0}END{if(length(s)>=thr)print s}'
|
tr
'[:lower:]'
'[:upper:]'
>
$SEQS
;
A
=
$(
tr
-cd
A <
$SEQS
|
wc
-c
)
;
C
=
$(
tr
-cd
C <
$SEQS
|
wc
-c
)
;
G
=
$(
tr
-cd
G <
$SEQS
|
wc
-c
)
;
T
=
$(
tr
-cd
T <
$SEQS
|
wc
-c
)
;
ACGT
=
$((
$A
+
$C
+
$G
+
$T
))
;
if
!
$RES_CONTENT
then
R
=
$(
tr
-d
[
:cntrl:] <
$SEQS
|
wc
-c
)
;
S
=
$(
wc
-l
<
$SEQS
)
;
AVG
=
$(
bc
-l
<<<
"scale=2;
$R
/
$S
"
|
sed
's/^\./0./'
)
;
A
=
$(
tr
-cd
A <
$SEQS
|
wc
-c
)
;
fA
=
$(
bc
-l
<<<
"scale=2;100*
$A
/
$R
"
|
sed
's/^\./0./'
)
;
C
=
$(
tr
-cd
C <
$SEQS
|
wc
-c
)
;
fC
=
$(
bc
-l
<<<
"scale=2;100*
$C
/
$R
"
|
sed
's/^\./0./'
)
;
G
=
$(
tr
-cd
G <
$SEQS
|
wc
-c
)
;
fG
=
$(
bc
-l
<<<
"scale=2;100*
$G
/
$R
"
|
sed
's/^\./0./'
)
;
T
=
$(
tr
-cd
T <
$SEQS
|
wc
-c
)
;
fT
=
$(
bc
-l
<<<
"scale=2;100*
$T
/
$R
"
|
sed
's/^\./0./'
)
;
N
=
$(
tr
-cd
N <
$SEQS
|
wc
-c
)
;
fN
=
$(
bc
-l
<<<
"scale=2;100*
$N
/
$R
"
|
sed
's/^\./0./'
)
;
ACGT
=
$((
$A
+
$C
+
$G
+
$T
))
;
fGC
=
$(
bc
-l
<<<
"scale=2;100*(
$C
+
$G
)/
$ACGT
"
|
sed
's/^\./0./'
)
;
fAT
=
$(
bc
-l
<<<
"scale=2;100-
$fGC
"
|
sed
's/^\./0./'
)
;
R
=
$(
tr
-d
[
:cntrl:] <
$SEQS
|
wc
-c
)
;
S
=
$(
wc
-l
<
$SEQS
)
;
AVG
=
$(
bc
-l
<<<
"scale=2;
$R
/
$S
"
|
sed
's/^\./0./'
)
;
fA
=
$(
bc
-l
<<<
"scale=2;100*
$A
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fC
=
$(
bc
-l
<<<
"scale=2;100*
$C
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fG
=
$(
bc
-l
<<<
"scale=2;100*
$G
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fT
=
$(
bc
-l
<<<
"scale=2;100*
$T
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
N
=
0
;
[
$R
-ne
$ACGT
]
&&
N
=
$(
tr
-cd
N <
$SEQS
|
wc
-c
)
;
fN
=
$(
bc
-l
<<<
"scale=2;100*
$N
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fGC
=
$(
bc
-l
<<<
"scale=2;100*(
$C
+
$G
)/
$ACGT
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fAT
=
$(
bc
-l
<<<
"scale=2;100-
$fGC
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
ER
=
$R
;
[
$GENOME_SIZE
!=
0
]
&&
ER
=
$GENOME_SIZE
;
STATS
=
$(
awk
'{print length}'
$SEQS
|
sort
-rn
|
awk
-v
g
=
$ER
'{l[++n]=$0;aun+=$0*$0}
END{OFMT="%f";g50=g/2;g75=3*g/4;g90=9*g/10;i=s=n50=n75=n90=0;
while(++i<=n&&n90==0){s+=l[i];n50==0&&s>=g50&&n50=l[i]+(l50=i);n75==0&&s>=g75&&n75=l[i]+(l75=i);n90==0&&s>=g90&&n90=l[i]+(l90=i)}
n90==0&&n90=l[n]+(l90=n);n75==0&&n75=l[n]+(l75=n);n50==0&&n50=l[n]+(l50=n);
iq
1
=int(n/4+1);iq2=int(n/2+1);iq
3
=int(3*n/4+1);
print(n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[iq
1
]"\t"l[iq2]"\t"l[iq
3
]"\t"l[n]"\t"int(0.5+aun/g)}'
)
;
N50
=
$(
cut
-f1
<<<
"
$STATS
"
)
;
N75
=
$(
cut
-f2
<<<
"
$STATS
"
)
;
N90
=
$(
cut
-f3
<<<
"
$STATS
"
)
;
L50
=
$(
cut
-f4
<<<
"
$STATS
"
)
;
L75
=
$(
cut
-f5
<<<
"
$STATS
"
)
;
L90
=
$(
cut
-f6
<<<
"
$STATS
"
)
;
Q75
=
$(
cut
-f8
<<<
"
$STATS
"
)
;
Q50
=
$(
cut
-f9
<<<
"
$STATS
"
)
;
Q25
=
$(
cut
-f10
<<<
"
$STATS
"
)
;
iq
3
=int(n/4+1);iq2=int(n/2+1);iq
1
=int(3*n/4+1);
print(n50-l50)"\t"(n75-l75)"\t"(n90-l90)"\t"l50"\t"l75"\t"l90"\t"l[1]"\t"l[iq
3
]"\t"l[iq2]"\t"l[iq
1
]"\t"l[n]"\t"int(0.5+aun/g)}'
)
;
N50
=
$(
cut
-f1
<<<
"
$STATS
"
)
;
N75
=
$(
cut
-f2
<<<
"
$STATS
"
)
;
N90
=
$(
cut
-f3
<<<
"
$STATS
"
)
;
L50
=
$(
cut
-f4
<<<
"
$STATS
"
)
;
L75
=
$(
cut
-f5
<<<
"
$STATS
"
)
;
L90
=
$(
cut
-f6
<<<
"
$STATS
"
)
;
Q75
=
$(
cut
-f8
<<<
"
$STATS
"
)
;
Q50
=
$(
cut
-f9
<<<
"
$STATS
"
)
;
Q25
=
$(
cut
-f10
<<<
"
$STATS
"
)
;
MIN
=
$(
cut
-f11
<<<
"
$STATS
"
)
;
MAX
=
$(
cut
-f7
<<<
"
$STATS
"
)
;
AUN
=
$(
cut
-f12
<<<
"
$STATS
"
)
;
if
!
$TSVOUT
...
...
@@ -235,12 +244,16 @@ do
## residue details
lseq
=
"
$(
awk
'(length()>max){s=$0;max=length()}END{print s}'
$SEQS
)
"
;
# lseq = the longest contig
wslseq
=
"
$(
tr
-cd
ACGT
<<<
"
$lseq
"
|
tr
ACGT WSSW
)
"
;
# the longest contig with only AT/CG replaced by W/S
tr
-d
'\15\32'
<
$INFILE
|
awk
'!/^>/{s=s$0;next}(s!=""){print toupper(s);s=""}{print}END{print toupper(s)}'
|
paste
- -
>
$SEQS
;
## tsv file: FASTA header \t contig
WS
=
200
;
# window size
NS
=
5000
;
# no. samples from all the contigs
GCALL
=
$(
randomfile
)
;
# %GC estimated from $NS segments of length $WS regularly sampled from all the contigs
tr
ACGT WSSW <
$SEQS
|
fold
-w
$WS
|
tr
-cd
WS
'\n'
|
awk
-v
ws
=
$WS
-v
step
=
$((
1
+
$ACGT
/
(
$WS
*
$NS
)
))
'(length()!=ws){next}(++n==step){printf("%.6f\n",gsub("S","S")/ws);n=0}'
>
$GCALL
;
NS
=
500
;
# no. samples from each contig
GCSEQ
=
$(
randomfile
)
;
# %GC estimated from $NS segments of length $WS regularly sampled from each contig
trap
"echo -n interrupting ... ; rm -f
$SEQS
$GCALL
$GCSEQ
&>/dev/null ; echo ; exit "
SIGINT
FTMP
=
$(
randomfile
)
;
## $REP GC% observed from substrings of length $R of the longest contig
trap
"echo -n interrupting ; rm -f
$SEQS
$FTMP
&>/dev/null ; echo ; exit "
SIGINT
tr
-d
'\15\32'
<
$INFILE
|
awk
'!/^>/{s=s$0;next}(s!=""){print toupper(s);s=""}{print}END{print toupper(s)}'
|
paste
- -
>
$SEQS
;
## tsv file: FASTA header \t contig
while
IFS
=
$'
\t
'
read
-r
fh
seq
do
...
...
@@ -248,22 +261,34 @@ do
if
[
$R
-lt
$MIN_CONTIG_LGT
]
;
then continue
;
fi
NAME
=
$(
tr
-d
'>'
<<<
"
$fh
"
)
;
A
=
$(
tr
-cd
A
<<<
"
$seq
"
|
wc
-c
)
;
fA
=
$(
bc
-l
<<<
"scale=2;100*
$A
/
$R
"
|
sed
's/^\./0./'
)
;
C
=
$(
tr
-cd
C
<<<
"
$seq
"
|
wc
-c
)
;
fC
=
$(
bc
-l
<<<
"scale=2;100*
$C
/
$R
"
|
sed
's/^\./0./'
)
;
G
=
$(
tr
-cd
G
<<<
"
$seq
"
|
wc
-c
)
;
fG
=
$(
bc
-l
<<<
"scale=2;100*
$G
/
$R
"
|
sed
's/^\./0./'
)
;
T
=
$(
tr
-cd
T
<<<
"
$seq
"
|
wc
-c
)
;
fT
=
$(
bc
-l
<<<
"scale=2;100*
$T
/
$R
"
|
sed
's/^\./0./'
)
;
N
=
$(
tr
-cd
N
<<<
"
$seq
"
|
wc
-c
)
;
fN
=
$(
bc
-l
<<<
"scale=2;100*
$N
/
$R
"
|
sed
's/^\./0./'
)
;
A
=
$(
tr
-cd
A
<<<
"
$seq
"
|
wc
-c
)
;
fA
=
$(
bc
-l
<<<
"scale=2;100*
$A
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
C
=
$(
tr
-cd
C
<<<
"
$seq
"
|
wc
-c
)
;
fC
=
$(
bc
-l
<<<
"scale=2;100*
$C
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
G
=
$(
tr
-cd
G
<<<
"
$seq
"
|
wc
-c
)
;
fG
=
$(
bc
-l
<<<
"scale=2;100*
$G
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
T
=
$(
tr
-cd
T
<<<
"
$seq
"
|
wc
-c
)
;
fT
=
$(
bc
-l
<<<
"scale=2;100*
$T
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
ACGT
=
$((
$A
+
$C
+
$G
+
$T
))
;
fGC
=
$(
bc
-l
<<<
"scale=2;100*(
$C
+
$G
)/
$ACGT
"
|
sed
's/^\./0./'
)
;
fAT
=
$(
bc
-l
<<<
"scale=2;100-
$fGC
"
|
sed
's/^\./0./'
)
;
N
=
0
;
[
$R
-ne
$ACGT
]
&&
N
=
$(
tr
-cd
N
<<<
"
$seq
"
|
wc
-c
)
;
fN
=
$(
bc
-l
<<<
"scale=2;100*
$N
/
$R
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fGC
=
$(
bc
-l
<<<
"scale=2;100*(
$C
+
$G
)/
$ACGT
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
fAT
=
$(
bc
-l
<<<
"scale=2;100-
$fGC
"
|
sed
's/^0$/0.00/;s/^\./0./'
)
;
REP
=
111
;
[
$R
-gt
1000000
]
&&
REP
=
99
;
[
$R
-gt
10000000
]
&&
REP
=
77
;
[
"
$seq
"
!=
"
$lseq
"
]
&&
awk
-v
l
=
$R
-v
r
=
$REP
'{srand(l-r);x=length()-l;while(--r>=0){s=substr($0,1+int(x*rand()),l);printf("%.6f\n",gsub("S","S",s)/length(s))}}'
<<<
"
$wslseq
"
>
$FTMP
;
gc
=
$(
awk
-v
gc
=
$((
$C
+
$G
))
-v
acgt
=
$ACGT
'BEGIN{printf("%.6f\n",gc/acgt)}'
)
;
#echo "$gc $(tr '\n' ' ' <$FTMP)" ;
[
-e
$FTMP
]
&&
pv
=
$(
awk
-v
x
=
$gc
'{++n;(x<=$0)&&++p}END{printf("%.3f",p/n)}'
$FTMP
)
||
pv
=
0.500
;
pv
=
$(
awk
'{p=(($0<0.5)?0.5-$0:$0-0.5);printf("%.3f",1-2*p)}'
<<<
"
$pv
"
)
;
[
!
-e
$FTMP
]
&&
pv
=
$pv
"0"
;
rm
-f
$FTMP
;
### 1. getting (up to) $NS segments of length $WS regularly sampled from $seq and saving all the corresponding %GC in $GCSEQ
tr
ACGT WSSW
<<<
"
$seq
"
|
fold
-w
$WS
|
tr
-cd
WS
'\n'
|
awk
-v
ws
=
$WS
-v
step
=
$((
1
+
(
$R
/
(
$WS
*
$NS
))
))
'(length()!=ws){next}(++n==step){printf("%.6f\n",gsub("S","S")/ws);n=0}'
>
$GCSEQ
;
### 2. comparing the %GC values sampled from $seq (file $GCSEQ) to the %GC values sampled from all the contigs (file $GCALL)
### the adequation between the two sets of %GC values is assessed by a Mann-Whitney U test
pv
=
$(
awk
'function abs(x){return (x<0)?-x:x}
function erf(x){q=t=1.0/(1+0.47047*x);sum=0.3480242*t;sum-=0.0958798*(t*=q);sum+=0.7478556*(t*=q);return 1.0-(sum*exp(-x*x))}
(FNR==NR){a1[++n1]=$0;next}{a2[++n2]=$0}
END{while(++i2<=n2){v2=a2[i2];i1=0;while(++i1<=n1)(v2>a1[i1])&&++u}
if(n2==0){print"n/a";exit}
if(n2==1){p=u/n1;p=abs(0.5-p);printf("%.4f",1-2*p);exit}
mean=n1*n2/2;(u>mean)&&u=2*mean-u;sd=sqrt(mean*(n1+n2+1)/6);z=abs(u-mean)/sd;p=(1+erf(z/sqrt(2)))/2;p=2*(1-p); printf("%.4f",p)}'
$GCALL
$GCSEQ
)
;
false
&&
awk
'function abs(x){return (x<0)?-x:x}
function erf(x){q=t=1.0/(1+0.47047*x);sum=0.3480242*t;sum-=0.0958798*(t*=q);sum+=0.7478556*(t*=q);return 1.0-(sum*exp(-x*x))}
(FNR==NR){a1[++n1]=$0;next}{a2[++n2]=$0}
END{while(++i2<=n2){v2=a2[i2];i1=0;while(++i1<=n1)(v2>a1[i1])&&++u}
if(n2==0){print"n/a";exit}
if(n2==1){p=u/n1;p=abs(0.5-p);print n1" "n2" "u" "(u/n1)" "(1-2*p);exit}
mean=n1*n2/2;(u>mean)&&u=2*mean-u;sd=sqrt(mean*(n1+n2+1)/6);z=abs(u-mean)/sd;p=(1+erf(z/sqrt(2)))/2;p=2*(1-p); print n1" "n2" "u" "mean" "sd" "z" "p}'
$GCALL
$GCSEQ
if
!
$TSVOUT
then
...
...
@@ -286,14 +311,14 @@ do
echo
"Composition test p-value:
$pv
"
;
echo
;
else
CSVLINE
=
"
$(
basename
$INFILE
)
\t
$NAME
\t
$R
\t
$A
\t
$C
\t
$G
\t
$T
\t
$N
\t
$fA
\t
$fC
\t
$fG
\t
$fT
\t
$fN
\t
$fAT
\t
$fGC
\t
$pv
"
;
CSVLINE
=
"
$(
basename
$INFILE
)
\t
$NAME
\t
$R
\t
$A
\t
$C
\t
$G
\t
$T
\t
$N
\t
$fA
%
\t
$fC
%
\t
$fG
%
\t
$fT
%
\t
$fN
%
\t
$fAT
%
\t
$fGC
%
\t
$pv
"
;
echo
-e
"
$CSVLINE
"
;
fi
done
<
$SEQS
done
rm
-f
$SEQS
$
FTMP
;
rm
-f
$SEQS
$
GCALL
$GCSEQ
;
exit
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment