v0.5

5b5e683f · Alexis CRISCUOLO · 11d2792d · 5b5e683f · 5b5e683f
Commit 5b5e683f authored 4 years ago by Alexis CRISCUOLO
--- a/README.md
+++ b/README.md
@@ -86,6 +86,8 @@ Launch _wgetGenBankWGS_ without option to read the following documentation:
 * The output file names are created with the organism name, followed by the intraspecific and isolate names (if any), and ending with the WGS master (if any) and the assembly accession. The file extension depends on the file type specified using option -f.
+* Flag "--T--" is added to the output file name when the corresponding assembly corresponds to type material. Flag "--t--" is added for putative type material that does not meet the full required criteria. Flag "--w--" is added as a warning when some assembly anomalies are specified in the report file (for more details, see [https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/](https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/)).
 * After each run, a file `summary.txt` containing the selected row(s) of the GenBank or RefSeq tab-separated assembly report is written. If option -n is not set, this file is supplemented with the name(s) of the written files (first column 'file').
 * Very fast running times are expected when running _wgetGenBankWGS_ on multiple threads. As a rule of thumb, using twice the maximum number of available threads generally leads to good performance with bacterial genomes (depending on the bandwidth).

--- a/wgetGenBankWGS.sh
+++ b/wgetGenBankWGS.sh
@@ -33,7 +33,11 @@
 # = VERSIONS =                                                                                              #
 # ============                                                                                              #
 #                                                                                                           #
-  VERSION=0.4.200504ac                                                                                      #
+  VERSION=0.5.201018ac                                                                                      #
+# + adding flag -T- or -t- in file name for type material                                                   #
+# + adding flag -w- in file name for genomes excluded from RefSeq                                           #
+#                                                                                                           #
+# VERSION=0.4.200504ac                                                                                      #
 # + discarding option -t (type strain info)                                                                 #
 # + option -t for multithread (instead of -c)                                                               #
 # + adding single quote (') in the list of special characters                                               #
@@ -63,13 +67,13 @@ if [ "$1" = "-?" ] || [ "$1" = "-h" ] || [ $# -le 1 ]
 then                                                                                                        #
  cat <<EOF
- wgetGenBankWGS v.$VERSION
+ wgetGenBankWGS v.$VERSION                                 Copyright (C) 2019-2020  Institut Pasteur
 Downloading sequence files corresponding to selected entries from genome assembly report files:
   GenBank:  ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt
   RefSeq:   ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt
- Writing output files 'Species.isolate--accn--GCA' with the following content (and extension):
+ Writing output files 'Species.isolate--accn--GC' with the following content (and extension):
   -f 1      genomic sequence(s) in FASTA format (.fasta)
   -f 2      genomic sequence(s) in GenBank format (.gbk)
   -f 3      annotations in GFF3 format (.gff)
@@ -287,7 +291,7 @@ head -1 $SUMMARY | sed 's/^# /# file\t/' > $FULLSUMMARY ;
 tr '\t' '|' < $SUMMARY > $tmp ; mv $tmp $SUMMARY ;  ## to deal with empty entries, not well managed using IFS=$'\t'
 START=$SECONDS;
 i=-1;
-while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path _ _
+while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path excluded_from_refseq relation_to_type_material
 do
  let i++; if [ $i -lt 1 ]; then continue; fi
@@ -300,14 +304,19 @@ do
  ISOLATE=$(echo "$isolate" | tr ",/\?%*:|'\"<>()[]#;" '_');                                       ### replacing special char. by '_'
  [ -n "$ISOLATE" ] && [ $(echo "$NAME" | grep -c -F "$ISOLATE") -eq 0 ] && NAME="$NAME.$ISOLATE";
-  accn=$(tr -d '0123456789.' <<< "$wgs_master")"01";
+  [ "$relation_to_type_material" == "assembly from type material" ] && NAME="$NAME""--T";
-  [ -n "$wgs_master" ] && NAME="$NAME""--""$accn";
+  [ "$relation_to_type_material" == "assembly from synonym type material" ] && NAME="$NAME""--T";
+  [ "$excluded_from_refseq" == "untrustworthy as type" ] && NAME="$NAME""--t";
+  [ -n "$excluded_from_refseq" ] && [ "$excluded_from_refseq" != "untrustworthy as type" ] && NAME="$NAME""--w";
+  # accn=$(tr -d '0123456789.' <<< "$wgs_master")"01"; [ -n "$wgs_master" ] && NAME="$NAME""--""$accn";
+  [ -n "$wgs_master" ] && NAME="$NAME""--""$wgs_master";
  [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession";
  GZFILE=$(basename $ftp_path)$INEXT;
  URL=$(echo $ftp_path | sed "s/ftp:/$PROTOCOL/")"/$GZFILE";
-  >&2 echo "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | $ftp_path" ;
+  >&2 echo -e "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | \e[31m$excluded_from_refseq\e[0m \e[34m$relation_to_type_material\e[0m | $ftp_path" ;
  OUTFILE=$(echo "$NAME" | tr ' ' '.' | sed 's/\.\.*/\./g')$OUTEXT;                                ### replacing blank spaces by '.', and successive dots by only one