diff --git a/wgetGenBankWGS.sh b/wgetGenBankWGS.sh index e50eaead29c3975d225362b7009d24e6a6c62bf1..41822302706f8ec6e2108152f02f9c2bf5594b06 100755 --- a/wgetGenBankWGS.sh +++ b/wgetGenBankWGS.sh @@ -4,7 +4,7 @@ # # # wgetGenBankWGS: downloading WGS genome assembly files from NCBI # # # -# Copyright (C) 2019-2021 Institut Pasteur # +# Copyright (C) 2019-2023 Institut Pasteur # # # # This program is free software: you can redistribute it and/or modify it under the terms of the GNU # # General Public License as published by the Free Software Foundation, either version 3 of the License, or # @@ -20,9 +20,7 @@ # Contact: # # Alexis Criscuolo alexis.criscuolo@pasteur.fr # # Genome Informatics & Phylogenetics (GIPhy) giphy.pasteur.fr # -# Bioinformatics and Biostatistics Hub research.pasteur.fr/team/hub-giphy # -# USR 3756 IP CNRS research.pasteur.fr/team/bioinformatics-and-biostatistics-hub # -# Dpt. Biologie Computationnelle research.pasteur.fr/department/computational-biology # +# Centre de Ressources Biologiques de l'Institut Pasteur (CRBIP) research.pasteur.fr/en/b/VTq # # Institut Pasteur, Paris, FRANCE research.pasteur.fr # # # ############################################################################################################# @@ -33,7 +31,10 @@ # = VERSIONS = # # ============ # # # - VERSION=0.7.211026ac # + VERSION=0.8.230612ac # +# + takes into account the empty fields recently replaced with "na" # +# # +# VERSION=0.7.211026ac # # + takes into account the new protocol https in field ftp_path of the genome assembly report files # # # # VERSION=0.6.211018ac # @@ -338,32 +339,39 @@ head -1 $SUMMARY | sed 's/^# /# file\t/' > $FULLSUMMARY ; tr '\t' '|' < $SUMMARY > $tmp ; mv $tmp $SUMMARY ; ## to deal with empty entries, not well managed using IFS=$'\t' START=$SECONDS; i=-1; +#230612 assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path excluded_from_refseq relation_to_type_material _ while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path excluded_from_refseq relation_to_type_material _ do let i++; if [ $i -lt 1 ]; then continue; fi - NAME=$(echo "$organism_name" | tr ",/\?%*:|'\"<>()[]#;" '_' | ### replacing special char. by '_' + NAME=$(echo "$organism_name" | tr ",/\?%*:|'\"<>()[]#;" '_' | ### replacing special char. by '_' sed -e 's/ bv\./ bv/;s/ genomosp\./ genomosp/;s/ sp\./ sp/;s/ str\./ str/;s/ subsp\./ subsp/'); - STRAIN=$(echo "$infraspecific_name" | sed 's/strain=//g' | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' + STRAIN=$(echo "$infraspecific_name" | grep -v "^na$" | sed 's/strain=//g' | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' [ -n "$STRAIN" ] && [ $(echo "$NAME" | grep -c -F "$STRAIN") -eq 0 ] && NAME="$NAME.$STRAIN"; - ISOLATE=$(echo "$isolate" | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' + ISOLATE=$(echo "$isolate" | grep -v "^na$" | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' [ -n "$ISOLATE" ] && [ $(echo "$NAME" | grep -c -F "$ISOLATE") -eq 0 ] && NAME="$NAME.$ISOLATE"; - [ "$relation_to_type_material" == "assembly from type material" ] && NAME="$NAME""--T"; - [ "$relation_to_type_material" == "assembly from synonym type material" ] && NAME="$NAME""--T"; - [ "$excluded_from_refseq" == "untrustworthy as type" ] && NAME="$NAME""--t"; - [ -n "$excluded_from_refseq" ] && [ "$excluded_from_refseq" != "untrustworthy as type" ] && NAME="$NAME""--w"; - - # accn=$(tr -d '0123456789.' <<< "$wgs_master")"01"; [ -n "$wgs_master" ] && NAME="$NAME""--""$accn"; - [ -n "$wgs_master" ] && NAME="$NAME""--""$wgs_master"; - [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession"; + TYPE_MATERIAL="$(grep -v "^na$" <<<"$relation_to_type_material")"; + [ "$TYPE_MATERIAL" == "assembly from type material" ] && NAME="$NAME""--T"; + [ "$TYPE_MATERIAL" == "assembly from synonym type material" ] && NAME="$NAME""--T"; + [ "$TYPE_MATERIAL" == "assembly designated as neotype" ] && NAME="$NAME""--T"; + + NOT_REFSEQ="$(grep -v "^na$" <<<"$excluded_from_refseq")"; + [ "$NOT_REFSEQ" == "untrustworthy as type" ] && NAME="$NAME""--t"; + [ -n "$NOT_REFSEQ" ] && [ "$NOT_REFSEQ" != "untrustworthy as type" ] && NAME="$NAME""--w"; + + WGS_ACCN="$(grep -v "^na$" <<<"$wgs_master")"; + [ -n "$WGS_ACCN" ] && NAME="$NAME""--""$WGS_ACCN"; + + ASS_ACCN="$(grep -v "^na$" <<<"$assembly_accession")"; + [ -n "$ASS_ACCN" ] && NAME="$NAME""--""$ASS_ACCN"; GZFILE=$(basename $ftp_path)$INEXT; URL=$(echo $ftp_path | sed "s/ftp:/$PROTOCOL/")"/$GZFILE"; - >&2 echo -e "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | \e[31m$excluded_from_refseq\e[0m \e[34m$relation_to_type_material\e[0m | $ftp_path" ; + >&2 echo -e "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $ISOLATE | $ASS_ACCN | $WGS_ACCN | \e[31m$NOT_REFSEQ\e[0m \e[34m$TYPE_MATERIAL\e[0m | $ftp_path" ; OUTFILE=$(echo "$NAME" | tr ' ' '.' | sed 's/\.\.*/\./g')$OUTEXT; ### replacing blank spaces by '.', and successive dots by only one