diff --git a/README.md b/README.md index 99dde52f83cf603a06d763898fcc6a4d1e22726d..22b12b4181e97df1ea5d83bd9305296f37a33c45 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # wgetGenBankWGS -_wgetGenBankWGS_ is a command line program written in [Bash](https://www.gnu.org/software/bash/) to download genome assembly files in FASTA format from the GenBank or RefSeq repositories. -The FASTA files to dowload are selected from the [GenBank](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt) or [RefSeq](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt) genome assembly reports using [extended regular expressions](https://www.gnu.org/software/grep/manual/grep.html#Regular-Expressions) as implemented by [_grep_](https://www.gnu.org/software/grep/) (with option -E). +_wgetGenBankWGS_ is a command line program written in [Bash](https://www.gnu.org/software/bash/) to download genome assembly files from the GenBank or RefSeq repositories. +The files to download are selected from the [GenBank](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt) or [RefSeq](https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt) genome assembly reports using [extended regular expressions](https://www.gnu.org/software/grep/manual/grep.html#Regular-Expressions) as implemented by [_grep_](https://www.gnu.org/software/grep/) (with option -E). Every download is performed by the standard tool [_wget_](https://www.gnu.org/software/wget/). 
@@ -28,43 +28,65 @@ Execute _wgetGenBankWGS_ with the following command line model: Launch _wgetGenBankWGS_ without option to read the following documentation: ``` - wgetGenBankWGS - - Downloading FASTA-formatted nucleotide sequence files corresponding to selected entries from genome assembly report files: - GenBank: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt - RefSeq: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt - - USAGE: - wgetGenBankWGS.sh -e <pattern> [-v <pattern>] [-o <outdir>] [-t <nthreads>] [-n] + wgetGenBankWGS v.0.4.200504ac + + Downloading sequence files corresponding to selected entries from genome assembly report files: + GenBank: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt + RefSeq: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt + + Writing output files 'Species.isolate--accn--GCA' with the following content (and extension): + -f 1 genomic sequence(s) in FASTA format (.fasta) + -f 2 genomic sequence(s) in GenBank format (.gbk) + -f 3 annotations in GFF3 format (.gff) + -f 4 codon CDS in FASTA format (.fasta) + -f 5 amino acid CDS in FASTA format (.fasta) + -f 6 RNA sequences in FASTA format (.fasta) + + USAGE: + wgetGenBankWGS.sh -e <pattern> [-v <pattern>] [-o <outdir>] [-f <integer>] [-n] [-z] [-t <nthreads>] where: - -e <pattern> extended regexp selection pattern (grep -E style; mandatory) - -v <pattern> extended regexp exclusion pattern (grep -E style; default: none) + -e <pattern> extended regexp selection pattern (mandatory) + -v <pattern> extended regexp exclusion pattern (default: none) -d <string> either 'genbank' or 'refseq' (default: genbank) -n no download, i.e. 
to only print the number of selected files (default: not set) - -t type strain name(s) for each selected species gathered from straininfo.net (default: not set) + -f <integer> file type identifier (see above; default: 1) + -z no unzip, i.e. downloaded files are compressed (default: not set) -o <outdir> output directory (default: .) - -c <nthreads> number of threads (default: 1) + -t <nthreads> number of threads (default: 1) EXAMPLES: - + get the total number of available complete Salmonella genomes inside RefSeq, as well as the type strain list: + + getting the total number of available complete Salmonella genomes inside RefSeq: wgetGenBankWGS.sh -e "Salmonella.*Complete Genome" -v "phage|virus" -d refseq -n - + get the total number of genomes deposited in 1996 (see details in the written file summary.txt): - wgetGenBankWGS.sh -e "1996/[01-12]+/[01-31]+" -n - - + download in the directory Dermatophilaceae every available genome sequence from this family using 30 threads: + + getting the total number of genomes inside GenBank deposited in 1996: + wgetGenBankWGS.sh -e "1996/[01-12]+/[01-31]+" -n + + + getting the total number of available SARS-CoV-2 genomes (taxid=694009) inside GenBank: + wgetGenBankWGS.sh -e $'\t'694009$'\t' -n + + + downloading the full RefSeq assembly report: + wgetGenBankWGS.sh -e "/" -d refseq -n + + + downloading the GenBank files with the assembly accessions GCF_900002335, GCF_000002415 and GCF_000002765: + wgetGenBankWGS.sh -e "GCF_900002335|GCF_000002415|GCF_000002765" -d refseq + + + downloading in the directory Dermatophilaceae every available genome sequence from this family using 30 threads: wgetGenBankWGS.sh -e "Austwickia|Dermatophilus|Kineosphaera|Mobilicoccus|Piscicoccus|Tonsilliphilus" -o Dermatophilaceae -t 30 - + download in the current directory the non-Listeria genomes with the wgs_master starting with "PPP": - wgetGenBankWGS.sh -e $'\t'"PPP.00000000" -v "Listeria" - ``` + + downloading the non-Listeria proteomes with the 
wgs_master starting with "PPP": + wgetGenBankWGS.sh -e $'\t'"PPP.00000000" -v "Listeria" -f 5 + + + downloading the genome annotation of every Klebsiella type strain in compressed gff3 format using 30 threads + wgetGenBankWGS.sh -e "Klebsiella.*type material" -f 3 -z -t 30 + +``` ## Notes -* The output FASTA file names are created with the organism name, followed by the intraspecific and isolate names (if any), and ending with the WGS master (is any) and the assembly accession. +* The output file names are created with the organism name, followed by the infraspecific and isolate names (if any), and ending with the WGS master (if any) and the assembly accession. The file extension depends on the file type specified using option -f. -* After each usage, a file `summary.txt` containing the selected raw(s) of the GenBank or RefSeq tab-separated assembly report is written. If the option -n is not set, this file is completed by the name(s) of the written FASTA files (first column 'fasta_file'). +* After each usage, a file `summary.txt` containing the selected row(s) of the GenBank or RefSeq tab-separated assembly report is written. If the option -n is not set, this file is completed by the name(s) of the written files (first column 'file'). * Very fast running times are expected when running _wgetGenBankWGS_ on multiple threads. As a rule of thumb, using twice the maximum number of available threads generally leads to good performances with bacterial genomes (depending on the bandwidth). 
diff --git a/wgetGenBankWGS.sh b/wgetGenBankWGS.sh index cb1580cafcdec176884ed5741cdc75ddef62cd6f..e9b60947afef12e9fb8ac73cc369e3567fc70f86 100755 --- a/wgetGenBankWGS.sh +++ b/wgetGenBankWGS.sh @@ -2,9 +2,9 @@ ############################################################################################################# # # -# wgetGenBankWGS: downloading WGS nucleotide sequences from NCBI # +# wgetGenBankWGS: downloading WGS genome assembly files from NCBI # # # -# Copyright (C) 2019 Alexis Criscuolo # +# Copyright (C) 2019,2020 Institut Pasteur # # # # This program is free software: you can redistribute it and/or modify it under the terms of the GNU # # General Public License as published by the Free Software Foundation, either version 3 of the License, or # @@ -17,13 +17,13 @@ # You should have received a copy of the GNU General Public License along with this program. If not, see # # <http://www.gnu.org/licenses/>. # # # -# Contact: # -# Institut Pasteur # -# Bioinformatics and Biostatistics Hub # -# C3BI, USR 3756 IP CNRS # -# Paris, FRANCE # -# # -# alexis.criscuolo@pasteur.fr # +# Contact: # +# Alexis Criscuolo alexis.criscuolo@pasteur.fr # +# Genome Informatics & Phylogenetics (GIPhy) giphy.pasteur.fr # +# Bioinformatics and Biostatistics Hub research.pasteur.fr/team/hub-giphy # +# USR 3756 IP CNRS research.pasteur.fr/team/bioinformatics-and-biostatistics-hub # +# Dpt. 
Biologie Computationnelle research.pasteur.fr/department/computational-biology # +# Institut Pasteur, Paris, FRANCE research.pasteur.fr # # # ############################################################################################################# @@ -33,7 +33,15 @@ # = VERSIONS = # # ============ # # # - VERSION=0.3.190613ac # + VERSION=0.4.200504ac # +# + discarding option -t (type strain info) # +# + option -t for multithread (instead of -c) # +# + adding single quote (') in the list of special characters # +# + deals with wgs_master starting with 6 alphabetic characters # +# + new option -f to download different file types # +# + new option -z to keep compressed format # +# # +# VERSION=0.3.190613ac # # + no test between ftp and http protocols; use directly http # # + fixed bug when the specified pattern has no match # # # @@ -57,33 +65,54 @@ then wgetGenBankWGS v.$VERSION - Downloading FASTA-formatted nucleotide sequence files corresponding to selected entries from genome assembly report files: - GenBank: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt - RefSeq: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt + Downloading sequence files corresponding to selected entries from genome assembly report files: + GenBank: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt + RefSeq: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt + + Writing output files 'Species.isolate--accn--GCA' with the following content (and extension): + -f 1 genomic sequence(s) in FASTA format (.fasta) + -f 2 genomic sequence(s) in GenBank format (.gbk) + -f 3 annotations in GFF3 format (.gff) + -f 4 codon CDS in FASTA format (.fasta) + -f 5 amino acid CDS in FASTA format (.fasta) + -f 6 RNA sequences in FASTA format (.fasta) USAGE: - wgetGenBankWGS.sh -e <pattern> [-v <pattern>] [-o <outdir>] [-t <nthreads>] [-n] + wgetGenBankWGS.sh -e <pattern> [-v <pattern>] 
[-o <outdir>] [-f <integer>] [-n] [-z] [-t <nthreads>] where: -e <pattern> extended regexp selection pattern (mandatory) -v <pattern> extended regexp exclusion pattern (default: none) -d <string> either 'genbank' or 'refseq' (default: genbank) -n no download, i.e. to only print the number of selected files (default: not set) - -t type strain name(s) for each selected species gathered from straininfo.net (default: not set) + -f <integer> file type identifier (see above; default: 1) + -z no unzip, i.e. downloaded files are compressed (default: not set) -o <outdir> output directory (default: .) - -c <nthreads> number of threads (default: 1) + -t <nthreads> number of threads (default: 1) EXAMPLES: + getting the total number of available complete Salmonella genomes inside RefSeq: wgetGenBankWGS.sh -e "Salmonella.*Complete Genome" -v "phage|virus" -d refseq -n - + getting the total number of genomes deposited in 1996: - wgetGenBankWGS.sh -e "1996/[01-12]+/[01-31]+" -n + + getting the total number of genomes inside GenBank deposited in 1996: + wgetGenBankWGS.sh -e "1996/[01-12]+/[01-31]+" -n + + + getting the total number of available SARS-CoV-2 genomes (taxid=694009) inside GenBank: + wgetGenBankWGS.sh -e $'\t'694009$'\t' -n + + + downloading the full RefSeq assembly report: + wgetGenBankWGS.sh -e "/" -d refseq -n + + + downloading the GenBank files with the assembly accessions GCF_900002335, GCF_000002415 and GCF_000002765: + wgetGenBankWGS.sh -e "GCF_900002335|GCF_000002415|GCF_000002765" -d refseq + downloading in the directory Dermatophilaceae every available genome sequence from this family using 30 threads: wgetGenBankWGS.sh -e "Austwickia|Dermatophilus|Kineosphaera|Mobilicoccus|Piscicoccus|Tonsilliphilus" -o Dermatophilaceae -t 30 - + downloading in the current directory the non-Listeria genomes with the wgs_master starting with "PPP": - wgetGenBankWGS.sh -e $'\t'"PPP.00000000" -v "Listeria" + + downloading the non-Listeria proteomes with the wgs_master starting 
with "PPP": + wgetGenBankWGS.sh -e $'\t'"PPP.00000000" -v "Listeria" -f 5 + + + downloading the genome annotation of every Klesiella type strain in compressed gff3 format using 30 threads + wgetGenBankWGS.sh -e "Klebsiella.*type material" -f 3 -z -t 30 EOF exit 1 ; # @@ -97,24 +126,29 @@ fi # = CONSTANTS = # # =============== # # # - WGETOPT="--retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -q"; - STRAININFO="http://www.straininfo.net"; +# = PROTOCOL can be either "ftp:" or "https"; however, "https:" is generally faster ====================== # +# # + PROTOCOL="https:"; +# # +# = WGETOPT are the basic wget options =================================================================== # +# # + WGETOPT="--no-check-certificate --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -q"; # # # # # =============== # # = FUNCTIONS = # # =============== # # # -# = gettime() arguments: ============================================================================== # -# 1. START: the starting time in seconds -# returns the elapsed time since $START +# = gettime() arguments: ================================================================================= # +# 1. START: the starting time in seconds # +# returns the elapsed time since $START # gettime() { t=$(( $SECONDS - $1 )); sec=$(( $t % 60 )); min=$(( $t / 60 )); if [ $sec -lt 10 ]; then sec="0$sec"; fi if [ $min -lt 10 ]; then min="0$min"; fi echo "[$min:$sec]" ; } - +# # # = randomfile() arguments: ============================================================================== # # 1. PREFIX: prefix file name # # returns a random file name from a given PREFIX file name # @@ -131,12 +165,14 @@ randomfile() { # # dwnl() { tmp=$(randomfile $2); + wget $WGETOPT --spider $1 || return 1 ; while [ 1 ] do wget $WGETOPT -O $tmp $1 ; if [ $? 
== 0 ]; then mv $tmp $2 ; break; fi sleep 1 ; done + return 0 ; } # # # = dwnlgz() arguments: ================================================================================== # @@ -146,40 +182,19 @@ dwnl() { # # dwnlgz() { tmp=$(randomfile $2); + wget $WGETOPT --spider $1 || return 1 ; while [ 1 ] do - wget $WGETOPT -O - $1 | gunzip -c > $tmp ; - if [ $? == 0 ]; then mv $tmp $2 ; break; fi + wget $WGETOPT -O $tmp $1 ; + if [ $? == 0 ]; then gunzip -c $tmp > $2 ; rm $tmp ; break; fi sleep 1 ; done -} -# # -# = straininfo() arguments: ============================================================================== # -# 1. GENUS: genus name # -# 2. SPECIES: species name # -# 3. COOKIE: cookie file name (if unknown, it will be first generated) # -# returns the list of the type strain names gathered from straininfo.net # -# # -straininfo() { - if [ ! -e $3 ] - then - while [ 1 ] - do - wget $WGETOPT --keep-session-cookies --save-cookies=$3 -O /dev/null "$STRAININFO" ; - [ $? == 0 ] && break || sleep 1 ; - done - fi - while [ 1 ] - do - strainlist="$(wget $WGETOPT --load-cookies=$3 -O - "$STRAININFO/taxonGet.jsp?taxon=$1%20$2" | grep -F "is <strong>type strain</strong> of:<br/>" | - sed -e 's/<div class='"'"'popup'"'"'>//g;s/ is <strong>type strain<\/strong> of:<br\/>//g' | tr '\n' '\t' | sed 's/\t$/\n/')"; - [ $? 
== 0 ] && break || sleep 1 ; - done - echo -e "$strainlist" ; + return 0 ; } # # ############################################################################################################# + ############################################################################################################# #### #### #### INITIALIZING PARAMETERS AND READING OPTIONS #### @@ -189,37 +204,57 @@ INCLUDE_PATTERN=""; EXCLUDE_PATTERN="^#"; REPOSITORY="genbank"; OUTDIR="."; -TYPES=false; NTHREADS=1; DWNL=true; +FTYPE=1; +UNZIP=true; WAITIME=0.5; -while getopts :e:v:o:c:d:nt option +while getopts :e:v:o:t:d:f:nz option do case $option in - e) INCLUDE_PATTERN="$OPTARG" ;; - v) EXCLUDE_PATTERN="$OPTARG" ;; - d) REPOSITORY="$OPTARG" ;; - o) OUTDIR="$OPTARG" ;; - c) NTHREADS=$OPTARG ;; - n) DWNL=false ;; - t) TYPES=true ;; + e) INCLUDE_PATTERN="$OPTARG" ;; + v) EXCLUDE_PATTERN="$OPTARG" ;; + d) REPOSITORY="$OPTARG" ;; + o) OUTDIR="$OPTARG" ;; + t) NTHREADS=$OPTARG ;; + f) FTYPE=$OPTARG ;; + n) DWNL=false ;; + z) UNZIP=false ;; :) echo "option $OPTARG : missing argument" ; exit 1 ;; \?) 
echo "$OPTARG : option invalide" ; exit 1 ;; esac done -if [ -z "$INCLUDE_PATTERN" ]; then echo "no specified pattern (option -p)" ; exit 1 ; fi +if [ -z "$INCLUDE_PATTERN" ]; then echo "no specified pattern (option -e)" ; exit 1 ; fi if [ $NTHREADS -lt 1 ]; then echo "incorrect number of threads (option -t): $THREADS" ; exit 1 ; fi if [ "$REPOSITORY" != "genbank" ] && [ "$REPOSITORY" != "refseq" ]; then "incorrect repository name (options -d): $REPOSITORY" ; exit 1 ; fi -ASSEMBLY_REPORT=ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_$REPOSITORY.txt; +INEXT="_genomic.fna.gz"; OUTEXT=".fasta"; +if $DWNL +then + if [ "$FTYPE" == "1" ]; then echo "file type: genomic sequence(s) in FASTA format" ; FTYPE=1; INEXT="_genomic.fna.gz"; OUTEXT=".fasta"; + elif [ "$FTYPE" == "2" ]; then echo "file type: genomic sequence(s) in GenBank format" ; FTYPE=2; INEXT="_genomic.gbff.gz"; OUTEXT=".gbk"; + elif [ "$FTYPE" == "3" ]; then echo "file type: annotations in GFF3 format" ; FTYPE=3; INEXT="_genomic.gff.gz"; OUTEXT=".gff"; + elif [ "$FTYPE" == "4" ]; then echo "file type: codon CDS in FASTA format" ; FTYPE=4; INEXT="_cds_from_genomic.fna.gz"; OUTEXT=".fasta"; + elif [ "$FTYPE" == "5" ]; then echo "file type: amino acid CDS in FASTA format" ; FTYPE=5; INEXT="_protein.faa.gz"; OUTEXT=".fasta"; + elif [ "$FTYPE" == "6" ]; then echo "file type: RNA sequences in FASTA format" ; FTYPE=6; INEXT="_rna_from_genomic.fna.gz"; OUTEXT=".fasta"; + fi +fi +OUTDIR=$(dirname $OUTDIR/.); if [ ! -e $OUTDIR ]; then echo "creating output directory: $OUTDIR" ; mkdir $OUTDIR ; fi + +trap "echo interrupting wgetGenBankWGS ; wait ; if [ \"$OUTDIR\" != "." 
]; then rm -r $OUTDIR ; fi ; exit 1" INT ; + + +############################################################################################################# +#### #### +#### DOWNLOADING GENOME ASSEMBLY REPORT FILE #### +#### #### +############################################################################################################# +echo -n "downloading $REPOSITORY assembly report ... " ; +ASSEMBLY_REPORT=ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_$REPOSITORY.txt; SUMMARY=$OUTDIR/summary.txt; -PROTOCOL="https:"; -while [ 1 ] -do - wget $WGETOPT -O - $PROTOCOL"//"$ASSEMBLY_REPORT > $SUMMARY ; - [ $? == 0 ] && break || sleep 1 ; -done +dwnl $PROTOCOL"//"$ASSEMBLY_REPORT $SUMMARY ; +echo "[ok]" ; ############################################################################################################# @@ -233,30 +268,13 @@ tmp=$(randomfile $SUMMARY); mv $SUMMARY $tmp ; sed -n '2p' $tmp > $SUMMARY ; sed '1,2d' $tmp | grep -E "$INCLUDE_PATTERN" | grep -v -E "$EXCLUDE_PATTERN" | grep -F "ftp://ftp.ncbi.nlm.nih.gov" >> $SUMMARY ; -rm -f $tmp ; +rm $tmp ; n=$(grep -v -c "^#" $SUMMARY); -echo "$REPOSITORY: $n WGS nucleotide sequence FASTA files" ; +echo "$REPOSITORY: $n entries" ; if [ $n -eq 0 ]; then exit 0 ; fi -if ! $DWNL ; then echo "see details in the report file: $SUMMARY" ; fi - - -############################################################################################################# -#### #### -#### GETTING TYPE STRAIN ISOLATE NAMES #### -#### #### -############################################################################################################# -if $TYPES -then - TYPESTRAINS=$OUTDIR/type.strains.txt; - COOKIE=$(randomfile $TYPESTRAINS); - awk -F"\t" '! 
/^#/{print$8}' $SUMMARY | awk -F" " '{print$1"\t"$2}' | sort -u | - while read -r genus species ; do echo -e "$genus $species\t$(straininfo $genus $species $COOKIE)" ; done > $TYPESTRAINS ; - echo "see type strain name(s) for each selected species in the following file: $TYPESTRAINS" ; - rm -f $COOKIE ; -fi -if ! $DWNL ; then exit 0 ; fi +if ! $DWNL ; then echo "see details in the report file: $SUMMARY" ; exit 0 ; fi ############################################################################################################# @@ -265,44 +283,64 @@ if ! $DWNL ; then exit 0 ; fi #### #### ############################################################################################################# FULLSUMMARY=$(randomfile $SUMMARY); -head -1 $SUMMARY | sed 's/^# /# fasta_file\t/' > $FULLSUMMARY ; +head -1 $SUMMARY | sed 's/^# /# file\t/' > $FULLSUMMARY ; +tr '\t' '|' < $SUMMARY > $tmp ; mv $tmp $SUMMARY ; ## to deal with empty entries, not well managed using IFS=$'\t' START=$SECONDS; i=-1; -tr '\t' '|' < $SUMMARY | - while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path _ _ - do - let i++; if [ $i -lt 1 ]; then continue; fi - >&2 echo "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | $ftp_path" ; - - GZFILE=$(basename $ftp_path)"_genomic.fna.gz"; +while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path _ _ +do + let i++; if [ $i -lt 1 ]; then continue; fi - NAME=$(echo "$organism_name" | tr ',/\?%*:|"<>()[]#;' '_' | ### replacing special char. by '_' - sed -e 's/ bv\./ bv/;s/ genomosp\./ genomosp/;s/ sp\./ sp/;s/ str\./ str/;s/ subsp\./ subsp/'); + NAME=$(echo "$organism_name" | tr ",/\?%*:|'\"<>()[]#;" '_' | ### replacing special char. 
by '_' + sed -e 's/ bv\./ bv/;s/ genomosp\./ genomosp/;s/ sp\./ sp/;s/ str\./ str/;s/ subsp\./ subsp/'); - STRAIN=$(echo "$infraspecific_name" | sed 's/strain=//g' | tr ',/\?%*:|"<>()[]#;' '_'); ### replacing special char. by '_' - [ -n "$STRAIN" ] && [ $(echo "$NAME" | grep -c -F "$STRAIN") -eq 0 ] && NAME="$NAME.$STRAIN"; + STRAIN=$(echo "$infraspecific_name" | sed 's/strain=//g' | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' + [ -n "$STRAIN" ] && [ $(echo "$NAME" | grep -c -F "$STRAIN") -eq 0 ] && NAME="$NAME.$STRAIN"; - ISOLATE=$(echo "$isolate" | tr ',/\?%*:|"<>()[]#;' '_'); ### replacing special char. by '_' - [ -n "$ISOLATE" ] && [ $(echo "$NAME" | grep -c -F "$ISOLATE") -eq 0 ] && NAME="$NAME.$ISOLATE"; + ISOLATE=$(echo "$isolate" | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' + [ -n "$ISOLATE" ] && [ $(echo "$NAME" | grep -c -F "$ISOLATE") -eq 0 ] && NAME="$NAME.$ISOLATE"; - accn=${wgs_master:0:5}"1"; - [ -n "$wgs_master" ] && NAME="$NAME""--""$accn"; - [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession"; + accn=$(tr -d '0123456789.' <<< "$wgs_master")"01"; + [ -n "$wgs_master" ] && NAME="$NAME""--""$accn"; + [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession"; - URL=$(echo $ftp_path | sed "s/ftp:/$PROTOCOL/")"/$GZFILE"; - OUTFILE=$(echo "$NAME" | tr ' ' '.' | sed 's/\.\.*/\./g').fasta; ### replacing blank spaces by '.', and successive dots by only one + GZFILE=$(basename $ftp_path)$INEXT; + URL=$(echo $ftp_path | sed "s/ftp:/$PROTOCOL/")"/$GZFILE"; - echo -e "$OUTFILE\t$(sed -n "$(( $i + 1 )) p" $SUMMARY)" ; - + >&2 echo "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | $ftp_path" ; + + OUTFILE=$(echo "$NAME" | tr ' ' '.' 
| sed 's/\.\.*/\./g')$OUTEXT; ### replacing blank spaces by '.', and successive dots by only one + + if $UNZIP + then dwnlgz $URL $OUTDIR/$OUTFILE & + echo -e "$OUTFILE\t$(sed -n $(( $i + 1 ))p $SUMMARY)" ; + else + dwnl $URL $OUTDIR/$OUTFILE.gz & + echo -e "$OUTFILE.gz\t$(sed -n $(( $i + 1 ))p $SUMMARY)" ; + fi + + while [ $(jobs -r | wc -l) -gt $NTHREADS ]; do sleep $WAITIME ; done + # if [ $i -eq $n ]; then wait ; fi + +done < $SUMMARY | tr '|' '\t' >> $FULLSUMMARY ; - while [ $(jobs -r | wc -l) -gt $NTHREADS ]; do sleep $WAITIME ; done - done >> $FULLSUMMARY wait ; -mv $FULLSUMMARY $SUMMARY ; +############################################################################################################# +#### #### +#### CHECKING EXISTING FILES #### +#### #### +############################################################################################################# +awk -v d=$OUTDIR 'BEGIN{FS=OFS="\t"} + (NR==1){print;next} + {l=$0;if(getline < (d"/"$1) <= 0){$1="na";l=$0}print l}' $FULLSUMMARY > $SUMMARY ; +rm $FULLSUMMARY ; +n=$(grep -Pc "^na\t" $SUMMARY); +if [ $n -ne 0 ]; then echo "WARNING: $n files are not available with the specified file type (-f $FTYPE)" ; fi echo "see details in the report file: $SUMMARY" ; + exit ;