diff --git a/README.md b/README.md index 22b12b4181e97df1ea5d83bd9305296f37a33c45..98640e4d160997e59c003298be88bba4857fdf50 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,8 @@ Launch _wgetGenBankWGS_ without option to read the following documentation: * The output file names are created with the organism name, followed by the intraspecific and isolate names (if any), and ending with the WGS master (is any) and the assembly accession. File extension depends on the file type specified using option -f. +* Flag "--T--" is added in the output file name when the corresponding assembly corresponds to type material. Flag "--t--" is added for putative type material that does not meet the full required criteria. Flag "--w--" is added as a warning when some assembly anomalies are specified in the report file (for more details, see [https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/](https://www.ncbi.nlm.nih.gov/assembly/help/anomnotrefseq/)). + * After each usage, a file `summary.txt` containing the selected raw(s) of the GenBank or RefSeq tab-separated assembly report is written. If the option -n is not set, this file is completed by the name(s) of the written files (first column 'file'). * Very fast running times are expected when running _wgetGenBankWGS_ on multiple threads. As a rule of thumb, using twice the maximum number of available threads generally leads to good performances with bacterial genomes (depending on the bandwidth). 
diff --git a/wgetGenBankWGS.sh b/wgetGenBankWGS.sh index e9b60947afef12e9fb8ac73cc369e3567fc70f86..2e7afae4a951f79591fcc9bbd6008094e1e96537 100755 --- a/wgetGenBankWGS.sh +++ b/wgetGenBankWGS.sh @@ -33,7 +33,11 @@ # = VERSIONS = # # ============ # # # - VERSION=0.4.200504ac # + VERSION=0.5.201018ac # +# + adding flag -T- or -t- in file name for type material # +# + adding flag -w- in file name for genomes excluded from RefSeq # +# # +# VERSION=0.4.200504ac # # + discarding option -t (type strain info) # # + option -t for multithread (instead of -c) # # + adding single quote (') in the list of special characters # @@ -63,13 +67,13 @@ if [ "$1" = "-?" ] || [ "$1" = "-h" ] || [ $# -le 1 ] then # cat <<EOF - wgetGenBankWGS v.$VERSION + wgetGenBankWGS v.$VERSION Copyright (C) 2019-2020 Institut Pasteur Downloading sequence files corresponding to selected entries from genome assembly report files: GenBank: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_genbank.txt RefSeq: ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt - Writing output files 'Species.isolate--accn--GCA' with the following content (and extension): + Writing output files 'Species.isolate--accn--GC' with the following content (and extension): -f 1 genomic sequence(s) in FASTA format (.fasta) -f 2 genomic sequence(s) in GenBank format (.gbk) -f 3 annotations in GFF3 format (.gff) @@ -287,7 +291,7 @@ head -1 $SUMMARY | sed 's/^# /# file\t/' > $FULLSUMMARY ; tr '\t' '|' < $SUMMARY > $tmp ; mv $tmp $SUMMARY ; ## to deal with empty entries, not well managed using IFS=$'\t' START=$SECONDS; i=-1; -while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path _ _ +while IFS="|" read -r assembly_accession _ _ wgs_master _ _ _ organism_name infraspecific_name isolate _ _ _ _ _ _ _ _ _ ftp_path excluded_from_refseq relation_to_type_material do let i++; if [ $i -lt 1 ]; then continue; fi @@ -300,14 
+304,19 @@ do ISOLATE=$(echo "$isolate" | tr ",/\?%*:|'\"<>()[]#;" '_'); ### replacing special char. by '_' [ -n "$ISOLATE" ] && [ $(echo "$NAME" | grep -c -F "$ISOLATE") -eq 0 ] && NAME="$NAME.$ISOLATE"; - accn=$(tr -d '0123456789.' <<< "$wgs_master")"01"; - [ -n "$wgs_master" ] && NAME="$NAME""--""$accn"; + [ "$relation_to_type_material" == "assembly from type material" ] && NAME="$NAME""--T"; + [ "$relation_to_type_material" == "assembly from synonym type material" ] && NAME="$NAME""--T"; + [ "$excluded_from_refseq" == "untrustworthy as type" ] && NAME="$NAME""--t"; + [ -n "$excluded_from_refseq" ] && [ "$excluded_from_refseq" != "untrustworthy as type" ] && NAME="$NAME""--w"; + + # accn=$(tr -d '0123456789.' <<< "$wgs_master")"01"; [ -n "$wgs_master" ] && NAME="$NAME""--""$accn"; + [ -n "$wgs_master" ] && NAME="$NAME""--""$wgs_master"; [ -n "$assembly_accession" ] && NAME="$NAME""--""$assembly_accession"; GZFILE=$(basename $ftp_path)$INEXT; URL=$(echo $ftp_path | sed "s/ftp:/$PROTOCOL/")"/$GZFILE"; - >&2 echo "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | $ftp_path" ; + >&2 echo -e "$(gettime $START) [$i/$n] $organism_name | $infraspecific_name | $isolate | $assembly_accession | $wgs_master | \e[31m$excluded_from_refseq\e[0m \e[34m$relation_to_type_material\e[0m | $ftp_path" ; OUTFILE=$(echo "$NAME" | tr ' ' '.' | sed 's/\.\.*/\./g')$OUTEXT; ### replacing blank spaces by '.', and successive dots by only one