diff --git a/README.md b/README.md index 215b4264b6c579268cafc8ba8da7b3e1d2a66bc5..e7968461b2075cc5a025f417d38ba486f923bb53 100644 --- a/README.md +++ b/README.md @@ -8,34 +8,47 @@ The source code of _eFASTA_ is inside the _src_ directory and could be compiled #### Building an executable jar file -On computers with [Oracle JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html) (6 or higher) installed, a Java executable jar file could be created. In a command-line window, go to the _src_ directory and type: +Clone this repository with the following command line: +```bash +git clone https://gitlab.pasteur.fr/GIPhy/eFASTA.git +``` +On computers with [Oracle JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html) (6 or higher) installed, a Java executable jar file can be created. In a command-line window, go to the _src_ directory and type: ```bash javac EFASTA.java echo Main-Class: EFASTA > MANIFEST.MF jar -cmvf MANIFEST.MF EFASTA.jar EFASTA.class rm MANIFEST.MF EFASTA.class ``` -This will create the executable jar file `EFASTA.jar` that could be launched with the following command line model: +This will create the executable jar file `EFASTA.jar` that can be run with the following command line model: ```bash java -jar EFASTA.jar [options] ``` #### Building a native code binary -On computers with the [GNU compiler GCJ](https://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcj/) installed, a binary could also be built. In a command-line window, go to the _src_ directory, and type: +Clone this repository with the following command line: +```bash +git clone https://gitlab.pasteur.fr/GIPhy/eFASTA.git +``` +On computers with [GraalVM](https://www.graalvm.org/downloads/) installed, a native executable can be built. 
In a command-line window, go to the _src_ directory, and type: ```bash -make +javac EFASTA.java +native-image EFASTA eFASTA +rm EFASTA.class ``` -This will create the executable binary file `eFASTA` that could be launched with the following command line model: +This will create the native executable `eFASTA` that can be run with the following command line model: ```bash ./eFASTA [options] ``` + ## Usage -Launch _eFASTA_ without option to read the following documentation: +Run _eFASTA_ without any option to read the following documentation: ``` + eFASTA + USAGE: eFASTA <options> where options are: @@ -52,17 +65,17 @@ Launch _eFASTA_ without option to read the following documentation: leading to the writing of its amino acid translation (standard genetic code) into a second FASTA-formatted output file [basename].faa - -fcds to search for the full CDS (starting and ending by start - and stop codons, respectively) that includes the region + -fcds to search for the full CDS (starting and ending by START + and STOP codons, respectively) that includes the region specified by option -c - -orf to search for the ORF (starting and ending by stop codons) + -orf to search for the ORF (starting and ending by STOP codons) that includes the region specified by option -c - -start, -init <string> (only with options -cds and -fcds) several alternate start + -start, -init <string> (only with options -cds and -fcds) several alternate START codons (non ATG) could be specified with this option: PROK = ATG GTG TTG PROK+ = ATG GTG TTG CTG ATT PROK++ = ATG GTG TTG CTG ATT ATC ATA - -stop (only with options -cds, fcds and -orf) to include the stop + -stop (only with options -cds, -fcds or -orf) to include the STOP codon in the outputed sequences ``` @@ -74,7 +87,7 @@ The directory _example_ contains the FASTA-formatted file _Ecoli.O104H4.plsm.fna ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 ``` -This command line allows writing the file _seq.fna_ containing the following 
nucleotide segment: +This command line creates the file _seq.fna_ containing the following nucleotide segment: ``` >CP003291.1 plasmid pAA-EA11::31400-31293 GGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTG @@ -84,7 +97,7 @@ GGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATG ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 -orf ``` -This command line allows writing the file _seq.fna_ with the smallest ORF containing the specified region: +This command line creates the file _seq.fna_ with the smallest ORF containing the specified region: ``` >CP003291.1 plasmid pAA-EA11::31433-31266 GTACAGGGCTGGAGCTTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTGCCATGGAACGTAGTACTCACCAATAAA @@ -95,11 +108,11 @@ as well as the file _seq.faa_ containing its translation (standard genetic code VQGWSLCALLYGLIGTCRLNGIDPEAYLRHILSVLPEWPSNRVGELLPWNVVLTNK ``` -##### Open Reading Frame (ORF) with codons stop +##### Open Reading Frame (ORF) with codons STOP ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 -orf -stop ``` -This command line allows writing the two files _seq.fna_ and _seq.faa_ containing the ORF and its translation, respectively, with the two ending-up stop codons: +This command line creates the two files _seq.fna_ and _seq.faa_ containing the ORF and its translation, respectively, with the two ending-up codons STOP: ``` >CP003291.1 plasmid pAA-EA11::31436-31263 TGAGTACAGGGCTGGAGCTTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTGCCATGGAACGTAGTACTCACCAATAAATAA @@ -113,23 +126,23 @@ TGAGTACAGGGCTGGAGCTTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGC ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 -fcds ``` -This command line allows writing the file _seq.fna_ with the the smallest putative 
CDS containing the specified region: +This command line creates the file _seq.fna_ with the smallest putative CDS containing the specified region: ``` >CP003291.1 plasmid pAA-EA11::31433-31266 GTACAGGGCTGGAGCTTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTGCCATGGAACGTAGTACTCACCAATAAA ``` -as well as the file _seq.faa_ containing its translation (standard genetic code): +as well as the file _seq.faa_ containing its translation (standard genetic code): ``` >CP003291.1 plasmid pAA-EA11::31433-31266 VQGWSLCALLYGLIGTCRLNGIDPEAYLRHILSVLPEWPSNRVGELLPWNVVLTNK ``` -Of note, it is the same results as with option `-orf` because the stop codon TGA is occuring first (before any start codon ATG). +Of note, this gives the same result as option `-orf` because the STOP codon TGA occurs first (before any START codon ATG). -##### Coding Sequence (CDS) with alternate codon start +##### Coding Sequence (CDS) with alternate codon START ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 -fcds -start PROK ``` -This command line allows writing the two files _seq.fna_ and _seq.faa_ containing the CDS and its translation, respectively, by considering each of the three codons ATG, GTG and TTG as a putative start codon: +This command line creates the two files _seq.fna_ and _seq.faa_ containing the CDS and its translation, respectively, by considering each of the three codons ATG, GTG and TTG as a putative codon START: ``` >CP003291.1 plasmid pAA-EA11::31418-31266 TTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTGCCATGGAACGTAGTACTCACCAATAAA @@ -139,11 +152,11 @@ TTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCT MCALLYGLIGTCRLNGIDPEAYLRHILSVLPEWPSNRVGELLPWNVVLTNK ``` -##### Coding Sequence (CDS) with alternate start codon and stop codon +##### Coding Sequence (CDS) with alternate codons 
START and codon STOP ```bash eFASTA -f Ecoli.O104H4.plsm.fna -c CP003291:31400-31293 -fcds -start PROK -stop ``` -This command line allows writing the two files _seq.fna_ and _seq.faa_ containing the CDS and its translation, respectively, by considering each of the three codons ATG, GTG and TTG as a putative start codon, and including the termination codon: +This command line creates the two files _seq.fna_ and _seq.faa_ containing the CDS and its translation, respectively, by considering each of the three codons ATG, GTG and TTG as a putative codon START, and including the termination codon: ``` >CP003291.1 plasmid pAA-EA11::31418-31263 TTGTGTGCACTGCTGTACGGGCTGATCGGCACCTGCCGTCTGAACGGTATCGATCCGGAAGCGTATCTGCGCCATATTCTGAGCGTACTGCCGGAATGGCCTTCCAACCGAGTTGGCGAACTCCTGCCATGGAACGTAGTACTCACCAATAAATAA diff --git a/src/EFASTA.java b/src/EFASTA.java index 4077964b4b927aea38e56b81cafc3f6fc299cddd..cf9ddcb0031374f89c19cd4651dd513f249617c8 100644 --- a/src/EFASTA.java +++ b/src/EFASTA.java @@ -1,30 +1,30 @@ /* - #################################################################### - EFASTA: extracting nucleotide segments from FASTA file - - Copyright (C) 2017,2018 Alexis Criscuolo + ######################################################################################################## - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or + EFASTA: extracting nucleotide segments from FASTA file + + Copyright (C) 2017-2020 Institut Pasteur + + This program is free software: you can redistribute it and/or modify it under the terms of the GNU + General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
- This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even + the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - - Contact: - Institut Pasteur - Bioinformatics and Biostatistics Hub - C3BI, USR 3756 IP CNRS - Paris, FRANCE + You should have received a copy of the GNU General Public License along with this program. If not, see + <http://www.gnu.org/licenses/>. + + Contact: + Alexis Criscuolo alexis.criscuolo@pasteur.fr + Genome Informatics & Phylogenetics (GIPhy) giphy.pasteur.fr + Bioinformatics and Biostatistics Hub research.pasteur.fr/team/hub-giphy + USR 3756 IP CNRS research.pasteur.fr/team/bioinformatics-and-biostatistics-hub + Dpt. Biologie Computationnelle research.pasteur.fr/department/computational-biology + Institut Pasteur, Paris, FRANCE research.pasteur.fr - alexis.criscuolo@pasteur.fr - #################################################################### + ######################################################################################################## */ import java.io.*; @@ -32,7 +32,7 @@ import java.util.*; public class EFASTA { //### constants ########## - final static String VERSION = "1.2.180511ac"; + final static String VERSION = "1.2b.201024ac"; final static String[] CODON = {"AAA" , "AAC" , "AAG" , "AAT" , // A A - "ACA" , "ACC" , "ACG" , "ACT" , // . C - "AGA" , "AGC" , "AGG" , "AGT" , // . 
G - @@ -97,9 +97,9 @@ public class EFASTA { // ########################### if ( args.length < 2 ) { System.out.println(""); - System.out.println(" eFASTA v." + VERSION); + System.out.println(" eFASTA v." + VERSION + " Copyright (C) 2017-2020 Institut Pasteur"); System.out.println(""); - System.out.println(" USAGE: eFASTA <options>"); + System.out.println(" USAGE: eFASTA [options]"); System.out.println(""); System.out.println(" where options are:"); System.out.println(""); @@ -115,20 +115,20 @@ public class EFASTA { System.out.println(" leading to the writing of its amino acid translation"); System.out.println(" (standard genetic code) into a second FASTA-formatted"); System.out.println(" output file [basename].faa"); - System.out.println(" -fcds to search for the full CDS (starting and ending by start"); - System.out.println(" and stop codons, respectively) that includes the region"); + System.out.println(" -fcds to search for the full CDS (starting and ending by START"); + System.out.println(" and STOP codons, respectively) that includes the region"); System.out.println(" specified by option -c"); - System.out.println(" -orf to search for the ORF (starting and ending by stop codons)"); + System.out.println(" -orf to search for the ORF (starting and ending by STOP codons)"); System.out.println(" that includes the region specified by option -c"); - System.out.println(" -start, -init <string> (only with options -cds and -fcds) several alternate start"); + System.out.println(" -start, -init <string> (only with options -cds and -fcds) several alternate START"); System.out.println(" codons (non ATG) could be specified with this option:"); System.out.println(" PROK = ATG GTG TTG"); System.out.println(" PROK+ = ATG GTG TTG CTG ATT"); System.out.println(" PROK++ = ATG GTG TTG CTG ATT ATC ATA"); - System.out.println(" -stop (only with options -cds, fcds and -orf) to include the stop"); + System.out.println(" -stop (only with options -cds, -fcds or -orf) to include the 
STOP"); System.out.println(" codon in the outputed sequences"); System.out.println(""); - System.exit(1); + System.exit(0); } // ########################### diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 861a98c6d60b12f1499bb18d954f30612d4ef6e4..0000000000000000000000000000000000000000 --- a/src/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -GCJ=gcj -GCJFLAGS=-fsource=1.6 -march=native -msse2 -O3 -minline-all-stringops -fomit-frame-pointer -momit-leaf-frame-pointer -fstrict-aliasing -fno-store-check -fno-bounds-check -funroll-all-loops -Wall -OTHERFLAGS=-funsafe-math-optimizations -ffast-math -MAIN=EFASTA -EXEC=eFASTA - -eFASTA: EFASTA.java - $(GCJ) $(GCJFLAGS) --main=$(MAIN) $(MAIN).java -o $(EXEC)