Skip to content
Snippets Groups Projects
Commit 22cf5fdd authored by Alexis  CRISCUOLO's avatar Alexis CRISCUOLO :black_circle:
Browse files

Initial commit

parents
No related branches found
No related tags found
No related merge requests found
This diff is collapsed.
# FASTA2AGP
_FASTA2AGP_ is a command line program written in [Java](https://docs.oracle.com/javase/8/docs/technotes/guides/language/index.html) that allows creating [AGP](https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/) and contig sequence files from a FASTA-formatted scaffold sequence file.
## Compilation and execution
The source code of _FASTA2AGP_ is inside the _src_ directory and can be compiled and executed in two different ways.
#### Building an executable jar file
On computers with [Oracle JDK](http://www.oracle.com/technetwork/java/javase/downloads/index.html) (6 or higher) installed, a Java executable jar file could be created. In a command-line window, go to the _src_ directory and type:
```bash
javac FASTA2AGP.java
echo Main-Class: FASTA2AGP > MANIFEST.MF
jar -cmvf MANIFEST.MF FASTA2AGP.jar FASTA2AGP.class
rm MANIFEST.MF FASTA2AGP.class
```
This will create the executable jar file `FASTA2AGP.jar` that could be launched with the following command line model:
```bash
java -jar FASTA2AGP.jar [options]
```
#### Building a native code binary
On computers with the [GNU compiler GCJ](https://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcj/) installed, a binary could also be built. In a command-line window, go to the _src_ directory, and type:
```bash
make
```
This will create the executable binary file `FASTA2AGP` that could be launched with the following command line model:
```bash
./FASTA2AGP [options]
```
## Usage
Launch _FASTA2AGP_ without any option to display the following documentation:
```
USAGE: FASTA2AGP -i <scaffolds.fasta> [-o <contigs.fasta>] [-a <info.agp>] [-n <length>]
where options are:
-i <infile> FASTA-formatted scaffold sequence file (mandatory)
-o <outfile> FASTA-formatted contig sequence output file name (default: <infile>.fna)
-a <outfile> AGP-formatted output file name (default: <infile>.agp)
-n <integer> minimum length of scaffolding stretch of Ns (default: 10)
```
/*
####################################################################
FASTA2AGP: creating agp and contig sequence files from a FASTA-
formatted scaffold sequence file
[Version 1.2.180515ac]
Copyright (C) 2015,2016,2018 Alexis Criscuolo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact:
Institut Pasteur
Bioinformatics and Biostatistics Hub
C3BI, USR 3756 IP CNRS
Paris, FRANCE
alexis.criscuolo@pasteur.fr
####################################################################
*/
import java.io.*;
import java.util.*;
/**
 * Command-line tool that cuts every scaffold of a FASTA file into contigs at
 * each stretch of at least {@code polyNlgt} consecutive Ns, then writes the
 * contigs to a FASTA file and the scaffold layout to an AGP file.
 *
 * <p>Scaffolds and contigs are named {@code scaffold_<n>} and
 * {@code contig_<n>} (zero-padded counters); the original FASTA headers are
 * not reused in the output.
 */
public class FASTA2AGP {

    // constants
    /** Sentinel file name meaning "option not set on the command line". */
    static final String NOFILE = "N.O.F.I.L.E";

    // options
    static File infile;    // -i
    static File ctgFile;   // -o
    static File agpFile;   // -a
    static int polyNlgt;   // -n

    /**
     * Program entry point: parses the options, reads the scaffolds, writes
     * the contig FASTA file and the AGP file.
     *
     * @param args command-line arguments (see the usage message)
     * @throws IOException if the input cannot be read or an output cannot be written
     */
    public static void main(String[] args) throws IOException {
        // ############################
        // ### man              #######
        // ############################
        if (args.length < 2) {
            printUsage();
            System.exit(0);
        }

        // ############################
        // ### parsing options  #######
        // ############################
        parseOptions(args);

        // ############################
        // ### reading infile  ########
        // ############################
        List<String> scaffolds = readScaffolds(infile);

        // ##############################
        // ### writing outfiles  ########
        // ##############################
        writeOutputs(scaffolds);
    }

    /** Prints the usage message on standard output. */
    private static void printUsage() {
        System.out.println("");
        System.out.println(" USAGE: FASTA2AGP -i <scaffolds.fasta> [-o <contigs.fasta>] [-a <info.agp>] [-n <length>]");
        System.out.println("");
        System.out.println(" where options are:");
        System.out.println("");
        System.out.println(" -i <infile> FASTA-formatted scaffold sequence file (mandatory)");
        System.out.println(" -o <outfile> FASTA-formatted contig sequence output file name (default: <infile>.fna)");
        System.out.println(" -a <outfile> AGP-formatted output file name (default: <infile>.agp)");
        System.out.println(" -n <integer> minimum length of scaffolding stretch of Ns (default: 10)");
        System.out.println("");
    }

    /** Prints an error message on standard output and exits with status 1. */
    private static void fail(String message) {
        System.out.println(message);
        System.exit(1);
    }

    /**
     * Returns the value following an option, exiting with an error message
     * (instead of an ArrayIndexOutOfBoundsException) when the option is the
     * last argument.
     */
    private static String optionValue(String[] args, int index, String option) {
        if (index >= args.length) {
            fail("missing value (option " + option + ")");
        }
        return args[index];
    }

    /**
     * Fills the option fields from the command line and applies the defaults.
     * Arguments that are not a known option are ignored.
     */
    private static void parseOptions(String[] args) {
        infile = new File(NOFILE);
        ctgFile = new File(NOFILE);
        agpFile = new File(NOFILE);
        polyNlgt = 10;

        for (int o = 0; o < args.length; o++) {
            String option = args[o];
            if (option.equals("-i")) {
                infile = new File(optionValue(args, ++o, option));
            } else if (option.equals("-o")) {
                ctgFile = new File(optionValue(args, ++o, option));
            } else if (option.equals("-a")) {
                agpFile = new File(optionValue(args, ++o, option));
            } else if (option.equals("-n")) {
                String value = optionValue(args, ++o, option);
                try {
                    polyNlgt = Integer.parseInt(value);
                } catch (NumberFormatException e) {
                    fail("incorrect integer value: " + value + " (option -n)");
                }
                if (polyNlgt < 1) {
                    fail("incorrect integer value: " + polyNlgt + " (option -n)");
                }
            }
        }

        if (!infile.exists()) {
            fail(infile.toString() + " not found (options -i)");
        }
        if (ctgFile.toString().equals(NOFILE)) {
            ctgFile = new File(infile.toString() + ".fna");
        }
        if (agpFile.toString().equals(NOFILE)) {
            agpFile = new File(infile.toString() + ".agp");
        }
    }

    /**
     * Reads every non-empty sequence of a FASTA file, in file order.
     *
     * <p>Each line is trimmed; a line starting with {@code >} closes the
     * sequence being accumulated. Records without any sequence data are
     * skipped, and sequence data found before the first header is kept as a
     * scaffold of its own, so that no sequence is dropped and the scaffold
     * count always matches the number of sequences.
     *
     * @param fasta the FASTA-formatted input file
     * @return the concatenated sequence of each non-empty record
     * @throws IOException if the file cannot be read
     */
    static List<String> readScaffolds(File fasta) throws IOException {
        List<String> sequences = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(fasta));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.startsWith(">")) {
                    if (current.length() != 0) {
                        sequences.add(current.toString());
                        current.setLength(0);
                    }
                } else {
                    current.append(line);
                }
            }
        } finally {
            reader.close();
        }
        if (current.length() != 0) {
            sequences.add(current.toString());
        }
        return sequences;
    }

    /**
     * Returns the zero-padding width used for scaffold and contig counters:
     * the number of digits of ten times the smallest power of ten (at least
     * 10) that is not smaller than {@code count}.
     */
    private static int labelWidth(int count) {
        long power = 10;   // long: cannot overflow for any int count
        while (power < count) {
            power *= 10;
        }
        return Long.toString(power * 10).length();
    }

    /**
     * Writes the contig FASTA file ({@code ctgFile}) and the AGP file
     * ({@code agpFile}) for the given scaffolds. Both writers are closed even
     * when writing fails.
     */
    private static void writeOutputs(List<String> scaffolds) throws IOException {
        StringBuilder run = new StringBuilder();
        for (int k = 0; k < polyNlgt; k++) {
            run.append('N');
        }
        String polyN = run.toString();
        int base = labelWidth(scaffolds.size());

        BufferedWriter outc = new BufferedWriter(new FileWriter(ctgFile));
        try {
            BufferedWriter outa = new BufferedWriter(new FileWriter(agpFile));
            try {
                int cptctg = 0;
                for (int i = 0; i < scaffolds.size(); i++) {
                    String scaffold = "scaffold_" + frmt(i + 1, base);
                    cptctg = writeScaffold(scaffolds.get(i), scaffold, polyN, base, cptctg, outc, outa);
                }
            } finally {
                outa.close();
            }
        } finally {
            outc.close();
        }
    }

    /**
     * Splits one scaffold at every stretch of Ns at least as long as
     * {@code polyN} and writes the resulting contigs and AGP lines.
     *
     * <p>No contig is written for an empty stretch, i.e. when the scaffold
     * starts or ends with a gap; only the gap line is emitted in that case.
     *
     * @param cptctg number of contigs written so far
     * @return the updated number of contigs written
     */
    private static int writeScaffold(String sequence, String scaffold, String polyN, int base, int cptctg,
                                     BufferedWriter outc, BufferedWriter outa) throws IOException {
        // Locale-independent upper-casing (default-locale rules may alter ASCII letters, e.g. Turkish 'i').
        String line = sequence.toUpperCase(Locale.ENGLISH);
        int l = line.length();
        int partNumber = 0;
        int s = 0;                        // start of the current contig (0-based)
        int sN = line.indexOf(polyN);     // start of the next gap, -1 if none
        while (sN != -1) {
            int eN = sN;                  // extend the gap to the end of the N stretch
            while ((++eN < l) && (line.charAt(eN) == 'N')) { }
            // ========================================================
            //       s                  sN               eN
            //       |                  |                |
            // ...NNNAACTGTCACTACGAATGCTNNNNNNNNNNNNNNNNNACGTACGT
            if (sN > s) {
                writeContig(scaffold, line, s, sN, ++partNumber, "contig_" + frmt(++cptctg, base), outc, outa);
            }
            // object  object_beg  object_end  part_number  component_type  gap_length  gap_type  linkage  link.evidence
            outa.write(scaffold + "\t" + (sN + 1) + "\t" + eN + "\t" + (++partNumber) + "\tN\t" + (eN - sN)
                       + "\tscaffold\tyes\t" + "paired-ends");
            outa.newLine();
            s = eN;
            sN = line.indexOf(polyN, s);
        }
        // remaining sequence after the last gap (the whole scaffold when it has no gap)
        if (s < l) {
            writeContig(scaffold, line, s, l, ++partNumber, "contig_" + frmt(++cptctg, base), outc, outa);
        }
        return cptctg;
    }

    /**
     * Writes the contig {@code line[from, to)} as a FASTA record and the
     * matching AGP component (type W) line.
     */
    private static void writeContig(String scaffold, String line, int from, int to, int partNumber, String hdr,
                                    BufferedWriter outc, BufferedWriter outa) throws IOException {
        outc.write(">" + hdr);
        outc.newLine();
        outc.write(line, from, to - from);
        outc.newLine();
        // object  object_beg  object_end  part_number  component_type  component_id  component_beg  component_end  orientation
        outa.write(scaffold + "\t" + (from + 1) + "\t" + to + "\t" + partNumber + "\tW\t" + hdr
                   + "\t1\t" + (to - from) + "\t+");
        outa.newLine();
    }

    /**
     * Formats a non-negative counter with leading zeros.
     *
     * <p>The result has at least {@code base} digits. A value with more than
     * {@code base} digits is returned in full rather than truncated, so that
     * distinct counters never map to the same label.
     *
     * @param x    the counter to format (expected to be non-negative)
     * @param base the minimum number of digits
     * @return {@code x} left-padded with zeros to {@code base} digits
     */
    static String frmt(int x, int base) {
        String digits = Integer.toString(x);
        StringBuilder padded = new StringBuilder();
        for (int k = digits.length(); k < base; k++) {
            padded.append('0');
        }
        return padded.append(digits).toString();
    }
}
# Builds a native FASTA2AGP binary from FASTA2AGP.java with the GNU Java compiler (gcj).
GCJ=gcj
# Java 1.6 source level, aggressive native optimisation.
# NOTE(review): -fno-bounds-check and -fno-store-check remove the runtime array checks
# (no ArrayIndexOutOfBoundsException / ArrayStoreException) -- confirm this is intended.
GCJFLAGS=-fsource=1.6 -march=native -msse2 -O3 -minline-all-stringops -fomit-frame-pointer -momit-leaf-frame-pointer -fstrict-aliasing -fno-store-check -fno-bounds-check -funroll-all-loops -Wall
# Defined but not passed to the compiler by the rule below.
OTHERFLAGS=-funsafe-math-optimizations -ffast-math
# Class holding main(), also the base name of the source file.
MAIN=FASTA2AGP
# Name of the binary to produce.
EXEC=FASTA2AGP
# The recipe line below must start with a TAB character.
$(EXEC): $(MAIN).java
	$(GCJ) $(GCJFLAGS) --main=$(MAIN) $(MAIN).java -o $(EXEC)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment