Commit 1df5e66d authored by Amine GHOZLANE

Initial commit

# MBMA
A Mapping Based Microbiome Analysis tool for species and resistance gene quantification
## About
A major challenge in the field of metagenomics is to quickly and precisely determine the composition and the relative abundance of the constituents of a microbial community. State-of-the-art metagenomic methods rely on mapping-, kmer- or assembly-based approaches for identification and quantification. However, currently available methods are either time-consuming or insufficiently sensitive, and they generate false positives, which leads to a lack of accuracy in quantification at the gene and strain levels.
To overcome these limitations, we developed Mapping Based Microbiome Analysis (MBMA), a mapping-based approach for the identification and quantification of species, strains and resistance genes, with three main innovations, the use of: <br />
- an **efficient and discriminatory database** for rapid quantification, <br />
- an **advanced counting method** to decrease the false discovery rate, <br />
- all **combined with variant calling** on the sample sequences for an accurate abundance prediction.
**MBMA** identifies and quantifies constituents (species, resistance genes) of metagenomic samples by mapping reads against a database and performing variant calling. Other constituents can be quantified by changing the database. It has three modes of operation (a sketch of typical invocations follows this list):
- **mapping**, it simply maps reads against an indexed reference database, using different mapping tools (bowtie2, bwa and novoalign) and different counting methods (best, ex-aequo and shared), for non-redundant databases.
- **variant**, it performs, in addition to the mapping step, a variant calling step. Reads are mapped against a clustered database, and variant calling with GATK is then performed, to enable accurate quantification at the gene level.
- **mode**, it provides two preset modes for the identification of species (option: --species) and resistance genes (option: --resistance). For bacterial species, reads are mapped against *RefMG.v1*; for resistance genes, against *ResFinder*.
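
The sketch below is a rough illustration of how these three modes might be invoked. It is indicative only: the entry-point name (`mbma.py`) and every option except `--species` and `--resistance` are placeholders rather than the documented interface (see the Usage section).

```sh
# Illustrative sketch only -- the script name and most options below are placeholders.
python mbma.py mapping --aligner bowtie2 --counting shared -i reads/ -db my_db.fasta -o counts/
python mbma.py variant -i reads/ -db clustered_db.fasta -o counts/        # mapping + GATK variant calling
python mbma.py mode --species -i reads/ -o species_counts/                # preset: species against RefMG.v1
python mbma.py mode --resistance -i reads/ -o resistance_counts/          # preset: resistance genes against ResFinder
```
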
## Version
1.0
## Requirements
## Usage
## Examples
library(ggplot2)
# Input file (hard-coded path; no command-line arguments are read here)
filename = "../data/mock/perc_especes_mdg.txt"
# Read the input file
d = read.table(filename, header=T ,sep='\t', stringsAsFactors = F)
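# Build new_d: the rows of each sample (Echantillon), ordered by decreasing Percentage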
new_d=c()
Ech = unique(d$Echantillon)
for(i in Ech){
sample = d[d$Echantillon == i,]
sample = sample[order(-sample$Percentage),]
new_d=rbind(new_d,sample)
}
## species
ggplot(new_d, aes(x=Echantillon, y=Percentage)) +
  geom_bar(aes(fill=Species), width=0.75, colour="white", stat="identity") +
  ylab("Percentage") +
  xlab("Sample") +
  scale_y_continuous(labels = scales::percent) +
  theme(strip.text=element_text(size=14, face='bold'),
        legend.position="none",
        axis.text.x = element_text(angle = 90, hjust=1),
        axis.title = element_text(size = 15, face ='bold'),
        legend.title = element_text(face = 'bold', size = 12))
#### resistance genes
ggplot(new_d, aes(x=Echantillon, y=Percentage)) +
  geom_bar(aes(fill=Resistance.genes), width=0.75, colour="white", stat="identity") +
  ylab("Relative abundance") +
  xlab("Sample") +
  scale_y_continuous(labels = scales::percent) +
  theme(strip.text=element_text(size=14, face='bold'),
        legend.position="none",
        axis.text.x = element_text(angle = 90, hjust=1),
        axis.title = element_text(size = 15, face ='bold'),
        legend.title = element_text(face = 'bold', size = 12))
write.table(new_d, file="resist_krem_ordonné.txt", row.names=FALSE, quote=FALSE, sep='\t', col.names = colnames(new_d))
## Compute the Kullback-Leibler (KL) divergence
# function
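# KL(P||Q) = sum_i p_i * log(p_i / q_i); x1 and x2 are first normalised to sum to 1,
# and only the components where both distributions are non-zero contribute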
KLtest = function(x1, x2)
{
  res = 0
  x1 = x1/sum(x1)
  x2 = x2/sum(x2)
  for(i in 1:length(x1)){
    if(x1[i] != 0 && x2[i] != 0)
    {
      res = res + x1[i]*log(x1[i]/x2[i])
    }
  }
  return(res)
}
#value = KLtest(percentage1, percentage2)
library("optparse")
option_list = list(
make_option(c("-f", "--file"), type = "character", default=NULL,
help="count matrice file name", metavar="character"),
make_option(c("-l","--label"), type = "character", default = "MBMA",
help = "label for the analysis [default= %default]", metavar = "character"),
make_option(c("-o", "--out"), type="character", default="out.txt",
help="output file name [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$file)){
print_help(opt_parser)
stop("At least one argument must be supplied (input file)", call.=FALSE)
}
# Read the input file
matrix = read.table(opt$file, header=T ,sep='\t', stringsAsFactors = F)
# Extract the gene ID from each reference name
geneID = NULL
for(id in matrix$ref_genome) {
geneID = c(geneID, unlist(strsplit(id, "_"))[1])
}
# Add the gene ID column to the count matrix
matrix = cbind(geneID, matrix)
head = c("Echantillon", "Program", "Resistance genes", "Percentage", "Rpkm", "Nb mapped reads")
echantillon = c()
resistance = c()
percentage = c()
rpkm_ech = c()
mapped_ech = c()
col = colnames(matrix)
for(i in 4:length(col)){
sample = data.frame(matrix$geneID, matrix[i])
colnames(sample) = c("geneID", "count")
rpkm = (sample$count * (10**9))/(matrix$Size * as.numeric(sum(sample$count)))
sample = cbind(sample, rpkm)
sample = aggregate(cbind(rpkm, count) ~ geneID, sample, FUN=sum)
perc = round(sample$rpkm/sum(sample$rpkm),3)
sample = cbind(sample, perc)
sample = sample[sample$perc != 0,]
# sample_name = unlist(strsplit(col[i],"[.]|_"))[1]
echantillon = c(echantillon,rep(col[i], dim(sample)[1]))
resistance = c(resistance,as.character(sample$geneID))
percentage = c(percentage, sample$perc)
rpkm_ech = c(rpkm_ech, sample$rpkm)
mapped_ech = c(mapped_ech, sample$count)
}
mod = rep(opt$label,length(echantillon))
df = data.frame(echantillon, mod, resistance, percentage, rpkm_ech, mapped_ech)
write.table(df, file=opt$out, row.names=FALSE, quote=FALSE, sep='\t', col.names = head)
# Read command-line arguments
library("optparse")
option_list = list(
make_option(c("-f", "--file"), type = "character", default=NULL,
help="count matrice file name", metavar="character"),
make_option(c("-r", "--reference_db"), type="character", default=NULL,
help="dabases genes informations", metavar = "character"),
make_option(c("-l","--label"), type = "character", default = "MBMA",
help = "label for the analysis [default= %default]", metavar = "character"),
make_option(c("-o", "--out"), type="character", default="out.txt",
help="output file name [default= %default]", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$file) || is.null(opt$reference_db)){
print_help(opt_parser)
stop("At least two arguments must be supplied (input file & reference file)", call.=FALSE)
}
# Read the input files
matrix = read.table(opt$file, header=T ,sep='\t', stringsAsFactors = F)
ref=read.table(opt$reference_db,header=T, sep='\t')
# Extract the TaxID of each strain
TaxID = NULL
for(id in matrix$ref_genome) {
TaxID = c(TaxID, unlist(strsplit(id, "[.]"))[1])
}
# Map each TaxID to its species name
name = NULL
for(id in TaxID){
species = ref[which(ref$TaxID == id),]$Species
name=c(name, as.character(species))
}
# Add the TaxID and species name columns to the count matrix
matrix = cbind(TaxID, name, matrix)
head = c("Echantillon", "Program", "Species", "Percentage", "Rpkm", "Nb mapped reads")
echantillon = c()
species = c()
percentage = c()
rpkm_ech = c()
mapped_ech = c()
col = colnames(matrix)
for(i in 5:length(col)){
sample = data.frame(matrix$name, matrix$TaxID, matrix[i])
colnames(sample) = c("reference", "TaxID", "count")
rpkm = (sample$count * (10**9))/(matrix$Size * as.numeric(sum(sample$count)))
sample = cbind(sample, rpkm)
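# First aggregate: for each strain (TaxID, reference), keep the mean (mn) and sum (s) of rpkm and count;
# second aggregate: for each species (reference), sum the mean rpkm (V1) and the summed counts (V2) over its strains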
sample = aggregate(cbind(rpkm, count) ~ TaxID + reference, sample, FUN=function(x) c(mn=mean(x), s=sum(x) ))
sample = aggregate(cbind(rpkm[,1], count[,2]) ~ reference, sample, sum)
perc = round(sample$V1/sum(sample$V1),3)
sample = cbind(sample, perc)
sample = sample[sample$perc != 0,]
# sample_name = unlist(strsplit(col[i],"[.]|_"))[1]
echantillon = c(echantillon,rep(col[i], dim(sample)[1]))
species = c(species,as.character(sample$reference))
percentage = c(percentage, sample$perc)
rpkm_ech = c(rpkm_ech, sample$V1)
mapped_ech = c(mapped_ech, sample$V2)
}
mod = rep(opt$label,length(echantillon))
df = data.frame(echantillon, mod, species, percentage, rpkm_ech, mapped_ech)
write.table(df, file=opt$out, row.names=FALSE, quote=FALSE, sep='\t', col.names = head)
#! /usr/bin/env python
# -*- coding: utf8 -*-
__author__ = "Anita Annamalé"
__version__ = "1.0"
__copyright__ = "copyleft"
__date__ = "2016/05"
#-------------------------- MODULES IMPORTATION -------------------------------#
import sys
import os
#-------------------------- FUNCTIONS DEFINITION ------------------------------#
def parse_cltr(filename):
    """
    Function that parses a cluster information file from CD-HIT-EST.

    Args:
        filename [STR] = cluster information file name

    Returns:
        annot [DICT] = contains a gene as key and its cluster name as value
        clstr_vs_ref [DICT] = contains a cluster name as key and the cluster
                              representative gene as value
    """
    # initialize
    annot = {}
    clstr_vs_ref = {}

    with open(filename, "rt") as infile:
        for line in infile:
            if line.startswith('>Cluster '):
                clst_nb = line.rstrip()
                clst_nb = clst_nb.replace(" ", "")  # number of the cluster
            else:
                genes_info = line.split(" ")
                # annotate each gene to a cluster
                annot[genes_info[1][:-3]] = clst_nb[1:]
                # if the gene is the representative gene of the cluster
                if genes_info[2] == '*\n':
                    # annotate each cluster to its reference sequence
                    clstr_vs_ref[clst_nb[1:]] = genes_info[1][1:-3]

    return annot, clstr_vs_ref
def create_multifasta(fasta_file, annot, clstr_vs_ref, outdir):
    """
    Function that creates multifasta files in the provided directory (outdir).
    Multifasta files are named after the cluster representative, and contain
    all the genes that are similar to that representative.

    Args:
        fasta_file [STR] = fasta file of the unclustered database
        annot [DICT] = contains a gene as key and its cluster name as value
        clstr_vs_ref [DICT] = contains a cluster name as key and the cluster
                              representative gene as value
        outdir [STR] = output directory name

    No return
    """
    # initialize
    fasta_data = {}

    # open the database fasta file and get all information
    with open(fasta_file, "rt") as multifasta:
        for line in multifasta:
            if line.startswith('>'):
                name = line.rstrip()                 # get gene name
                clstr_name = annot[name]             # get gene cluster name
                ref_name = clstr_vs_ref[clstr_name]  # get cluster
                                                     # representative gene name
            # append the line (header or sequence) to the fasta dictionary
            fasta_data.setdefault(ref_name, []).append(line)

    # write all the multifasta files
    for element in fasta_data:  # for each representative
        with open('{0}/{1}.fa'.format(outdir, element), "wt") as output:
            output.write("".join(fasta_data[element]))  # write sequences
#--------------------------------- MAIN ---------------------------------------#

if __name__ == '__main__':

    # command-line parsing
    if not len(sys.argv) == 4:
        msg = ("usage: python cluster2multifasta.py <file.clstr> <file.fasta> "
               "<fasta_dir>")
        print msg
        exit(1)

    # get absolute paths
    clstr_name = os.path.abspath(sys.argv[1])
    fasta_name = os.path.abspath(sys.argv[2])
    dir_name = os.path.abspath(sys.argv[3])

    # parse the cluster information file from CD-HIT
    annotate_gene, get_ref = parse_cltr(clstr_name)

    # create the output directory in the working directory
    os.mkdir(dir_name)

    # create multifasta files
    create_multifasta(fasta_name, annotate_gene, get_ref, dir_name)
#!/bin/sh
#$ -S /bin/bash
#$ -M {email}
#$ -m bea
#$ -q {queue}
#$ -N {jobname}
#$ -j y
#$ -o {stdout}
### Import your modules
module add Python/2.7.8 java/1.8.0 cd-hit/v4.6.1 muscle/3.8.31
###PROGRAMS
# clustering
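# cd-hit-est options: -c = sequence identity threshold, -aS 0.9 = alignment must cover at least
# 90% of the shorter sequence, -d 0 = keep the full sequence name (up to the first space) in the
# .clstr file, -g 1 = assign each sequence to its most similar cluster, -r 1 = compare both strands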
cd-hit-est -i {database} -c {id_threshold} -aS 0.9 -d 0 -g 1 -r 1 -o {clustered_db}
# create multifasta
python {script_loc}/cluster2multifasta.py {clustered_db}.clstr {database} {clstr_fasta}
# keep only the cluster fasta files that contain more than one sequence
mkdir {clstr_fasta}/cluster
for file in {clstr_fasta}/*.fa; do a=`grep '>' $file | wc -l`; if [ ! $a == 1 ] ; then mv $file {clstr_fasta}/cluster/ ; fi; done;
# alignment
mkdir {output}/alignment
for file in {clstr_fasta}/cluster/*.fa; do muscle -in $file -out {output}/alignment/`basename $file` -clwstrict; done;
for file in {output}/alignment/*.fa; do mv $file ${{file%fa}}aln; done;
# VCF calling
mkdir {output}/vcf_db
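# msa2vcf (from jvarkit) converts each MUSCLE alignment into a VCF; -c and -R are presumably the
# sample/chromosome name and the reference name, set here to the cluster representative (the file basename)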
for file in {output}/alignment/*.aln ; do {script_loc}/tools/jvarkit/dist/msa2vcf $file -c `basename ${{file%.aln}}` -R `basename ${{file%.aln}}` -o {output}/vcf_db/`basename ${{file%aln}}`vcf ; done;
#! /usr/bin/env python
# -*- coding: utf8 -*-
"""
"""
__author__ = "Anita Annamalé"
__version__ = "2.0"
__copyright__ = "copyleft"
__date__ = "2016/07"
#-------------------------- MODULES IMPORTATION -------------------------------#
import sys
import argparse
import os
import subprocess
import shlex
#-------------------------- FUNCTIONS DEFINITION ------------------------------#
def is_fasta(path):
    """
    Check if a path is an existing FASTA file.

    Args:
        path [STR] = Path to the file

    Returns:
        abs_path [STR] = absolute path
        or quit
    """
    abs_path = os.path.abspath(path)  # get absolute path

    # if not a file
    if not os.path.isfile(abs_path):
        # check if it is a path to a directory
        if os.path.isdir(abs_path):
            msg = "{0} is a directory not a file.".format(abs_path)
        # else the path does not exist
        else:
            msg = "The path {0} does not exist.".format(abs_path)
        raise argparse.ArgumentTypeError(msg)
    else:
        ext = os.path.splitext(abs_path)[1]  # get the file extension
        if (ext != '.fa') and (ext != '.fasta'):
            msg = ("{0} isn't a fasta file. Please, provide a database in "
                   "FASTA format with extension .fasta or .fa".format(abs_path))
            raise argparse.ArgumentTypeError(msg)

    return abs_path
def is_dir(path):
    """
    Check if a given path is an existing directory.

    Args:
        path [string] = Path to the directory

    Returns:
        abs_path [string] = absolute path
        or quit
    """
    abs_path = os.path.abspath(path)  # get absolute path

    # if not a directory
    if not os.path.isdir(abs_path):
        # check if it is a path to a file
        if os.path.isfile(abs_path):
            msg = "{0} is a file not a directory.".format(abs_path)
        # else the path does not exist
        else:
            msg = "The path {0} does not exist.".format(abs_path)
        raise argparse.ArgumentTypeError(msg)

    return abs_path
def write_soum(template, param):
    """
    Write the qsub submission script.

    Args:
        template [STR] = path to the submission script template
        param [dict] = contains all information needed to create the
                       submission script

    No returns
    """
    # read the template
    filein = open(template, "rt")
    temp = filein.read()

    # write the submission script
    with open("{0}/submission.sh".format(param['output']), "wt") as out:
        out.write(temp.format(**param))

    # close the template
    filein.close()
#---------------------------- PROGRAMME MAIN ----------------------------------#

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description="VCF matrices generation "
                                                 "program")

    parser.add_argument('database',
                        metavar='database',
                        type=is_fasta,
                        help="input database in FASTA format to cluster\n")

    parser.add_argument("-e", "--email",
                        type=str,
                        required=True,
                        metavar="STRING",
                        help="Email for qsub notification. Should be a "
                             "Pasteur email.\n\n")

    parser.add_argument("-q", "--queue",
                        type=str,
                        required=True,
                        metavar="STRING",
                        help="Queue name for SGE qsub; there is a specific "
                             "queue for your team.\n\n")

    parser.add_argument("-j", "--jobname",
                        type=str,
                        nargs='?',
                        default="Job",
                        metavar="STRING",
                        help="The name for this qsub job. The jobname will "
                             "appear in the qstat box.\n"
                             "Usage: -j dataset1 [default 'Job']\n\n")

    parser.add_argument('-output',
                        action='store',
                        metavar='DIR',
                        type=is_dir,
                        default=os.getcwd(),
                        help="output path\n")

    parser.add_argument('-id_threshold',
                        action='store',
                        metavar='float',
                        type=float,
                        default=0.90,
                        help="sequence identity threshold for clustering two "
                             "sequences. Must be between 0 and 1 "
                             "[default: 0.90]\n")
    # command-line parsing
    if not len(sys.argv) > 1:
        parser.print_help()
        exit(1)

    try:
        args = parser.parse_args()
        args = vars(args)
    except:
        exit(1)

    # clean the dictionary
    for key, value in args.items():
        if value is None:
            del args[key]

    # get script location
    script_path = os.path.dirname(os.path.abspath(sys.argv[0]))

    # complete the dictionary
    database = args['database']
    basename_wo_ext = os.path.splitext(os.path.basename(database))[0]
    clusterized_db = '{0}/cdhit_{1}_{2}.fa'.format(args['output'],
                                                   args['id_threshold'],
                                                   basename_wo_ext)
    args['clustered_db'] = clusterized_db
    args['script_loc'] = script_path
    args['clstr_fasta'] = '{0}/clstr_fasta'.format(args['output'])
    args['stdout'] = '{0}/{1}.out'.format(args['output'], args['jobname'])

    # create a submission script
    submission_template = script_path + '/jobs_make_matrices.sh'
    try:
        write_soum(submission_template, args)
    except (IOError, OSError):
        sys.exit("Template file is missing")

    # execute the job