-
Veronique Legrand authoredVeronique Legrand authored
ROCKparams.cpp 12.91 KiB
/*
* ROCKparams.cpp
*
* Created on: Jul 25, 2016
* Author: vlegrand
*/
#include <limits.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include "FqConstants.h"
#include "ROCKparams.h"
using namespace std;
const int ROCKparams::output_ext;
const int ROCKparams::undef_ext;
/*
void ROCKparams::computeLambda() {
unsigned long tmp=parms.filter_size;
tmp*=1073741824; // this is in fact 1024*1024*1024.
parms.lambda=tmp/UINT_MAX;
if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short);
}*/
/*
int ROCKparams::getfilterPEMode() {
return parms.filter_PE_separately;
}*/
void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) {
if (optind==argc) return; // no arguments
while (optind<argc) { // otherwise, only arguments expected are names of files to filter.
string iline=argv[optind];
v_input_lines.push_back(iline);
optind++;
}
}
void ROCKparams::optArgConsistency(const string& input_file,const string& output_file,
const int& g,CMSparams& parms,const int& nb_k_mers,
const std::vector<string>& v_input_lines) {
if (input_file.empty() && v_input_lines.empty()) {
std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl;
usage(EXIT_FAILURE);
}
if (!input_file.empty() && !v_input_lines.empty()) {
std::cout<<"files to filter are provided via the -i option or as arguments. It cannot be both." <<std::endl;
usage(EXIT_FAILURE);
}
if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency.
cout<<"-l and -g options are mutually exclusive."<<endl;
usage(EXIT_FAILURE);
}
if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and number of k-mers=> inconsistency.
cout<<"-l and -n options are mutually exclusive."<<endl;
usage(EXIT_FAILURE);
}
if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda
int smallest_kappa;
if (parms.kappa_prime==0) smallest_kappa=parms.kappa;
else smallest_kappa=parms.kappa_prime;
parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,Pi_js[0],k_max_collision_proba);
}
if (parms.kappa_prime>=parms.kappa) {
cout<<"ERROR lower filter is higher than high filter!"<<endl;
exit(EXIT_FAILURE);
}
if (qual_thres.nucl_score_threshold<0) {
cout<<"ERROR nucleotide score threshold must be positive."<<endl;
exit(EXIT_FAILURE);
}
if (qual_thres.min_correct_k_mers_in_read<1) {
cout<<"minimum number of correct k-mers in read must be a positive integer."<<endl;
exit(EXIT_FAILURE);
}
if (parms.lambda==0) {
// This happens when the user doesn't specify lambda nor nb_k_mers.
parms.lambda=k_proposed_lamda;
}
#ifdef DEBUG
cout<<"parms.lambda="<<parms.lambda<<" UINT_MAX="<<UINT_MAX<<endl;
#endif
unsigned long cms_size=k_arr_cms_size;
cms_size*=parms.lambda;
if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short);
cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes.
if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) {
cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl;
cout<<" Maybe you should think of increasing the value for the low filter (-c option)"<<endl;
exit(EXIT_FAILURE);
}
}
/*
* Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate).
*/
int ROCKparams::loadFileArgs(const std::string& afile,std::vector<string>& v_lines) {
ifstream infiles_names(afile.c_str());
if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl;
while (infiles_names) {
string iline;
if (!getline(infiles_names,iline)) break;
v_lines.push_back(iline);
}
if (!infiles_names.eof())
{
std::cout<<"error while reading input or output file"<<std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
/*
* Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as
* argument of the -i/-o options. Fills the appropriate structures.
* orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input.
*/
int ROCKparams::loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) {
int reti=EXIT_SUCCESS;
int reto=EXIT_SUCCESS;
if (!input_file.empty()) {
reti=loadFileArgs(input_file,v_input_lines);
}
if (!output_file.empty()) {
reto=loadFileArgs(output_file,v_output_lines);
}
if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS;
else return EXIT_FAILURE;
}
void ROCKparams::removePathfromFName(string& FName) {
std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename.
if (i_found2!=std::string::npos) {
FName=FName.substr(i_found2+1);
}
}
void ROCKparams::changeExtension(string& FName,const int& extension_type) { // changes .fq into .rock.fq or adds .rock.fq
std::size_t o_found = FName.find_last_of(k_ext);
if (extension_type==ROCKparams::output_ext) {
if (o_found!=std::string::npos) FName.replace(o_found,1,".rock.");
else FName.append(".rock.fq");
}
else {
if (o_found!=std::string::npos) FName.replace(o_found,1,".undefined.");
else FName.append(".undefined.fq");
}
}
string ROCKparams::genUndefFilename(const string& fname,const string& dname) {
string undef_filename="";
string new_name=fname;
removePathfromFName(new_name);
if (dname.compare(".")!=0) {
undef_filename=dname;
undef_filename.append("/");
}
changeExtension(new_name,ROCKparams::undef_ext);
undef_filename.append(new_name);
return undef_filename;
}
void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) {
std::vector<string>::const_iterator it_in;
string o_line;
for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) {
std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end);
if (i_found!=std::string::npos) {// PE files are separared by a ','
string i_name_PE1=(*it_in).substr(0,i_found);
removePathfromFName(i_name_PE1);
string i_name_PE2=(*it_in).substr(i_found+1);
removePathfromFName(i_name_PE2);
changeExtension(i_name_PE1,ROCKparams::output_ext);
changeExtension(i_name_PE2,ROCKparams::output_ext);
o_line=i_name_PE1;
o_line+=k_sep_pair_end;
o_line.append(i_name_PE2);
} else {
o_line=*it_in;
removePathfromFName(o_line);
changeExtension(o_line,ROCKparams::output_ext);
}
v_output_lines.push_back(o_line);
}
}
int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) {
if (v_output_lines.empty()) {
// in that case, generate output filenames from input filenames
genOutFilenames(v_input_lines,v_output_lines);
}
if (v_input_lines.size()!=v_output_lines.size()) {
cout<< "Inconsistency between input and output files lists!"<<endl;
return EXIT_FAILURE;
} else {
std::vector<string>::const_iterator it_in;
std::vector<string>::const_iterator it_out;
it_in=v_input_lines.begin();
it_out=v_output_lines.begin();
while (it_in!=v_input_lines.end()) {
std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end);
if (i_found!=std::string::npos) {
// this is PE
f_id+=2;
string i_name_PE1=(*it_in).substr(0,i_found);
string i_name_PE2=(*it_in).substr(i_found+1);
std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end);
if (o_found==std::string::npos) {
cout<< "Inconsistency between input and output files lists!"<<endl;
return EXIT_FAILURE;
}
string o_name_PE1=(*it_out).substr(0,o_found);
string o_dir_PE1=checkDirExists(o_name_PE1);
string o_name_PE2=(*it_out).substr(o_found+1);
string o_dir_PE2=checkDirExists(o_name_PE2);
//cout<<o_name_PE2<<endl;
PE_files pe;
pe.PE1.in_fq_file=i_name_PE1;
pe.PE1.out_fq_file=o_name_PE1;
pe.PE1.undef_fq_file=genUndefFilename(i_name_PE1,o_dir_PE1);
pe.PE2.in_fq_file=i_name_PE2;
pe.PE2.out_fq_file=o_name_PE2;
pe.PE2.undef_fq_file=genUndefFilename(i_name_PE2,o_dir_PE2);
v_PE_files.push_back(pe);
} else {
// this is single.
f_id+=1;
IO_fq_files p;
p.in_fq_file=*it_in;
string o_dir_single=checkDirExists(*it_out);
p.undef_fq_file=genUndefFilename(*it_in,o_dir_single);
p.out_fq_file=*it_out;
single_files.push_back(p);
}
++it_in;
++it_out;
}
if (f_id>k_max_input_files) {
cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl;
return EXIT_FAILURE;
}
}
return EXIT_SUCCESS;
}
void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) {
int i,q,m;
std::vector<string> v_input_lines;
std::vector<string> v_output_lines;
static int PE_separately=1;
while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:")) != -1) {
switch(i) {
case 0:
break;
case 'i':
input_file=optarg;break;
case 'c':
parms.kappa_prime=atoi(optarg);break;
case 'h':
usage(EXIT_SUCCESS); break;
case 'o':
output_file=optarg; break;
case 'C':
parms.kappa=atoi(optarg);
if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) {
cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl;
usage(EXIT_FAILURE);
}
break;
case 'l':
parms.lambda = atoi(optarg);
if (parms.lambda<=0) {
cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl;
usage(EXIT_FAILURE);
}
break;
case 'k':
k=atoi(optarg);
if (k<=0 || k>32) {
cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl;
usage(EXIT_FAILURE);
}
break;
case 'n':
nb_k_mers=atoi(optarg);break; // number of distinct k-mers
case 'v':
verbose_mode=1;
break;
case 'q':
q=atoi(optarg);
if (q<0) {
cout<<"q must be >=0"<<endl;
usage(EXIT_FAILURE);
}
qual_thres.nucl_score_threshold=atoi(optarg)+k_phred_32;
break;
case 'm':
m=atoi(optarg);
if (m<1) {
cout<<"minimum number of valid k-mer for keeping a read must be an integer >=1"<<endl;
usage(EXIT_FAILURE);
}
qual_thres.min_correct_k_mers_in_read=atoi(optarg);
break;
default:
usage(EXIT_FAILURE); break; }
}
processMainArgs(optind, argc, argv,v_input_lines);
optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines);
if (nb_k_mers) {
int smallest_kappa;
if (parms.kappa_prime==0) smallest_kappa=parms.kappa;
else smallest_kappa=parms.kappa_prime;
expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,Pi_js[0],parms.lambda);
}
if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE;
if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE;
}
FasqQualThreshold ROCKparams::getQualThreshold() {
qual_thres.k=k;
return qual_thres;
}