Skip to content
Snippets Groups Projects
ROCKparams.cpp 12.91 KiB
/*
 * ROCKparams.cpp
 *
 *  Created on: Jul 25, 2016
 *      Author: vlegrand
 */
#include <limits.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <cmath>
#include "FqConstants.h"
#include "ROCKparams.h"
using namespace std;

const int ROCKparams::output_ext;
const int ROCKparams::undef_ext;


/*
void ROCKparams::computeLambda() {
    unsigned long tmp=parms.filter_size;
    tmp*=1073741824; // this is in fact 1024*1024*1024.
    parms.lambda=tmp/UINT_MAX;
    if (parms.kappa>get_mask<unsigned char>::value) parms.lambda=parms.lambda/sizeof(unsigned short);
}*/
/*
int ROCKparams::getfilterPEMode() {
    return parms.filter_PE_separately;
}*/


void ROCKparams::processMainArgs(int optind, const int argc, char ** argv,std::vector<string>& v_input_lines) {
            if (optind==argc) return; // no arguments
            while (optind<argc) { // otherwise, only arguments expected are names of files to filter.
                string iline=argv[optind];
                v_input_lines.push_back(iline);
                optind++;
            }
        }


void ROCKparams::optArgConsistency(const string& input_file,const string& output_file,
                                   const int& g,CMSparams& parms,const int& nb_k_mers,
                                   const std::vector<string>& v_input_lines) {
    if (input_file.empty() && v_input_lines.empty()) {
        std::cout<<"You must provide filename via -i or arguments to indicate ROCK what to filter." <<std::endl;
        usage(EXIT_FAILURE);
    }
    if (!input_file.empty() && !v_input_lines.empty()) {
        std::cout<<"files to filter are provided via the -i option or as arguments. It cannot be both." <<std::endl;
        usage(EXIT_FAILURE);
    }
    if (g!=0 and parms.lambda!=0) {// user set both lambda and cms size=> inconsistency.
        cout<<"-l and -g options are mutually exclusive."<<endl;
        usage(EXIT_FAILURE);
    }
    if (nb_k_mers!=0 and parms.lambda!=0) {// user set both lambda and number of k-mers=> inconsistency.
        cout<<"-l and -n options are mutually exclusive."<<endl;
        usage(EXIT_FAILURE);
    }
    if (nb_k_mers) { // user indicated number of k-mers; use it to minimize parms.lambda
    	int smallest_kappa;
    	if (parms.kappa_prime==0) smallest_kappa=parms.kappa;
    	else smallest_kappa=parms.kappa_prime;
    	parms.lambda=getBestLambdaForN(nb_k_mers,smallest_kappa,Pi_js[0],k_max_collision_proba);
    }
    if (parms.kappa_prime>=parms.kappa) {
        cout<<"ERROR lower filter is higher than high filter!"<<endl;
        exit(EXIT_FAILURE);
    }
    if (qual_thres.nucl_score_threshold<0) {
        cout<<"ERROR nucleotide score threshold must be positive."<<endl;
        exit(EXIT_FAILURE);
    }

    if (qual_thres.min_correct_k_mers_in_read<1) {
        cout<<"minimum number of correct k-mers in read must be a positive integer."<<endl;
        exit(EXIT_FAILURE);
    }
    if (parms.lambda==0) {
        // This happens when the user doesn't specify lambda nor nb_k_mers.
        parms.lambda=k_proposed_lamda;
    }
#ifdef DEBUG
    cout<<"parms.lambda="<<parms.lambda<<" UINT_MAX="<<UINT_MAX<<endl;
#endif
    unsigned long cms_size=k_arr_cms_size;
    cms_size*=parms.lambda;
    if (parms.kappa>get_mask<unsigned char>::value) cms_size=cms_size*sizeof(unsigned short);
    cms_size=ceil(cms_size/1024.0/1024/1024); // convert to gigabytes.
    if (cms_size>getNodePhysMemory()-defaultGRPMAXSize) {
        cout<<"Not enough RAM on the machine to run rock with a CMS of size:"<<cms_size<<" GB."<<endl;
        cout<<" Maybe you should think of increasing the value for the low filter (-c option)"<<endl;
        exit(EXIT_FAILURE);
    }
}


/*
 * Loads the content of a text file (containing input fastq file names to be filtered or names of files that ROCK must generate).
 */
int ROCKparams::loadFileArgs(const std::string& afile,std::vector<string>& v_lines) {
    ifstream infiles_names(afile.c_str());
    if (!infiles_names) cout<<"couldn't open file: "<<afile<<endl;
    while (infiles_names) {
       string iline;
       if (!getline(infiles_names,iline)) break;
       v_lines.push_back(iline);
   }
   if (!infiles_names.eof())
   {
       std::cout<<"error while reading input or output file"<<std::endl;
       return EXIT_FAILURE;
   }
   return EXIT_SUCCESS;
}

/*
 * Reads the names of input and output fastq files that ROCK must process from text files whose names are passed as
 * argument of the -i/-o options. Fills the appropriate structures.
 * orresponding output files are supposed to be in the same order as input files. No checking here. It is ut to the user to give something correct in input.
 */
int ROCKparams::loadInOutFileArgs(const std::string& input_file,const std::string& output_file,std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) {
    int reti=EXIT_SUCCESS;
    int reto=EXIT_SUCCESS;
    if (!input_file.empty()) {
        reti=loadFileArgs(input_file,v_input_lines);
    }
    if (!output_file.empty()) {
        reto=loadFileArgs(output_file,v_output_lines);
    }
    if (reti==EXIT_SUCCESS && reto==EXIT_SUCCESS) return EXIT_SUCCESS;
    else return EXIT_FAILURE;
}

void ROCKparams::removePathfromFName(string& FName) {
    std::size_t i_found2 =FName.find_last_of(path_sep); // remove path from filename.
    if (i_found2!=std::string::npos) {
        FName=FName.substr(i_found2+1);
    }
}

void ROCKparams::changeExtension(string& FName,const int& extension_type) { // changes .fq into .rock.fq or adds .rock.fq
    std::size_t o_found = FName.find_last_of(k_ext);
    if (extension_type==ROCKparams::output_ext) {
        if (o_found!=std::string::npos) FName.replace(o_found,1,".rock.");
        else FName.append(".rock.fq");
    }
    else {
        if (o_found!=std::string::npos) FName.replace(o_found,1,".undefined.");
        else FName.append(".undefined.fq");
    }
}

string ROCKparams::genUndefFilename(const string& fname,const string& dname) {
    string undef_filename="";
    string new_name=fname;
    removePathfromFName(new_name);
    if (dname.compare(".")!=0) {
      undef_filename=dname;
      undef_filename.append("/");
    }
    changeExtension(new_name,ROCKparams::undef_ext);
    undef_filename.append(new_name);
    return undef_filename;
}

void ROCKparams::genOutFilenames(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines) {
    std::vector<string>::const_iterator it_in;
    string o_line;
    for (it_in=v_input_lines.begin();it_in!=v_input_lines.end();++it_in) {
        std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end);
        if (i_found!=std::string::npos) {// PE files are separared by a ','
            string i_name_PE1=(*it_in).substr(0,i_found);
            removePathfromFName(i_name_PE1);
            string i_name_PE2=(*it_in).substr(i_found+1);
            removePathfromFName(i_name_PE2);
            changeExtension(i_name_PE1,ROCKparams::output_ext);
            changeExtension(i_name_PE2,ROCKparams::output_ext);
            o_line=i_name_PE1;
            o_line+=k_sep_pair_end;
            o_line.append(i_name_PE2);

        } else {
            o_line=*it_in;
            removePathfromFName(o_line);
            changeExtension(o_line,ROCKparams::output_ext);
        }
        v_output_lines.push_back(o_line);
    }
}

int ROCKparams::processInOutFileArgs(const std::vector<string>& v_input_lines,std::vector<string>& v_output_lines,std::vector<IO_fq_files>& single_files,vector<PE_files>& v_PE_files,int& f_id) {
    if (v_output_lines.empty())  {
        // in that case, generate output filenames from input filenames
        genOutFilenames(v_input_lines,v_output_lines);
    }
    if (v_input_lines.size()!=v_output_lines.size()) {
        cout<< "Inconsistency between input and output files lists!"<<endl;
        return EXIT_FAILURE;
    } else {
        std::vector<string>::const_iterator it_in;
        std::vector<string>::const_iterator it_out;
        it_in=v_input_lines.begin();
        it_out=v_output_lines.begin();
        while (it_in!=v_input_lines.end()) {
            std::size_t i_found = (*it_in).find_first_of(k_sep_pair_end);
            if (i_found!=std::string::npos) {
                // this is PE
                f_id+=2;
                string i_name_PE1=(*it_in).substr(0,i_found);
                string i_name_PE2=(*it_in).substr(i_found+1);
                std::size_t o_found = (*it_out).find_first_of(k_sep_pair_end);
                if (o_found==std::string::npos) {
                    cout<< "Inconsistency between input and output files lists!"<<endl;
                    return EXIT_FAILURE;
                }
                string o_name_PE1=(*it_out).substr(0,o_found);
                string o_dir_PE1=checkDirExists(o_name_PE1);
                string o_name_PE2=(*it_out).substr(o_found+1);
                string o_dir_PE2=checkDirExists(o_name_PE2);
                //cout<<o_name_PE2<<endl;
                PE_files pe;
                pe.PE1.in_fq_file=i_name_PE1;
                pe.PE1.out_fq_file=o_name_PE1;
                pe.PE1.undef_fq_file=genUndefFilename(i_name_PE1,o_dir_PE1);
                pe.PE2.in_fq_file=i_name_PE2;
                pe.PE2.out_fq_file=o_name_PE2;
                pe.PE2.undef_fq_file=genUndefFilename(i_name_PE2,o_dir_PE2);
                v_PE_files.push_back(pe);

            } else {
                // this is single.
                f_id+=1;
                IO_fq_files p;
                p.in_fq_file=*it_in;
                string o_dir_single=checkDirExists(*it_out);
                p.undef_fq_file=genUndefFilename(*it_in,o_dir_single);
                p.out_fq_file=*it_out;
                single_files.push_back(p);
            }
            ++it_in;
            ++it_out;
        }
        if (f_id>k_max_input_files) {
           cout<<"ROCK cannot handle more than "<<k_max_input_files<<" input files."<<endl;
           return EXIT_FAILURE;
        }
    }
    return EXIT_SUCCESS;
}

void ROCKparams::initFromMainOptsArgs(int argc,char ** argv) {
    int i,q,m;
    std::vector<string> v_input_lines;
    std::vector<string> v_output_lines;
    static int PE_separately=1;

    while((i = getopt(argc, argv, "i:o:l:k:c:C:n:vq:m:")) != -1) {
        switch(i) {
            case 0:
                break;
            case 'i':
                input_file=optarg;break;
            case 'c':
                parms.kappa_prime=atoi(optarg);break;
            case 'h':
                usage(EXIT_SUCCESS); break;
            case 'o':
                output_file=optarg; break;
            case 'C':
                parms.kappa=atoi(optarg);
                if (parms.kappa<=0 || parms.kappa>get_mask<unsigned short>::value) {
                    cout<<"Bad value for kappa. Must choose kappa so that 0<kappa<="<<get_mask<unsigned short>::value<<endl;
                    usage(EXIT_FAILURE);
                }
                break;
            case 'l':
                parms.lambda = atoi(optarg);
                if (parms.lambda<=0) {
                    cout<<"Bad value for lambda. Choose a value that is >0 or let ROCK choose for you."<<endl;
                    usage(EXIT_FAILURE);
                }
                break;
            case 'k':
                k=atoi(optarg);
                if (k<=0 || k>32) {
                    cout<<"Bad value for k. Must choose k so that 0<k<=32."<<endl;
                    usage(EXIT_FAILURE);
                }
                break;
            case 'n':
                nb_k_mers=atoi(optarg);break; // number of distinct k-mers
            case 'v':
                verbose_mode=1;
                break;
            case 'q':
                q=atoi(optarg);
                if (q<0) {
                    cout<<"q must be >=0"<<endl;
                    usage(EXIT_FAILURE);
                }
                qual_thres.nucl_score_threshold=atoi(optarg)+k_phred_32;
                break;
            case 'm':
                m=atoi(optarg);
                if (m<1) {
                    cout<<"minimum number of valid k-mer for keeping a read must be an integer >=1"<<endl;
                    usage(EXIT_FAILURE);
                }

                qual_thres.min_correct_k_mers_in_read=atoi(optarg);
                break;
            default:
                usage(EXIT_FAILURE); break; }
    }
    processMainArgs(optind, argc, argv,v_input_lines);
    optArgConsistency(input_file,output_file,g,parms,nb_k_mers,v_input_lines);
    if (nb_k_mers) {
    	int smallest_kappa;
    	if (parms.kappa_prime==0) smallest_kappa=parms.kappa;
    	else smallest_kappa=parms.kappa_prime;
        expected_collision_proba=getCollisionProba(smallest_kappa,nb_k_mers,Pi_js[0],parms.lambda);
    }
    if ((v_input_lines.empty() || v_output_lines.empty()) && (loadInOutFileArgs(input_file,output_file,v_input_lines,v_output_lines)==EXIT_FAILURE)) throw EXIT_FAILURE;
    if (processInOutFileArgs(v_input_lines,v_output_lines,single_files,v_PE_files,f_id)!=EXIT_SUCCESS) throw EXIT_FAILURE;
}

FasqQualThreshold ROCKparams::getQualThreshold() {
    qual_thres.k=k;
    return qual_thres;
}