diff --git a/containers/FastTree.c b/containers/FastTree.c deleted file mode 100644 index 997b5a75172eea6f28851162546ef5ef2b3e5ff9..0000000000000000000000000000000000000000 --- a/containers/FastTree.c +++ /dev/null @@ -1,10304 +0,0 @@ -/* - * FastTree -- inferring approximately-maximum-likelihood trees for large - * multiple sequence alignments. - * - * Morgan N. Price - * http://www.microbesonline.org/fasttree/ - * - * Thanks to Jim Hester of the Cleveland Clinic Foundation for - * providing the first parallel (OpenMP) code, Siavash Mirarab of - * UT Austin for implementing the WAG option, Samuel Shepard - * at the CDC for suggesting and helping with the -quote option, and - * Aaron Darling (University of Technology, Sydney) for numerical changes - * for wide alignments of closely-related sequences. - * - * Copyright (C) 2008-2015 The Regents of the University of California - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- * or visit http://www.gnu.org/copyleft/gpl.html - * - * Disclaimer - * - * NEITHER THE UNITED STATES NOR THE UNITED STATES DEPARTMENT OF ENERGY, - * NOR ANY OF THEIR EMPLOYEES, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, - * OR ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY, - * COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT, - * OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT INFRINGE - * PRIVATELY OWNED RIGHTS. - */ - -/* - * To compile FastTree, do: - * gcc -Wall -O3 -finline-functions -funroll-loops -o FastTree -lm FastTree.c - * Use -DNO_SSE to turn off use of SSE3 instructions - * (should not be necessary because compiler should not set __SSE__ if - * not available, and modern mallocs should return 16-byte-aligned values) - * Use -DOPENMP -fopenmp to use multiple threads (note, old versions of gcc - * may not support -fopenmp) - * Use -DTRACK_MEMORY if you want detailed reports of memory usage, - * but results are not correct above 4GB because mallinfo stores int values. - * It also makes FastTree run significantly slower. - * - * To get usage guidance, do: - * FastTree -help - * - * FastTree uses profiles instead of a distance matrix, and computes - * support values for each split from the profiles of the 4 nodes - * around the split. It stores a profile for each node and a average - * profile over all active nodes (the "out-profile" for computing the - * total sum of distance to other nodes). The neighbor joining phase - * requires O(N*L*a) space, where N is the number of sequences, L is - * the alignment width, and a is the alphabet size. The top-hits - * heuristic requires an additional O(N sqrt(N)) memory. After - * neighbor-joining, FastTree improves the topology with - * nearest-neighbor interchanges (NNIs) and subtree-prune-regraft - * moves (SPRs), which does not have a significant additional memory - * requirement. 
(We need only store "up-profiles" on the path from our - * current traversal point to the root.) These take O(NLa) time per - * round, and with default settings, O(N log(N) L a) time total. - * FastTree further improves the topology with maximum-likelihood - * NNIs, using similar data structures and complexity, but with a - * higher constant factor, and now the "profiles" are actually - * posterior distributions for that subtree. Finally, FastTree - * resamples the site likelihoods around each NNI and uses - * the Shimodaira Hasegawa test to estimate the reliability of each split. - * - * Overview of the neighbor-joining phase: - * - * Although FastTree uses a log correction on profile distances to - * account for multiple substitutions when doing NNIs and SPRs, the - * operations on the profiles themselves involve "additive" distances - * -- either %different (for nucleotide) or by using an amino acid - * similarity matrix (for proteins). If we are using %different as - * our distance matrix then - * - * Profile_distance(A,B) = 1 - sum over characters of freq(A)*freq(B) - * - * and we can average this value over positions. Positions with gaps - * are weighted by %ungapped(A) * %ungapped(B). - * - * If we are using an amino acid dissimilarity matrix D(i,j) then at - * each position - * - * Profile_distance(A,B) = sum(i,j) freq(A==i) * freq(B==j) * D(i,j) - * = sum(k) Ak * Bk * Lambda(k) - * - * where k iterates over 20 eigenvectors, Lambda(k) is the eigenvalue, - * and if A==i, then Ak is the kth column of the inverse of the - * eigenvector matrix. - * - * The exhaustive approach (-slow) takes O(N**3*L*a) time, but - * this can be reduced to as little as O(N**(3/2)*log(N)*L*a) time - * by using heuristics. - * - * It uses a combination of three heuristics: a visible set similar to - * that of FastTree (Elias & Lagergren 2005), a local hill-climbing - * search for a better join (as in relaxed neighbor-joining, Evans et - * al. 
2006), and a top-hit list to reduce the search space (see - * below). - * - * The "visible" set stores, for each node, the best join for that - * node, as identified at some point in the past - * - * If top-hits are not being used, then the neighbor-joining phase can - * be summarized as: - * - * Compute the out-profile by averaging the leaves - * Compute the out-distance of each leaf quickly, using the out-profile - * Compute the visible set (or approximate it using top-hits, see below) - * Until we're down to 3 active nodes: - * Find the best join in the visible set - * (This involves recomputing the neighbor-joining criterion, - * as out-distances and #active nodes may have changed) - * Follow a chain of best hits (again recomputing the criterion) - * until we find a locally best join, as in relaxed neighbor joining - * Create a profile of the parent node, either using simple averages (default) - * or using weighted joining as in BIONJ (if -bionj was specified) - * Update the out-profile and the out-distances - * Update the visible set: - * find the best join for the new joined node - * replace hits to the joined children with hits to the parent - * if we stumble across a join for the new node that is better - * than the corresponding entry in the visible set, "reset" - * that entry. - * - * For each iteration, this method does - * O(N) work to find the best hit in the visible set - * O(L*N*a*log(N)) work to do the local search, where log(N) - * is a pessimistic estimate of the number of iterations. In - * practice, we average <1 iteration for 2,000 sequences. - * With -fastest, this step is omitted. - * O(N*a) work to compute the joined profile and update the out-profile - * O(L*N*a) work to update the out-distances - * O(L*N*a) work to compare the joined profile to the other nodes - * (to find the new entry in the visible set) - * - * and there are N-3 iterations, so it takes O(N**2 * L * log(N) * a) time. 
- * - * The profile distances give exactly the same result as matrix - * distances in neighbor-joining or BIONJ would if there are no gaps - * in the alignment. If there are gaps, then it is an - * approximation. To get the same result we also store a "diameter" - * for each node (diameter is 0 for leaves). - * - * In the simpler case (NJ rather than BIONJ), when we join A and B to - * give a new node AB, - * - * Profile(AB) = (A+B)/2 - * Profile_distance(AB,C) = (Profile_distance(A,C)+Profile_distance(B,C))/2 - * because the formulas above are linear - * - * And according to the neighbor-joining rule, - * d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2 - * - * and we can achieve the same value by writing - * diameter(AB) = pd(A,B)/2 - * diameter(leaf) = 0 - * d(A,B) = pd(A,B) - diameter(A) - diameter(B) - * - * because - * d(AB,C) = (d(A,C)+d(B,C)-d(A,B))/2 - * = (pd(A,C)-diam(A)-diam(C)+pd(B,C)-diam(B)-diam(C)-pd(A,B)+diam(A)+diam(B))/2 - * = (pd(A,C)+pd(B,C))/2 - diam(C) - pd(A,B)/2 - * = pd(AB,C) - diam(AB) - diam(C) - * - * If we are using BIONJ, with weight lambda for the join: - * Profile(AB) = lambda*A + (1-lambda)*B - * then a similar argument gives - * diam(AB) = lambda*diam(A) + (1-lambda)*diam(B) + lambda*d(A,AB) + (1-lambda)*d(B,AB), - * - * where, as in neighbor joining, - * d(A,AB) = d(A,B) + (total out_distance(A) - total out_distance(B))/(n-2) - * - * A similar recursion formula works for the "variance" matrix of BIONJ, - * var(AB,C) = lambda*var(A,C) + (1-lambda)*var(B,C) - lambda*(1-lambda)*var(A,B) - * is equivalent to - * var(A,B) = pv(A,B) - vd(A) - vd(B), where - * pv(A,B) = pd(A,B) - * vd(A) = 0 for leaves - * vd(AB) = lambda*vd(A) + (1-lambda)*vd(B) + lambda*(1-lambda)*var(A,B) - * - * The top-hits heuristic to reduce the work below O(N**2*L) stores a top-hit - * list of size m=sqrt(N) for each active node. 
- * - * The list can be initialized for all the leaves in sub-O(N**2 * L) time as follows: - * Pick a "seed" sequence and compare it to all others - * Store the top m hits of the seed as its top-hit list - * Take "close" hits of the seed (within the top m, and see the "close" parameter), - * and assume that their top m hits lie within the top 2*m hits of the seed. - * So, compare them to the seed's neighbors (if they do not already - * have a top hit list) and set their top hits. - * - * This method does O(N*L) work for each seed, or O(N**(3/2)*L) work total. - * - * To avoid doing O(N*L) work at each iteration, we need to avoid - * updating the visible set and the out-distances. So, we use "stale" - * out-distances, and when searching the visible set for the best hit, - * we only inspect the top m=sqrt(N) entries. We then update those - * out-distances (up to 2*m*L*a work) and then find the best hit. - * - * To avoid searching the entire visible set, FastTree keeps - * and updates a list of the top sqrt(N) entries in the visible set. - * This costs O(sqrt(N)) time per join to find the best entry and to - * update, or O(N sqrt(N)) time overall. - * - * Similarly, when doing the local hill-climbing, we avoid O(N*L) work - * by only considering the top-hits for the current node. So this adds - * O(m*a*log(N)) work per iteration. - * - * When we join two nodes, we compute profiles and update the - * out-profile as before. We need to compute the best hits of the node - * -- we merge the lists for the children and select the best up-to-m - * hits. If the top hit list contains a stale node we replace it with - * its parent. If we still have <m/2 entries, we do a "refresh". - * - * In a "refresh", similar to the fast top-hit computation above, we - * compare the "seed", in this case the new joined node, to all other - * nodes. 
We compare its close neighbors (the top m hits) to all - * neighbors (the top 2*m hits) and update the top-hit lists of all - * neighbors (by merging to give a list of 3*m entries and then - * selecting the best m entries). - * - * Finally, during these processes we update the visible sets for - * other nodes with better hits if we find them, and we set the - * visible entry for the new joined node to the best entry in its - * top-hit list. (And whenever we update a visible entry, we - * do O(sqrt(N)) work to update the top-visible list.) - * These updates are not common so they do not alter the - * O(N sqrt(N) log(N) L a) total running time for the joining phase. - * - * Second-level top hits - * - * With -fastest or with -2nd, FastTree uses an additional "2nd-level" top hits - * heuristic to reduce the running time for the top-hits phase to - * O(N**1.25 L) and for the neighbor-joining phase to O(N**1.25 L a). - * This also reduces the memory usage for the top-hits lists to - * O(N**1.25), which is important for alignments with a million - * sequences. The key idea is to store just q = sqrt(m) top hits for - * most sequences. - * - * Given the neighbors of A -- either for a seed or for a neighbor - * from the top-hits heuristic, if B is within the top q hits of A, we - * set top-hits(B) from the top 3*q top-hits of A. And, we record that - * A is the "source" of the hits for B, so if we run low on hits for - * B, instead of doing a full refresh, we can do top-hits(B) := - * top-hits(B) union top-hits(active_ancestor(A)). - * During a refresh, these "2nd-level" top hits are updated just as - * normal, but the source is maintained and only q entries are stored, - * until we near the end of the neighbor joining phase (until the - * root has 2*m children or fewer). - * - * Parallel execution with OpenMP - * - * If you compile FastTree with OpenMP support, it will take - * advantage of multiple CPUs on one machine. 
It will parallelize: - * - * The top hits phase - * Comparing one node to many others during the NJ phase (the simplest kind of join) - * The refresh phase - * Optimizing likelihoods for 3 alternate topologies during ML NNIs and ML supports - * (only 3 threads can be used) - * - * This accounts for most of the O(N L a) or slower steps except for - * minimum-evolution NNIs (which are fast anyway), minimum-evolution SPRs, - * selecting per-site rates, and optimizing branch lengths outside of ML NNIs. - * - * Parallelizing the top hits phase may lead to a slight change in the tree, - * as some top hits are computed from different (and potentially less optimal source). - * This means that results on repeated runs may not be 100% identical. - * However, this should not have any significant effect on tree quality - * after the NNIs and SPRs. - * - * The OpenMP code also turns off the star-topology test during ML - * NNIs, which may lead to slight improvements in likelihood. - */ - -#include <stdio.h> -#include <stdbool.h> -#include <string.h> -#include <assert.h> -#include <math.h> -#include <stdlib.h> -#include <sys/time.h> -#include <ctype.h> -#include <unistd.h> -#ifdef TRACK_MEMORY -/* malloc.h apparently doesn't exist on MacOS */ -#include <malloc.h> -#endif - -/* Compile with -DOPENMP to turn on multithreading */ -#ifdef OPENMP -#include <omp.h> -#endif - -/* By default, tries to compile with SSE instructions for greater speed. - But if compiled with -DUSE_DOUBLE, uses double precision instead of single-precision - floating point (2x memory required), does not use SSE, and allows much shorter - branch lengths. 
-*/ -#ifdef __SSE__ -#if !defined(NO_SSE) && !defined(USE_DOUBLE) -#define USE_SSE3 -#endif -#endif - - -#ifdef USE_DOUBLE -#define SSE_STRING "Double precision (No SSE3)" -typedef double numeric_t; -#define ScanNumericSpec "%lf" -#else -typedef float numeric_t; -#define ScanNumericSpec "%f" -#endif - -#ifdef USE_SSE3 -#define SSE_STRING "SSE3" -#define ALIGNED __attribute__((aligned(16))) -#define IS_ALIGNED(X) ((((unsigned long) (X)) & 15L) == 0L) /* true if pointer X is 16-byte aligned; tests the argument, not a variable that happens to be named "new" */ -#include <xmmintrin.h> - -#else - -#define ALIGNED -#define IS_ALIGNED(X) 1 - -#ifndef USE_DOUBLE -#define SSE_STRING "No SSE3" -#endif - -#endif /* USE_SSE3 */ - -#define FT_VERSION "2.1.11" - -char *usage = - " FastTree protein_alignment > tree\n" - " FastTree < protein_alignment > tree\n" - " FastTree -out tree protein_alignment\n" - " FastTree -nt nucleotide_alignment > tree\n" - " FastTree -nt -gtr < nucleotide_alignment > tree\n" - " FastTree < nucleotide_alignment > tree\n" - "FastTree accepts alignments in fasta or phylip interleaved formats\n" - "\n" - "Common options (must be before the alignment file):\n" - " -quiet to suppress reporting information\n" - " -nopr to suppress progress indicator\n" - " -log logfile -- save intermediate trees, settings, and model details\n" - " -fastest -- speed up the neighbor joining phase & reduce memory usage\n" - " (recommended for >50,000 sequences)\n" - " -n <number> to analyze multiple alignments (phylip format only)\n" - " (use for global bootstrap, with seqboot and CompareToBootstrap.pl)\n" - " -nosupport to not compute support values\n" - " -intree newick_file to set the starting tree(s)\n" - " -intree1 newick_file to use this starting tree for all the alignments\n" - " (for faster global bootstrap on huge alignments)\n" - " -pseudo to use pseudocounts (recommended for highly gapped sequences)\n" - " -gtr -- generalized time-reversible model (nucleotide alignments only)\n" - " -lg -- Le-Gascuel 2008 model (amino acid alignments only)\n" - " -wag -- Whelan-And-Goldman 
2001 model (amino acid alignments only)\n" - " -quote -- allow spaces and other restricted characters (but not ' ) in\n" - " sequence names and quote names in the output tree (fasta input only;\n" - " FastTree will not be able to read these trees back in)\n" - " -noml to turn off maximum-likelihood\n" - " -nome to turn off minimum-evolution NNIs and SPRs\n" - " (recommended if running additional ML NNIs with -intree)\n" - " -nome -mllen with -intree to optimize branch lengths for a fixed topology\n" - " -cat # to specify the number of rate categories of sites (default 20)\n" - " or -nocat to use constant rates\n" - " -gamma -- after optimizing the tree under the CAT approximation,\n" - " rescale the lengths to optimize the Gamma20 likelihood\n" - " -constraints constraintAlignment to constrain the topology search\n" - " constraintAlignment should have 1s or 0s to indicates splits\n" - " -expert -- see more options\n" - "For more information, see http://www.microbesonline.org/fasttree/\n"; - -char *expertUsage = - "FastTree [-nt] [-n 100] [-quote] [-pseudo | -pseudo 1.0]\n" - " [-boot 1000 | -nosupport]\n" - " [-intree starting_trees_file | -intree1 starting_tree_file]\n" - " [-quiet | -nopr]\n" - " [-nni 10] [-spr 2] [-noml | -mllen | -mlnni 10]\n" - " [-mlacc 2] [-cat 20 | -nocat] [-gamma]\n" - " [-slow | -fastest] [-2nd | -no2nd] [-slownni] [-seed 1253] \n" - " [-top | -notop] [-topm 1.0 [-close 0.75] [-refresh 0.8]]\n" - " [-gtr] [-gtrrates ac ag at cg ct gt] [-gtrfreq A C G T]\n" - " [ -lg | -wag | -trans transitionmatrixfile ]\n" - " [-matrix Matrix | -nomatrix] [-nj | -bionj]\n" - " [ -constraints constraintAlignment [ -constraintWeight 100.0 ] ]\n" - " [-log logfile]\n" - " [ alignment_file ]\n" - " [ -out output_newick_file | > newick_tree]\n" - "\n" - "or\n" - "\n" - "FastTree [-nt] [-matrix Matrix | -nomatrix] [-rawdist] -makematrix [alignment]\n" - " [-n 100] > phylip_distance_matrix\n" - "\n" - " FastTree supports fasta or phylip interleaved 
alignments\n" - " By default FastTree expects protein alignments, use -nt for nucleotides\n" - " FastTree reads standard input if no alignment file is given\n" - "\n" - "Input/output options:\n" - " -n -- read in multiple alignments in. This only\n" - " works with phylip interleaved format. For example, you can\n" - " use it with the output from phylip's seqboot. If you use -n, FastTree\n" - " will write 1 tree per line to standard output.\n" - " -intree newickfile -- read the starting tree in from newickfile.\n" - " Any branch lengths in the starting trees are ignored.\n" - " -intree with -n will read a separate starting tree for each alignment.\n" - " -intree1 newickfile -- read the same starting tree for each alignment\n" - " -quiet -- do not write to standard error during normal operation (no progress\n" - " indicator, no options summary, no likelihood values, etc.)\n" - " -nopr -- do not write the progress indicator to stderr\n" - " -log logfile -- save intermediate trees so you can extract\n" - " the trees and restart long-running jobs if they crash\n" - " -log also reports the per-site rates (1 means slowest category)\n" - " -quote -- quote sequence names in the output and allow spaces, commas,\n" - " parentheses, and colons in them but not ' characters (fasta files only)\n" - "\n" - "Distances:\n" - " Default: For protein sequences, log-corrected distances and an\n" - " amino acid dissimilarity matrix derived from BLOSUM45\n" - " or for nucleotide sequences, Jukes-Cantor distances\n" - " To specify a different matrix, use -matrix FilePrefix or -nomatrix\n" - " Use -rawdist to turn the log-correction off\n" - " or to use %different instead of Jukes-Cantor\n" - " (These options affect minimum-evolution computations only;\n" - " use -trans to affect maximum-likelihoood computations)\n" - "\n" - " -pseudo [weight] -- Use pseudocounts to estimate distances between\n" - " sequences with little or no overlap. (Off by default.) 
Recommended\n" - " if analyzing the alignment has sequences with little or no overlap.\n" - " If the weight is not specified, it is 1.0\n" - "\n" - "Topology refinement:\n" - " By default, FastTree tries to improve the tree with up to 4*log2(N)\n" - " rounds of minimum-evolution nearest-neighbor interchanges (NNI),\n" - " where N is the number of unique sequences, 2 rounds of\n" - " subtree-prune-regraft (SPR) moves (also min. evo.), and\n" - " up to 2*log(N) rounds of maximum-likelihood NNIs.\n" - " Use -nni to set the number of rounds of min. evo. NNIs,\n" - " and -spr to set the rounds of SPRs.\n" - " Use -nome to turn off both min-evo NNIs and SPRs (useful if refining\n" - " an approximately maximum-likelihood tree with further NNIs)\n" - " Use -sprlength set the maximum length of a SPR move (default 10)\n" - " Use -mlnni to set the number of rounds of maximum-likelihood NNIs\n" - " Use -mlacc 2 or -mlacc 3 to always optimize all 5 branches at each NNI,\n" - " and to optimize all 5 branches in 2 or 3 rounds\n" - " Use -mllen to optimize branch lengths without ML NNIs\n" - " Use -mllen -nome with -intree to optimize branch lengths on a fixed topology\n" - " Use -slownni to turn off heuristics to avoid constant subtrees (affects both\n" - " ML and ME NNIs)\n" - "\n" - "Maximum likelihood model options:\n" - " -lg -- Le-Gascuel 2008 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. only)\n" - " -wag -- Whelan-And-Goldman 2001 model instead of (default) Jones-Taylor-Thorton 1992 model (a.a. 
only)\n" - " -gtr -- generalized time-reversible instead of (default) Jukes-Cantor (nt only)\n" - " -cat # -- specify the number of rate categories of sites (default 20)\n" - " -nocat -- no CAT model (just 1 category)\n" - " -trans filename -- use the transition matrix from filename\n" - " This is supported for amino acid alignments only\n" - " The file must be tab-delimited with columns in the order ARNDCQEGHILKMFPSTWYV*\n" - " The additional column named * is for the stationary distribution\n" - " Each row must have a row name in the same order ARNDCQEGHILKMFPSTWYV\n" - " -gamma -- after the final round of optimizing branch lengths with the CAT model,\n" - " report the likelihood under the discrete gamma model with the same\n" - " number of categories. FastTree uses the same branch lengths but\n" - " optimizes the gamma shape parameter and the scale of the lengths.\n" - " The final tree will have rescaled lengths. Used with -log, this\n" - " also generates per-site likelihoods for use with CONSEL, see\n" - " GammaLogToPaup.pl and documentation on the FastTree web site.\n" - "\n" - "Support value options:\n" - " By default, FastTree computes local support values by resampling the site\n" - " likelihoods 1,000 times and the Shimodaira Hasegawa test. 
If you specify -nome,\n" - " it will compute minimum-evolution bootstrap supports instead\n" - " In either case, the support values are proportions ranging from 0 to 1\n" - "\n" - " Use -nosupport to turn off support values or -boot 100 to use just 100 resamples\n" - " Use -seed to initialize the random number generator\n" - "\n" - "Searching for the best join:\n" - " By default, FastTree combines the 'visible set' of fast neighbor-joining with\n" - " local hill-climbing as in relaxed neighbor-joining\n" - " -slow -- exhaustive search (like NJ or BIONJ, but different gap handling)\n" - " -slow takes half an hour instead of 8 seconds for 1,250 proteins\n" - " -fastest -- search the visible set (the top hit for each node) only\n" - " Unlike the original fast neighbor-joining, -fastest updates visible(C)\n" - " after joining A and B if join(AB,C) is better than join(C,visible(C))\n" - " -fastest also updates out-distances in a very lazy way,\n" - " -fastest sets -2nd on as well, use -fastest -no2nd to avoid this\n" - "\n" - "Top-hit heuristics:\n" - " By default, FastTree uses a top-hit list to speed up search\n" - " Use -notop (or -slow) to turn this feature off\n" - " and compare all leaves to each other,\n" - " and all new joined nodes to each other\n" - " -topm 1.0 -- set the top-hit list size to parameter*sqrt(N)\n" - " FastTree estimates the top m hits of a leaf from the\n" - " top 2*m hits of a 'close' neighbor, where close is\n" - " defined as d(seed,close) < 0.75 * d(seed, hit of rank 2*m),\n" - " and updates the top-hits as joins proceed\n" - " -close 0.75 -- modify the close heuristic, lower is more conservative\n" - " -refresh 0.8 -- compare a joined node to all other nodes if its\n" - " top-hit list is less than 80% of the desired length,\n" - " or if the age of the top-hit list is log2(m) or greater\n" - " -2nd or -no2nd to turn 2nd-level top hits heuristic on or off\n" - " This reduces memory usage and running time but may lead to\n" - " marginal 
reductions in tree quality.\n" - " (By default, -fastest turns on -2nd.)\n" - "\n" - "Join options:\n" - " -nj: regular (unweighted) neighbor-joining (default)\n" - " -bionj: weighted joins as in BIONJ\n" - " FastTree will also weight joins during NNIs\n" - "\n" - "Constrained topology search options:\n" - " -constraints alignmentfile -- an alignment with values of 0, 1, and -\n" - " Not all sequences need be present. A column of 0s and 1s defines a\n" - " constrained split. Some constraints may be violated\n" - " (see 'violating constraints:' in standard error).\n" - " -constraintWeight -- how strongly to weight the constraints. A value of 1\n" - " means a penalty of 1 in tree length for violating a constraint\n" - " Default: 100.0\n" - "\n" - "For more information, see http://www.microbesonline.org/fasttree/\n" - " or the comments in the source code\n"; -; - - -#define MAXCODES 20 -#define NOCODE 127 -/* Note -- sequence lines longer than BUFFER_SIZE are - allowed, but FASTA header lines must be within this limit */ -#define BUFFER_SIZE 5000 -#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) -#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) - -typedef struct { - int nPos; - int nSeq; - char **names; - char **seqs; - int nSaved; /* actual allocated size of names and seqs */ -} alignment_t; - -/* For each position in a profile, we have a weight (% non-gapped) and a - frequency vector. (If using a matrix, the frequency vector is in eigenspace). 
- We also store codes for simple profile positions (all gaps or only 1 value) - If weight[pos] > 0 && codes[pos] == NOCODE then we store the vector - vectors itself is sets of nCodes long, so the vector for the ith nonconstant position - starts at &vectors[nCodes*i] - - To speed up comparison of outprofile to a sequence or other simple profile, we also - (for outprofiles) store codeDist[iPos*nCodes+k] = dist(k,profile[iPos]) - - For constraints, we store a vector of nOn and nOff - If not using constraints, those will be NULL -*/ -typedef struct { - /* alignment profile */ - numeric_t *weights; - unsigned char *codes; - numeric_t *vectors; /* NULL if no non-constant positions, e.g. for leaves */ - int nVectors; - numeric_t *codeDist; /* Optional -- distance to each code at each position */ - - /* constraint profile */ - int *nOn; - int *nOff; -} profile_t; - -/* A visible node is a pair of nodes i, j such that j is the best hit of i, - using the neighbor-joining criterion, at the time the comparison was made, - or approximately so since then. - - Note that variance = dist because in BIONJ, constant factors of variance do not matter, - and because we weight ungapped sequences higher naturally when averaging profiles, - so we do not take this into account in the computation of "lambda" for BIONJ. - - For the top-hit list heuristic, if the top hit list becomes "too short", - we store invalid entries with i=j=-1 and dist/criterion very high. -*/ -typedef struct { - int i, j; - numeric_t weight; /* Total product of weights (maximum value is nPos) - This is needed for weighted joins and for pseudocounts, - but not in most other places. 
- For example, it is not maintained by the top hits code */ - numeric_t dist; /* The uncorrected distance (includes diameter correction) */ - numeric_t criterion; /* changes when we update the out-profile or change nActive */ -} besthit_t; - -typedef struct { - int nChild; /* NOTE(review): presumably the number of child[] entries in use -- confirm */ - int child[3]; /* up to 3 entries; the root is documented (NJ_t.root) as having 3 children */ -} children_t; - -typedef struct { - /* Distances between amino acids */ - numeric_t distances[MAXCODES][MAXCODES]; - - /* Inverse of the eigenvector matrix, for rotating a frequency vector - into eigenspace so that profile similarity computations are - O(alphabet) not O(alphabet*alphabet) time. - */ - numeric_t eigeninv[MAXCODES][MAXCODES]; - numeric_t eigenval[MAXCODES]; /* eigenvalues */ - - - /* eigentot=eigeninv times the all-1s frequency vector - useful for normalizing rotated frequency vectors - */ - numeric_t eigentot[MAXCODES]; - - /* codeFreq is the transpose of the eigeninv matrix, i.e. - the rotated frequency vector for each code */ - numeric_t codeFreq[MAXCODES][MAXCODES]; - numeric_t gapFreq[MAXCODES]; -} distance_matrix_t; - - -/* A transition matrix gives the instantaneous rate of change of frequencies - df/dt = M . f - which is solved by - f(t) = exp(M*t) . f(0) - and which is not a symmetric matrix because of - non-uniform stationary frequencies stat, so that - M stat = 0 - M(i,j) is instantaneous rate of j -> i, not of i -> j - - S = diag(sqrt(stat)) is a correction so that - M' = S**-1 M S is symmetric - Let W L W**-1 = M' be an eigendecomposition of M' - Because M' is symmetric, W can be a rotation, and W**-1 = t(W) - Set V = S*W - M = V L V**-1 is an eigendecomposition of M - Note V**-1 = W**-1 S**-1 = t(W) S**-1 - - Evolution by time t is given by - - exp(M*t) = V exp(L*t) V**-1 - P(A & B | t) = B . exp(M*t) . (A * stat) - note this is *not* the same as P(A->B | t) - - and we can reduce some of the computations from O(a**2) to O(a) time, - where a is the alphabet size, by storing frequency vectors as - t(V) . f = t(W) . t(S) . f - - Then - P(f0 & f1 | t) = f1 . 
exp(M*t) . f0 * (f0 . stat) = sum(r0j * r1j * exp(l_j*t)) - where r0 and r1 are the transformed vectors - - Posterior distribution of P given children f0 and f1 is given by - P(i | f0, f1, t0, t1) = stat * P(i->f0 | t0) * P(i->f1 | t1) - = P(i & f0 | t0) * P(i & f1 | t1) / stat - ~ (V . exp(t0*L) . r0) * (V . exp(t1*L) . r1) / stat - - When normalize this posterior distribution (to sum to 1), divide by stat, - and transform by t(V) -- this is the "profile" of internal nodes - - To eliminate the O(N**2) step of transforming by t(V), if the posterior - distribution of an amino acid is near 1 then we can approximate it by - P(i) ~= (i==A) * w + nearP(i) * (1-w), where - w is fit so that P(i==A) is correct - nearP = Posterior(i | i, i, 0.1, 0.1) [0.1 is an arbitrary choice] - and we confirm that the approximation works well before we use it. - - Given this parameter w we can set - rotated_posterior = rotation(w * (i==A)/stat + (1-w) * nearP/stat) - = codeFreq(A) * w/stat(A) + nearFreq(A) * (1-w) - */ -typedef struct { - numeric_t stat[MAXCODES]; /* The stationary distribution */ - numeric_t statinv[MAXCODES]; /* 1/stat */ - /* the eigenmatrix, with the eigenvectors as columns and rotations of individual - characters as rows. 
Also includes a NOCODE entry for gaps */ - numeric_t codeFreq[NOCODE+1][MAXCODES]; - numeric_t eigeninv[MAXCODES][MAXCODES]; /* Inverse of eigenmatrix */ - numeric_t eigeninvT[MAXCODES][MAXCODES]; /* transpose of eigeninv */ - numeric_t eigenval[MAXCODES]; /* Eigenvalues */ - /* These are for approximate posteriors (off by default) */ - numeric_t nearP[MAXCODES][MAXCODES]; /* nearP[i][j] = P(parent=j | both children are i, both lengths are 0.1 */ - numeric_t nearFreq[MAXCODES][MAXCODES]; /* rotation of nearP/stat */ -} transition_matrix_t; - -typedef struct { - int nRateCategories; - numeric_t *rates; /* 1 per rate category */ - unsigned int *ratecat; /* 1 category per position */ -} rates_t; - -typedef struct { - /* The input */ - int nSeq; - int nPos; - char **seqs; /* the aligment sequences array (not reallocated) */ - distance_matrix_t *distance_matrix; /* a pointer (not reallocated), or NULL if using %identity distance */ - transition_matrix_t *transmat; /* a pointer (is allocated), or NULL for Jukes-Cantor */ - /* Topological constraints are represented for each sequence as binary characters - with values of '0', '1', or '-' (for missing data) - Sequences that have no constraint may have a NULL string - */ - int nConstraints; - char **constraintSeqs; - - /* The profile data structures */ - int maxnode; /* The next index to allocate */ - int maxnodes; /* Space allocated in data structures below */ - profile_t **profiles; /* Profiles of leaves and intermediate nodes */ - numeric_t *diameter; /* To correct for distance "up" from children (if any) */ - numeric_t *varDiameter; /* To correct variances for distance "up" */ - numeric_t *selfdist; /* Saved for use in some formulas */ - numeric_t *selfweight; /* Saved for use in some formulas */ - - /* Average profile of all active nodes, the "outprofile" - * If all inputs are ungapped, this has weight 1 (not nSequences) at each position - * The frequencies all sum to one (or that is implied by the 
eigen-representation) - */ - profile_t *outprofile; - double totdiam; - - /* We sometimes use stale out-distances, so we remember what nActive was */ - numeric_t *outDistances; /* Sum of distances to other active (parent==-1) nodes */ - int *nOutDistActive; /* What nActive was when this outDistance was computed */ - - /* the inferred tree */ - int root; /* index of the root. Unlike other internal nodes, it has 3 children */ - int *parent; /* -1 or index of parent */ - children_t *child; - numeric_t *branchlength; /* Distance to parent */ - numeric_t *support; /* 1 for high-confidence nodes */ - - /* auxiliary data for maximum likelihood (defaults to 1 category of rate=1.0) */ - rates_t rates; -} NJ_t; - -/* Uniquify sequences in an alignment -- map from indices - in the alignment to unique indices in an NJ_t -*/ -typedef struct { - int nSeq; - int nUnique; - int *uniqueFirst; /* iUnique -> iAln */ - int *alnNext; /* iAln -> next, or -1 */ - int *alnToUniq; /* iAln -> iUnique, or -1 if another was the exemplar */ - char **uniqueSeq; /* indexed by iUniq -- points to strings allocated elsewhere */ -} uniquify_t; - -/* Describes which switch to do */ -typedef enum {ABvsCD,ACvsBD,ADvsBC} nni_t; - -/* A list of these describes a chain of NNI moves in a rooted tree, - making up, in total, an SPR move -*/ -typedef struct { - int nodes[2]; - double deltaLength; /* change in tree length for this step (lower is better) */ -} spr_step_t; - -/* Keep track of hits for the top-hits heuristic without wasting memory - j = -1 means empty - If j is an inactive node, this may be replaced by that node's parent (and dist recomputed) - */ -typedef struct { - int j; - numeric_t dist; -} hit_t; - -typedef struct { - int nHits; /* the allocated and desired size; some of them may be empty */ - hit_t *hits; - int hitSource; /* where to refresh hits from if a 2nd-level top-hit list, or -1 */ - int age; /* number of joins since a refresh */ -} top_hits_list_t; - -typedef struct { - int m; /*
size of a full top hits list, usually sqrt(N) */ - int q; /* size of a 2nd-level top hits, usually sqrt(m) */ - int maxnodes; - top_hits_list_t *top_hits_lists; /* one per node */ - hit_t *visible; /* the "visible" (very best) hit for each node */ - - /* The top-visible set is a subset, usually of size m, of the visible set -- - it is the set of joins to select from - Each entry is either a node whose visible set entry has a good (low) criterion, - or -1 for empty, or is an obsolete node (which is effectively the same). - Whenever we update the visible set, should also call UpdateTopVisible() - which ensures that none of the topvisible set are stale (that is, they - all point to an active node). - */ - int nTopVisible; /* nTopVisible = m * topvisibleMult */ - int *topvisible; - - int topvisibleAge; /* joins since the top-visible list was recomputed */ - -#ifdef OPENMP - /* 1 lock to read or write any top hits list, no thread grabs more than one */ - omp_lock_t *locks; -#endif -} top_hits_t; - -/* Global variables */ -/* Options */ -int verbose = 1; -int showProgress = 1; -int slow = 0; -int fastest = 0; -bool useTopHits2nd = false; /* use the second-level top hits heuristic? */ -int bionj = 0; -double tophitsMult = 1.0; /* 0 means compare nodes to all other nodes */ -double tophitsClose = -1.0; /* Parameter for how close is close; also used as a coverage req. */ -double topvisibleMult = 1.5; /* nTopVisible = m * topvisibleMult; 1 or 2 did not make much difference - in either running time or accuracy so I chose a compromise. 
*/ - -double tophitsRefresh = 0.8; /* Refresh if fraction of top-hit-length drops to this */ -double tophits2Mult = 1.0; /* Second-level top heuristic -- only with -fastest */ -int tophits2Safety = 3; /* Safety factor for second level of top-hits heuristic */ -double tophits2Refresh = 0.6; /* Refresh 2nd-level top hits if drops down to this fraction of length */ - -double staleOutLimit = 0.01; /* nActive changes by at most this amount before we recompute - an out-distance. (Only applies if using the top-hits heuristic) */ -double fResetOutProfile = 0.02; /* Recompute out profile from scratch if nActive has changed - by more than this proportion, and */ -int nResetOutProfile = 200; /* nActive has also changed more than this amount */ -int nCodes=20; /* 20 if protein, 4 if nucleotide */ -bool useMatrix=true; /* If false, use %different as the uncorrected distance */ -bool logdist = true; /* If true, do a log-correction (scoredist-like or Jukes-Cantor) - but only during NNIs and support values, not during neighbor-joining */ -double pseudoWeight = 0.0; /* The weight of pseudocounts to avoid artificial long branches when - nearby sequences in the tree have little or no overlap - (off by default). The prior distance is based on - all overlapping positions among the quartet or triplet under - consideration. The log correction takes place after the - pseudocount is used. */ -double constraintWeight = 100.0;/* Cost of violation of a topological constraint in evolutionary distance - or likelihood */ -double MEMinDelta = 1.0e-4; /* Changes of less than this in tree-length are discounted for - purposes of identifying fixed subtrees */ -bool fastNNI = true; -bool gammaLogLk = false; /* compute gamma likelihood without reoptimizing branch lengths? 
*/ - -/* Maximum likelihood options and constants */ -/* These are used to rescale likelihood values and avoid taking a logarithm at each position */ -const double LkUnderflow = 1.0e-4; -const double LkUnderflowInv = 1.0e4; -const double LogLkUnderflow = 9.21034037197618; /* log(LkUnderflowInv), i.e. -log(LkUnderflow) */ -const double Log2 = 0.693147180559945; /* natural log of 2 */ -/* These are used to limit the optimization of branch lengths. - Also very short branch lengths can create numerical problems. - In version 2.1.7, the minimum branch lengths (MLMinBranchLength and MLMinRelBranchLength) - were increased to prevent numerical problems in rare cases. - In version 2.1.8, to provide useful branch lengths for genome-wide alignments, - the minimum branch lengths were dramatically decreased if USE_DOUBLE is defined. -*/ -#ifndef USE_DOUBLE -const double MLMinBranchLengthTolerance = 1.0e-4; /* absolute tolerance for optimizing branch lengths */ -const double MLFTolBranchLength = 0.001; /* fractional tolerance for optimizing branch lengths */ -const double MLMinBranchLength = 5.0e-4; /* minimum value for branch length */ -const double MLMinRelBranchLength = 2.5e-4; /* minimum of rate * length */ -const double fPostTotalTolerance = 1.0e-10; /* posterior vector must sum to at least this before rescaling */ -#else -const double MLMinBranchLengthTolerance = 1.0e-9; -const double MLFTolBranchLength = 0.001; -const double MLMinBranchLength = 5.0e-9; -const double MLMinRelBranchLength = 2.5e-9; -const double fPostTotalTolerance = 1.0e-20; -#endif - -int mlAccuracy = 1; /* Rounds of optimization of branch lengths; 1 means do 2nd round only if close */ -double closeLogLkLimit = 5.0; /* If partial optimization of an NNI looks like it would decrease the log likelihood - by this much or more then do not optimize it further */ -double treeLogLkDelta = 0.1; /* Give up if tree log-lk changes by less than this; NNIs that change - likelihood by less than this also are considered unimportant - by some heuristics */ -bool
exactML = true; /* Exact or approximate posterior distributions for a.a.s */ -double approxMLminf = 0.95; /* Only try to approximate posterior distributions if max. value is at least this high */ -double approxMLminratio = 2/3.0;/* Ratio of approximated/true posterior values must be at least this high */ -double approxMLnearT = 0.2; /* 2nd component of near-constant posterior distribution uses this time scale */ -const int nDefaultRateCats = 20; - -/* Performance and memory usage */ -long profileOps = 0; /* Full profile-based distance operations */ -long outprofileOps = 0; /* How many of profileOps are comparisons to outprofile */ -long seqOps = 0; /* Faster leaf-based distance operations */ -long profileAvgOps = 0; /* Number of profile-average steps */ -long nHillBetter = 0; /* Number of hill-climbing steps */ -long nCloseUsed = 0; /* Number of "close" neighbors we avoid full search for */ -long nClose2Used = 0; /* Number of "close" neighbors we use 2nd-level top hits for */ -long nRefreshTopHits = 0; /* Number of full-blown searches (interior nodes) */ -long nVisibleUpdate = 0; /* Number of updates of the visible set */ -long nNNI = 0; /* Number of NNI changes performed */ -long nSPR = 0; /* Number of SPR changes performed */ -long nML_NNI = 0; /* Number of max-lik. 
NNI changes performed */ -long nSuboptimalSplits = 0; /* # of splits that are rejected given final tree (during bootstrap) */ -long nSuboptimalConstrained = 0; /* Bad splits that are due to constraints */ -long nConstraintViolations = 0; /* Number of constraint violations */ -long nProfileFreqAlloc = 0; -long nProfileFreqAvoid = 0; -long szAllAlloc = 0; -long mymallocUsed = 0; /* useful allocations by mymalloc */ -long maxmallocHeap = 0; /* Maximum of mi.arena+mi.hblkhd from mallinfo (actual mem usage) */ -long nLkCompute = 0; /* # of likelihood computations for pairs of probability vectors */ -long nPosteriorCompute = 0; /* # of computations of posterior probabilities */ -long nAAPosteriorExact = 0; /* # of times we compute the exact AA posterior */ -long nAAPosteriorRough = 0; /* # of times we use the rough approximation */ -long nStarTests = 0; /* # of times we use star test to avoid testing an NNI */ - -/* Character sets: 20 amino acids (protein) and 4 nucleotides; codesString starts as NULL */ -unsigned char *codesStringAA = (unsigned char*) "ARNDCQEGHILKMFPSTWYV"; -unsigned char *codesStringNT = (unsigned char*) "ACGT"; -unsigned char *codesString = NULL; - -distance_matrix_t *ReadDistanceMatrix(char *prefix); -void SetupDistanceMatrix(/*IN/OUT*/distance_matrix_t *); /* set eigentot, codeFreq, gapFreq */ -void ReadMatrix(char *filename, /*OUT*/numeric_t codes[MAXCODES][MAXCODES], bool check_codes); -void ReadVector(char *filename, /*OUT*/numeric_t codes[MAXCODES]); -alignment_t *ReadAlignment(/*READ*/FILE *fp, bool bQuote); /* Returns a list of strings (exits on failure) */ -alignment_t *FreeAlignment(alignment_t *); /* returns NULL */ -void FreeAlignmentSeqs(/*IN/OUT*/alignment_t *); - -/* Takes as input the transpose of the matrix V, with i -> j - This routine takes care of setting the diagonals -*/ -transition_matrix_t *CreateTransitionMatrix(/*IN*/double matrix[MAXCODES][MAXCODES], - /*IN*/double stat[MAXCODES]); -transition_matrix_t *CreateGTR(double *gtrrates/*ac,ag,at,cg,ct,gt*/, double *gtrfreq/*ACGT*/); -transition_matrix_t
*ReadAATransitionMatrix(/*IN*/char *filename); - -/* For converting profiles from 1 rotation to another, or converts NULL to NULL */ -distance_matrix_t *TransMatToDistanceMat(transition_matrix_t *transmat); - -/* Allocates memory, initializes leaf profiles */ -NJ_t *InitNJ(char **sequences, int nSeqs, int nPos, - /*IN OPTIONAL*/char **constraintSeqs, int nConstraints, - /*IN OPTIONAL*/distance_matrix_t *, - /*IN OPTIONAL*/transition_matrix_t *); - -NJ_t *FreeNJ(NJ_t *NJ); /* returns NULL */ -void FastNJ(/*IN/OUT*/NJ_t *NJ); /* Does the joins */ -void ReliabilityNJ(/*IN/OUT*/NJ_t *NJ, int nBootstrap); /* Estimates the reliability of the joins */ - -/* nni_stats_t is meaningless for leaves and root, so all of those entries - will just be high (for age) or 0 (for delta) -*/ -typedef struct { - int age; /* number of rounds since this node was modified by an NNI */ - int subtreeAge; /* number of rounds since self or descendent had a significant improvement */ - double delta; /* improvement in score for this node (or 0 if no change) */ - double support; /* improvement of score for self over better of alternatives */ -} nni_stats_t; - -/* One round of nearest-neighbor interchanges according to the - minimum-evolution or approximate maximum-likelihood criterion. - If doing maximum likelihood then this modifies the branch lengths. 
- age is the # of rounds since a node was NNId - Returns the # of topological changes performed -*/ -int NNI(/*IN/OUT*/NJ_t *NJ, int iRound, int nRounds, bool useML, - /*IN/OUT*/nni_stats_t *stats, - /*OUT*/double *maxDeltaCriterion); -nni_stats_t *InitNNIStats(NJ_t *NJ); -nni_stats_t *FreeNNIStats(nni_stats_t *, NJ_t *NJ); /* returns NULL */ - -/* One round of subtree-prune-regraft moves (minimum evolution) */ -void SPR(/*IN/OUT*/NJ_t *NJ, int maxSPRLength, int iRound, int nRounds); - -/* Recomputes all branch lengths by minimum evolution criterion */ -void UpdateBranchLengths(/*IN/OUT*/NJ_t *NJ); - -/* Recomputes all branch lengths and, optionally, internal profiles */ -double TreeLength(/*IN/OUT*/NJ_t *NJ, bool recomputeProfiles); - -typedef struct { - int nBadSplits; - int nConstraintViolations; - int nBadBoth; - int nSplits; - /* How much length would be reduced or likelihood would be increased by the - best NNI we find (the worst "miss") */ - double dWorstDeltaUnconstrained; - double dWorstDeltaConstrained; -} SplitCount_t; - -void TestSplitsMinEvo(NJ_t *NJ, /*OUT*/SplitCount_t *splitcount); - -/* Sets SH-like support values if nBootstrap>0 */ -void TestSplitsML(/*IN/OUT*/NJ_t *NJ, /*OUT*/SplitCount_t *splitcount, int nBootstrap); - -/* Pick columns for resampling, stored as returned_vector[iBoot*nPos + j] */ -int *ResampleColumns(int nPos, int nBootstrap); - -/* Use out-profile and NJ->totdiam to recompute out-distance for node iNode - Only does this computation if the out-distance is "stale" (nOutDistActive[iNode] != nActive) - Note "IN/UPDATE" for NJ always means that we may update out-distances but otherwise - make no changes.
- */ -void SetOutDistance(/*IN/UPDATE*/NJ_t *NJ, int iNode, int nActive); - -/* Always sets join->criterion; may update NJ->outDistance and NJ->nOutDistActive, - assumes join's weight and distance are already set, - and that the constraint penalty (if any) is included in the distance -*/ -void SetCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join); - -/* Computes weight and distance (which includes the constraint penalty) - and then sets the criterion (maybe update out-distances) -*/ -void SetDistCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join); - -/* If join->i or join->j are inactive nodes, replaces them with their active ancestors. - After doing this, if i == j, or either is -1, sets weight to 0 and dist and criterion to 1e20 - and returns false (not a valid join) - Otherwise, if i or j changed, recomputes the distance and criterion. - Note that if i and j are unchanged then the criterion could be stale - If bUpdateDist is false, and i or j change, then it just sets dist to a negative number -*/ -bool UpdateBestHit(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join, - bool bUpdateDist); - -/* This recomputes the criterion, or returns false if the visible node - is no longer active. -*/ -bool GetVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/top_hits_t *tophits, - int iNode, /*OUT*/besthit_t *visible); - -int ActiveAncestor(/*IN*/NJ_t *NJ, int node); - -/* Compute the constraint penalty for a join. This is added to the "distance" - by SetCriterion */ -int JoinConstraintPenalty(/*IN*/NJ_t *NJ, int node1, int node2); -int JoinConstraintPenaltyPiece(NJ_t *NJ, int node1, int node2, int iConstraint); - -/* Helper function for computing the number of constraints violated by - a split, represented as counts of on and off on each side */ -int SplitConstraintPenalty(int nOn1, int nOff1, int nOn2, int nOff2); - -/* Reports the (min. evo.) support for the (1,2) vs. 
(3,4) split - col[iBoot*nPos+j] is column j for bootstrap iBoot -*/ -double SplitSupport(profile_t *p1, profile_t *p2, profile_t *p3, profile_t *p4, - /*OPTIONAL*/distance_matrix_t *dmat, - int nPos, - int nBootstrap, - int *col); - -/* Returns SH-like support given resampling spec. (in col) and site likelihoods - for the three quartets -*/ -double SHSupport(int nPos, int nBoostrap, int *col, double loglk[3], double *site_likelihoods[3]); - -profile_t *SeqToProfile(/*IN/OUT*/NJ_t *NJ, - char *seq, int nPos, - /*OPTIONAL*/char *constraintSeqs, int nConstraints, - int iNode, - unsigned long counts[256]); - -/* ProfileDist and SeqDist only set the dist and weight fields - If using an outprofile, use the second argument of ProfileDist - for better performance. - - These produce uncorrected distances. -*/ -void ProfileDist(profile_t *profile1, profile_t *profile2, int nPos, - /*OPTIONAL*/distance_matrix_t *distance_matrix, - /*OUT*/besthit_t *hit); -void SeqDist(unsigned char *codes1, unsigned char *codes2, int nPos, - /*OPTIONAL*/distance_matrix_t *distance_matrix, - /*OUT*/besthit_t *hit); - -/* Computes all pairs of profile distances, applies pseudocounts - if pseudoWeight > 0, and applies log-correction if logdist is true. - The lower index is compared to the higher index, e.g. for profiles - A,B,C,D the comparison will be as in quartet_pair_t -*/ -typedef enum {qAB,qAC,qAD,qBC,qBD,qCD} quartet_pair_t; -void CorrectedPairDistances(profile_t **profiles, int nProfiles, - /*OPTIONAL*/distance_matrix_t *distance_matrix, - int nPos, - /*OUT*/double *distances); - -/* output is indexed by nni_t - To ensure good behavior while evaluating a subtree-prune-regraft move as a series - of nearest-neighbor interchanges, this uses a distance-ish model of constraints, - as given by PairConstraintDistance(), rather than - counting the number of violated splits (which is what FastTree does - during neighbor-joining).
- Thus, penalty values may well be >0 even if no constraints are violated, but the - relative scores for the three NNIs will be correct. - */ -void QuartetConstraintPenalties(profile_t *profiles[4], int nConstraints, /*OUT*/double d[3]); - -double PairConstraintDistance(int nOn1, int nOff1, int nOn2, int nOff2); - -/* the split is consistent with the constraint if any of the profiles have no data - or if three of the profiles have the same uniform value (all on or all off) - or if AB|CD = 00|11 or 11|00 (all uniform) - */ -bool SplitViolatesConstraint(profile_t *profiles[4], int iConstraint); - -/* If false, no values were set because this constraint was not relevant. - output is for the 3 splits -*/ -bool QuartetConstraintPenaltiesPiece(profile_t *profiles[4], int iConstraint, /*OUT*/double penalty[3]); - -/* Apply Jukes-Cantor or scoredist-like log(1-d) transform - to correct the distance for multiple substitutions. -*/ -double LogCorrect(double distance); - -/* AverageProfile is used to do a weighted combination of nodes - when doing a join. If weight is negative, then the value is ignored and the profiles - are averaged. The weight is *not* adjusted for the gap content of the nodes. - Also, the weight does not affect the representation of the constraints -*/ -profile_t *AverageProfile(profile_t *profile1, profile_t *profile2, - int nPos, int nConstraints, - distance_matrix_t *distance_matrix, - double weight1); - -/* PosteriorProfile() is like AverageProfile() but it computes posterior probabilities - rather than an average -*/ -profile_t *PosteriorProfile(profile_t *profile1, profile_t *profile2, - double len1, double len2, - /*OPTIONAL*/transition_matrix_t *transmat, - rates_t *rates, - int nPos, int nConstraints); - -/* Set a node's profile from its children. 
- Deletes the previous profile if it exists - Use -1.0 for a balanced join - Fails unless the node has two children (e.g., no leaves or root) -*/ -void SetProfile(/*IN/OUT*/NJ_t *NJ, int node, double weight1); - -/* OutProfile does an unweighted combination of nodes to create the - out-profile. It always sets code to NOCODE so that UpdateOutProfile - can work. -*/ -profile_t *OutProfile(profile_t **profiles, int nProfiles, - int nPos, int nConstraints, - distance_matrix_t *distance_matrix); - -void UpdateOutProfile(/*UPDATE*/profile_t *out, profile_t *old1, profile_t *old2, - profile_t *new, int nActiveOld, - int nPos, int nConstraints, - distance_matrix_t *distance_matrix); - -profile_t *NewProfile(int nPos, int nConstraints); /* returned has no vectors */ -profile_t *FreeProfile(profile_t *profile, int nPos, int nConstraints); /* returns NULL */ - -void AllocRateCategories(/*IN/OUT*/rates_t *rates, int nRateCategories, int nPos); - -/* f1 can be NULL if code1 != NOCODE, and similarly for f2 - Or, if (say) weight1 was 0, then can have code1==NOCODE *and* f1==NULL - In that case, returns an arbitrary large number. 
-*/ -double ProfileDistPiece(unsigned int code1, unsigned int code2, - numeric_t *f1, numeric_t *f2, - /*OPTIONAL*/distance_matrix_t *dmat, - /*OPTIONAL*/numeric_t *codeDist2); - -/* Adds (or subtracts, if weight is negative) fIn/codeIn from fOut - fOut is assumed to exist (as from an outprofile) - do not call unless weight of input profile > 0 - */ -void AddToFreq(/*IN/OUT*/numeric_t *fOut, double weight, - unsigned int codeIn, /*OPTIONAL*/numeric_t *fIn, - /*OPTIONAL*/distance_matrix_t *dmat); - -/* Divide the vector (of length nCodes) by a constant - so that the total (unrotated) frequency is 1.0 */ -void NormalizeFreq(/*IN/OUT*/numeric_t *freq, distance_matrix_t *distance_matrix); - -/* Allocate, if necessary, and recompute the codeDist*/ -void SetCodeDist(/*IN/OUT*/profile_t *profile, int nPos, distance_matrix_t *dmat); - -/* The allhits list contains the distances of the node to all other active nodes - This is useful for the "reset" improvement to the visible set - Note that the following routines do not handle the tophits heuristic - and assume that out-distances are up to date. -*/ -void SetBestHit(int node, NJ_t *NJ, int nActive, - /*OUT*/besthit_t *bestjoin, - /*OUT OPTIONAL*/besthit_t *allhits); -void ExhaustiveNJSearch(NJ_t *NJ, int nActive, /*OUT*/besthit_t *bestjoin); - -/* Searches the visible set */ -void FastNJSearch(NJ_t *NJ, int nActive, /*UPDATE*/besthit_t *visible, /*OUT*/besthit_t *bestjoin); - -/* Subroutines for handling the tophits heuristic */ - -top_hits_t *InitTopHits(NJ_t *NJ, int m); -top_hits_t *FreeTopHits(top_hits_t *tophits); /* returns NULL */ - -/* Before we do any joins -- sets tophits and visible - NJ may be modified by setting out-distances - */ -void SetAllLeafTopHits(/*IN/UPDATE*/NJ_t *NJ, /*IN/OUT*/top_hits_t *tophits); - -/* Find the best join to do. 
*/ -void TopHitNJSearch(/*IN/UPDATE*/NJ_t *NJ, - int nActive, - /*IN/OUT*/top_hits_t *tophits, - /*OUT*/besthit_t *bestjoin); - -/* Returns the best hit within top hits - NJ may be modified because it updates out-distances if they are too stale - Does *not* update visible set -*/ -void GetBestFromTopHits(int iNode, /*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN*/top_hits_t *tophits, - /*OUT*/besthit_t *bestjoin); - -/* visible set is modifiable so that we can reset it more globally when we do - a "refresh", but we also set the visible set for newnode and do any - "reset" updates too. And, we update many outdistances. - */ -void TopHitJoin(int newnode, - /*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN/OUT*/top_hits_t *tophits); - -/* Sort the input besthits by criterion - and save the best nOut hits as a new array in top_hits_lists - Does not update criterion or out-distances - Ignores (silently removes) hit to self - Saved list may be shorter than requested if there are insufficient entries -*/ -void SortSaveBestHits(int iNode, /*IN/SORT*/besthit_t *besthits, - int nIn, int nOut, - /*IN/OUT*/top_hits_t *tophits); - -/* Given candidate hits from one node, "transfer" them to another node: - Stores them in a new place in the same order - searches up to active nodes if hits involve non-active nodes - If update flag is set, it also recomputes distance and criterion - (and ensures that out-distances are updated); otherwise - it sets dist to -1e20 and criterion to 1e20 - - */ -void TransferBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, - int iNode, - /*IN*/besthit_t *oldhits, - int nOldHits, - /*OUT*/besthit_t *newhits, - bool updateDistance); - -/* Create best hit objects from 1 or more hits. Do not update out-distances or set criteria */ -void HitsToBestHits(/*IN*/hit_t *hits, int nHits, int iNode, /*OUT*/besthit_t *newhits); -besthit_t HitToBestHit(int i, hit_t hit); - -/* Given a set of besthit entries, - look for improvements to the visible set of the j entries. 
- Updates out-distances as it goes. - Also replaces stale nodes with this node, because a join is usually - how this happens (i.e. it does not need to walk up to ancestors). - Note this calls UpdateTopVisible() on any change -*/ -void UpdateVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN*/besthit_t *tophitsNode, - int nTopHits, - /*IN/OUT*/top_hits_t *tophits); - -/* Update the top-visible list to perhaps include this hit (O(sqrt(N)) time) */ -void UpdateTopVisible(/*IN*/NJ_t * NJ, int nActive, - int iNode, /*IN*/hit_t *hit, - /*IN/OUT*/top_hits_t *tophits); - -/* Recompute the top-visible subset of the visible set */ -void ResetTopVisible(/*IN/UPDATE*/NJ_t *NJ, - int nActive, - /*IN/OUT*/top_hits_t *tophits); - -/* Make a shorter list with only unique entries. - Replaces any "dead" hits to nodes that have parents with their active ancestors - and ignores any that become dead. - Updates all criteria. - Combined gets sorted by i & j - The returned list is allocated to nCombined even though only *nUniqueOut entries are filled -*/ -besthit_t *UniqueBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN/SORT*/besthit_t *combined, int nCombined, - /*OUT*/int *nUniqueOut); - -nni_t ChooseNNI(profile_t *profiles[4], - /*OPTIONAL*/distance_matrix_t *dmat, - int nPos, int nConstraints, - /*OUT*/double criteria[3]); /* The three internal branch lengths or log likelihoods*/ - -/* length[] is ordered as described by quartet_length_t, but after we do the swap - of B with C (to give AC|BD) or B with D (to get AD|BC), if that is the returned choice - bFast means do not consider NNIs if AB|CD is noticeably better than the star topology - (as implemented by MLQuartetOptimize). 
- If there are constraints, then the constraint penalty is included in criteria[] -*/ -nni_t MLQuartetNNI(profile_t *profiles[4], - /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - int nPos, int nConstraints, - /*OUT*/double criteria[3], /* The three potential quartet log-likelihoods */ - /*IN/OUT*/numeric_t length[5], - bool bFast); - -void OptimizeAllBranchLengths(/*IN/OUT*/NJ_t *NJ); -double TreeLogLk(/*IN*/NJ_t *NJ, /*OPTIONAL OUT*/double *site_loglk); -double MLQuartetLogLk(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN*/double branch_lengths[5], - /*OPTIONAL OUT*/double *site_likelihoods); - -/* Given a topology and branch lengths, estimate rates & recompute profiles */ -void SetMLRates(/*IN/OUT*/NJ_t *NJ, int nRateCategories); - -/* Returns a set of nRateCategories potential rates; the caller must free it */ -numeric_t *MLSiteRates(int nRateCategories); - -/* returns site_loglk so that - site_loglk[nPos*iRate + j] is the log likelihood of site j with rate iRate - The caller must free it. -*/ -double *MLSiteLikelihoodsByRate(/*IN*/NJ_t *NJ, /*IN*/numeric_t *rates, int nRateCategories); - -typedef struct { - double mult; /* multiplier for the rates / divisor for the tree-length */ - double alpha; - int nPos; - int nRateCats; - numeric_t *rates; - double *site_loglk; -} siteratelk_t; - -double GammaLogLk(/*IN*/siteratelk_t *s, /*OPTIONAL OUT*/double *gamma_loglk_sites); - -/* Input site_loglk must be for each rate. Note that FastTree does not reoptimize - the branch lengths under the Gamma model -- it optimizes the overall scale. - Reports the gamma log likelihood (and logs site likelihoods if fpLog is set), - and reports the rescaling value.
-*/ -double RescaleGammaLogLk(int nPos, int nRateCats, - /*IN*/numeric_t *rates, /*IN*/double *site_loglk, - /*OPTIONAL*/FILE *fpLog); - -/* P(value<=x) for the gamma distribution with shape parameter alpha and scale 1/alpha */ -double PGamma(double x, double alpha); - -/* Given a topology and branch lengths, optimize GTR rates and quickly reoptimize branch lengths - If gtrfreq is NULL, then empirical frequencies are used -*/ -void SetMLGtr(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL IN*/double *gtrfreq, /*OPTIONAL WRITE*/FILE *fpLog); - -/* P(A & B | len) = P(B | A, len) * P(A) - If site_likelihoods is present, multiplies those values by the site likelihood at each point - (Note it does not handle underflow) - */ -double PairLogLk(/*IN*/profile_t *p1, /*IN*/profile_t *p2, double length, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*OPTIONAL IN/OUT*/double *site_likelihoods); - -/* Branch lengths for 4-taxon tree ((A,B),C,D); I means internal */ -typedef enum {LEN_A,LEN_B,LEN_C,LEN_D,LEN_I} quartet_length_t; - -typedef struct { - int nPos; - transition_matrix_t *transmat; - rates_t *rates; - int nEval; /* number of likelihood evaluations */ - /* The pair to optimize */ - profile_t *pair1; - profile_t *pair2; -} quartet_opt_t; - -double PairNegLogLk(double x, void *data); /* data must be a quartet_opt_t */ - -typedef struct { - NJ_t *NJ; - double freq[4]; - double rates[6]; - int iRate; /* which rate is set from x (see GTRNegLogLk) */ - FILE *fpLog; /* OPTIONAL WRITE */ -} gtr_opt_t; - -/* Returns -log_likelihood for the tree with the given rates - data must be a gtr_opt_t and x is used to set rate iRate - Does not recompute profiles -- assumes that the caller will -*/ -double GTRNegLogLk(double x, void *data); - -/* Returns the resulting log likelihood. Optionally returns whether other - topologies should be abandoned, based on the difference between AB|CD and - the "star topology" (AB|CD with a branch length of MLMinBranchLength) exceeding - closeLogLkLimit.
- If bStarTest is passed in, it only optimized the internal branch if - the star test is true. Otherwise, it optimized all 5 branch lengths - in turn. - */ -double MLQuartetOptimize(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN/OUT*/double branch_lengths[5], - /*OPTIONAL OUT*/bool *pStarTest, - /*OPTIONAL OUT*/double *site_likelihoods); - -/* Returns the resulting log likelihood */ -double MLPairOptimize(profile_t *pA, profile_t *pB, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN/OUT*/double *branch_length); - -/* Returns the number of steps considered, with the actual steps in steps[] - Modifies the tree by this chain of NNIs -*/ -int FindSPRSteps(/*IN/OUT*/NJ_t *NJ, - int node, - int parent, /* sibling or parent of node to NNI to start the chain */ - /*IN/OUT*/profile_t **upProfiles, - /*OUT*/spr_step_t *steps, - int maxSteps, - bool bFirstAC); - -/* Undo a single NNI */ -void UnwindSPRStep(/*IN/OUT*/NJ_t *NJ, - /*IN*/spr_step_t *step, - /*IN/OUT*/profile_t **upProfiles); - - -/* Update the profile of node and its ancestor, and delete nearby out-profiles */ -void UpdateForNNI(/*IN/OUT*/NJ_t *NJ, int node, /*IN/OUT*/profile_t **upProfiles, bool useML); - -/* Sets NJ->parent[newchild] and replaces oldchild with newchild - in the list of children of parent -*/ -void ReplaceChild(/*IN/OUT*/NJ_t *NJ, int parent, int oldchild, int newchild); - -int CompareHitsByCriterion(const void *c1, const void *c2); -int CompareHitsByIJ(const void *c1, const void *c2); - -int NGaps(NJ_t *NJ, int node); /* only handles leaf sequences */ - -/* node is the parent of AB, sibling of C - node cannot be root or a leaf - If node is the child of root, then D is the other sibling of node, - and the 4th profile is D's profile. 
- Otherwise, D is the parent of node, and we use its upprofile - Call this with profiles=NULL to get the nodes, without fetching or - computing profiles -*/ -void SetupABCD(NJ_t *NJ, int node, - /* the 4 profiles for ABCD; the last one is an upprofile */ - /*OPTIONAL OUT*/profile_t *profiles[4], - /*OPTIONAL IN/OUT*/profile_t **upProfiles, - /*OUT*/int nodeABCD[4], - bool useML); - -int Sibling(NJ_t *NJ, int node); /* At root, no unique sibling so returns -1 */ -void RootSiblings(NJ_t *NJ, int node, /*OUT*/int sibs[2]); - -/* JC probability of nucleotide not changing, for each rate category */ -double *PSameVector(double length, rates_t *rates); - -/* JC probability of nucleotide changing, derived from pSame, for each rate category (NOTE(review): exact definition is in the implementation, not visible here -- confirm) */ -double *PDiffVector(double *pSame, rates_t *rates); - -/* expeigen[iRate*nCodes + j] = exp(length * rate iRate * eigenvalue j) */ -numeric_t *ExpEigenRates(double length, transition_matrix_t *transmat, rates_t *rates); - -/* Print a progress report if more than 0.1 second has gone by since the last progress report */ -/* Format should include 0-4 %d references and no newlines */ -void ProgressReport(char *format, int iArg1, int iArg2, int iArg3, int iArg4); -void LogTree(char *format, int round, /*OPTIONAL WRITE*/FILE *fp, NJ_t *NJ, char **names, uniquify_t *unique, bool bQuote); -void LogMLRates(/*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ); - -void *mymalloc(size_t sz); /* Prints "Out of memory" and exits on failure */ -void *myfree(void *, size_t sz); /* Always returns NULL */ - -/* One-dimensional minimization using Brent's method, with - a fractional and an absolute tolerance */ -double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double,void*), void *data, - double ftol, double atol, - /*OUT*/double *fx, /*OUT*/double *f2x); - -double brent(double ax, double bx, double cx, double (*f)(double, void *), void *data, - double ftol, double atol, - double *foptx, double *f2optx, double fax, double fbx, double fcx); - -/* Vector
operations, either using SSE3 or not - Code assumes that vectors are a multiple of 4 in size -*/ -void vector_multiply(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n, /*OUT*/numeric_t *fOut); -numeric_t vector_multiply_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n); -void vector_add_mult(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t *add, numeric_t weight, int n); - -/* multiply the transpose of a matrix by a vector */ -void matrixt_by_vector4(/*IN*/numeric_t mat[4][MAXCODES], /*IN*/numeric_t vec[4], /*OUT*/numeric_t out[4]); - -/* sum(f1*fBy)*sum(f2*fBy) */ -numeric_t vector_dot_product_rot(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* fBy, int n); - -/* sum(f1*f2*f3) */ -numeric_t vector_multiply3_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* f3, int n); - -numeric_t vector_sum(/*IN*/numeric_t *f1, int n); -void vector_multiply_by(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t fBy, int n); - -double clockDiff(/*IN*/struct timeval *clock_start); -int timeval_subtract (/*OUT*/struct timeval *result, /*IN*/struct timeval *x, /*IN*/struct timeval *y); - -char *OpenMPString(void); /* string printed right after SSE_STRING in the version banners in main() */ - -void ran_start(long seed); /* main() calls this with the value given to -seed */ -double knuth_rand(); /* Random number between 0 and 1 */ -void tred2 (double *a, const int n, const int np, double *d, double *e); -double pythag(double a, double b); -void tqli(double *d, double *e, int n, int np, double *z); - -/* Like mymalloc; duplicates the input (returns NULL if given NULL) */ -void *mymemdup(void *data, size_t sz); -void *myrealloc(void *data, size_t szOld, size_t szNew, bool bCopy); - -double pnorm(double z); /* Probability(value <= z) */ - -/* Hashtable functions */ -typedef struct -{ - char *string; - int nCount; /* number of times this entry was seen */ - int first; /* index of first entry with this value */ -} hashbucket_t; - -typedef struct { - int nBuckets; - /* hashvalue -> bucket.
Or look in bucket + 1, +2, etc., till you hit a NULL string */ - hashbucket_t *buckets; -} hashstrings_t; -typedef int hashiterator_t; - -hashstrings_t *MakeHashtable(char **strings, int nStrings); -hashstrings_t *FreeHashtable(hashstrings_t* hash); /* returns NULL */ -hashiterator_t FindMatch(hashstrings_t *hash, char *string); - -/* Return NULL if we have run out of values */ -char *GetHashString(hashstrings_t *hash, hashiterator_t hi); -int HashCount(hashstrings_t *hash, hashiterator_t hi); /* main() treats a count other than 1 as a non-unique name */ -int HashFirst(hashstrings_t *hash, hashiterator_t hi); - -void PrintNJ(/*WRITE*/FILE *, NJ_t *NJ, char **names, uniquify_t *unique, bool bShowSupport, bool bQuoteNames); - -/* Print topology using node indices as node names */ -void PrintNJInternal(/*WRITE*/FILE *, NJ_t *NJ, bool useLen); - -uniquify_t *UniquifyAln(/*IN*/alignment_t *aln); -uniquify_t *FreeUniquify(uniquify_t *); /* returns NULL */ - -/* Convert a constraint alignment to a list of sequences. The returned array is indexed - by iUnique and points to values in the input alignment -*/ -char **AlnToConstraints(alignment_t *constraints, uniquify_t *unique, hashstrings_t *hashnames); - -/* ReadTree ignores non-unique leaves after the first instance. - At the end, it prunes the tree to ignore empty children and it - unroots the tree if necessary.
-*/ -void ReadTree(/*IN/OUT*/NJ_t *NJ, - /*IN*/uniquify_t *unique, - /*IN*/hashstrings_t *hashnames, - /*READ*/FILE *fpInTree); -char *ReadTreeToken(/*READ*/FILE *fp); /* returns a static array, or NULL on EOF */ -void ReadTreeAddChild(int parent, int child, /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children); -/* Do not add the leaf if we already set this unique-set to another parent */ -void ReadTreeMaybeAddLeaf(int parent, char *name, - hashstrings_t *hashnames, uniquify_t *unique, - /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children); -void ReadTreeRemove(/*IN/OUT*/int *parents, /*IN/OUT*/children_t *children, int node); - -/* Routines to support tree traversal and prevent visiting a node more than once - (esp. if topology changes). -*/ -typedef bool *traversal_t; -traversal_t InitTraversal(NJ_t*); -void SkipTraversalInto(int node, /*IN/OUT*/traversal_t traversal); -traversal_t FreeTraversal(traversal_t, NJ_t*); /* returns NULL */ - -/* returns new node, or -1 if nothing left to do. Use root for the first call. - Will return every node and then root. - Uses postorder tree traversal (depth-first search going down to leaves first) - Keeps track of which nodes are visited, so even after an NNI that swaps a - visited child with an unvisited uncle, the next call will visit the - was-uncle-now-child. (However, after SPR moves, there is no such guarantee.) - - If pUp is not NULL, then, if going "back up" through a previously visited node - (presumably due to an NNI), then it will return the node another time, - with *pUp = true.
-*/ -int TraversePostorder(int lastnode, NJ_t *NJ, /*IN/OUT*/traversal_t, - /*OUT OPTIONAL*/bool *pUp); - -/* Routines to support storing up-profiles during tree traversal - Eventually these should be smart enough to do weighted joins and - to minimize memory usage -*/ -profile_t **UpProfiles(NJ_t *NJ); -profile_t *GetUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node, bool useML); -profile_t *DeleteUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node); /* returns NULL */ -profile_t **FreeUpProfiles(profile_t **upProfiles, NJ_t *NJ); /* returns NULL */ - -/* Recomputes the profile for a node, presumably to reflect topology changes - If bionj is set, does a weighted join -- which requires using upProfiles - If useML is set, computes the posterior probability instead of averaging - */ -void RecomputeProfile(/*IN/OUT*/NJ_t *NJ, /*IN/OUT*/profile_t **upProfiles, int node, bool useML); - -/* Recompute profiles going up from the leaves, using the provided distance matrix - and unweighted joins -*/ -void RecomputeProfiles(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL*/distance_matrix_t *dmat); - -void RecomputeMLProfiles(/*IN/OUT*/NJ_t *NJ); /* NOTE(review): undocumented in the original; by name presumably the maximum-likelihood counterpart of RecomputeProfiles -- confirm against the definition */ - -/* If bionj is set, computes the weight to be given to A when computing the - profile for the ancestor of A and B. C and D are the other profiles in the quartet - If bionj is not set, returns -1 (which means unweighted in AverageProfile).
- (A and B are the first two profiles in the array) -*/ -double QuartetWeight(profile_t *profiles[4], distance_matrix_t *dmat, int nPos); - -/* Returns a list of nodes, starting with node and ending with root */ -int *PathToRoot(NJ_t *NJ, int node, /*OUT*/int *depth); -int *FreePath(int *path, NJ_t *NJ); /* returns NULL */ - -/* The default amino acid distance matrix, derived from the BLOSUM45 similarity matrix */ -distance_matrix_t matrixBLOSUM45; - -/* The default amino acid transition matrix (Jones Taylor Thorton 1992) */ -double matrixJTT92[MAXCODES][MAXCODES]; -double statJTT92[MAXCODES]; - -/* The Le-Gascuel 2008 amino acid transition matrix */ -double matrixLG08[MAXCODES][MAXCODES]; -double statLG08[MAXCODES]; - -/* The WAG amino acid transition matrix (Whelan-And-Goldman 2001) */ -double matrixWAG01[MAXCODES][MAXCODES]; -double statWAG01[MAXCODES]; - - -int main(int argc, char **argv) { - int nAlign = 1; /* number of alignments to read */ - int iArg; - char *matrixPrefix = NULL; - char *transitionFile = NULL; - distance_matrix_t *distance_matrix = NULL; - bool make_matrix = false; - char *constraintsFile = NULL; - char *intreeFile = NULL; - bool intree1 = false; /* the same starting tree each round */ - int nni = -1; /* number of rounds of NNI, defaults to 4*log2(n) */ - int spr = 2; /* number of rounds of SPR */ - int maxSPRLength = 10; /* maximum distance to move a node */ - int MLnni = -1; /* number of rounds of ML NNI, defaults to 2*log2(n) */ - bool MLlen = false; /* optimize branch lengths; no topology changes */ - int nBootstrap = 1000; /* If set, number of replicates of local bootstrap to do */ - int nRateCats = nDefaultRateCats; - char *logfile = NULL; - bool bUseGtr = false; - bool bUseLg = false; - bool bUseWag = false; - bool bUseGtrRates = false; - double gtrrates[6] = {1,1,1,1,1,1}; - bool bUseGtrFreq = false; - double gtrfreq[4] = {0.25,0.25,0.25,0.25}; - bool bQuote = false; - FILE *fpOut = stdout; - - if (isatty(STDIN_FILENO) && argc == 1) 
{ - fprintf(stderr,"Usage for FastTree version %s %s%s:\n%s", - FT_VERSION, SSE_STRING, OpenMPString(), usage); -#if (defined _WIN32 || defined WIN32 || defined WIN64 || defined _WIN64) - fprintf(stderr, "Windows users: Please remember to run this inside a command shell\n"); - fprintf(stderr,"Hit return to continue\n"); - fgetc(stdin); -#endif - exit(0); - } - for (iArg = 1; iArg < argc; iArg++) { - if (strcmp(argv[iArg],"-makematrix") == 0) { - make_matrix = true; - } else if (strcmp(argv[iArg],"-logdist") == 0) { - fprintf(stderr, "Warning: logdist is now on by default and obsolete\n"); - } else if (strcmp(argv[iArg],"-rawdist") == 0) { - logdist = false; - } else if (strcmp(argv[iArg],"-verbose") == 0 && iArg < argc-1) { - verbose = atoi(argv[++iArg]); - } else if (strcmp(argv[iArg],"-quiet") == 0) { - verbose = 0; - showProgress = 0; - } else if (strcmp(argv[iArg],"-nopr") == 0) { - showProgress = 0; - } else if (strcmp(argv[iArg],"-slow") == 0) { - slow = 1; - } else if (strcmp(argv[iArg],"-fastest") == 0) { - fastest = 1; - tophitsRefresh = 0.5; - useTopHits2nd = true; - } else if (strcmp(argv[iArg],"-2nd") == 0) { - useTopHits2nd = true; - } else if (strcmp(argv[iArg],"-no2nd") == 0) { - useTopHits2nd = false; - } else if (strcmp(argv[iArg],"-slownni") == 0) { - fastNNI = false; - } else if (strcmp(argv[iArg], "-matrix") == 0 && iArg < argc-1) { - iArg++; - matrixPrefix = argv[iArg]; - } else if (strcmp(argv[iArg], "-nomatrix") == 0) { - useMatrix = false; - } else if (strcmp(argv[iArg], "-n") == 0 && iArg < argc-1) { - iArg++; - nAlign = atoi(argv[iArg]); - if (nAlign < 1) { - fprintf(stderr, "-n argument for #input alignments must be > 0 not %s\n", argv[iArg]); - exit(1); - } - } else if (strcmp(argv[iArg], "-quote") == 0) { - bQuote = true; - } else if (strcmp(argv[iArg], "-nt") == 0) { - nCodes = 4; - } else if (strcmp(argv[iArg], "-intree") == 0 && iArg < argc-1) { - iArg++; - intreeFile = argv[iArg]; - } else if (strcmp(argv[iArg], "-intree1") == 0 && 
iArg < argc-1) { - iArg++; - intreeFile = argv[iArg]; - intree1 = true; - } else if (strcmp(argv[iArg], "-nj") == 0) { - bionj = 0; - } else if (strcmp(argv[iArg], "-bionj") == 0) { - bionj = 1; - } else if (strcmp(argv[iArg], "-boot") == 0 && iArg < argc-1) { - iArg++; - nBootstrap = atoi(argv[iArg]); - } else if (strcmp(argv[iArg], "-noboot") == 0 || strcmp(argv[iArg], "-nosupport") == 0) { - nBootstrap = 0; - } else if (strcmp(argv[iArg], "-seed") == 0 && iArg < argc-1) { - iArg++; - long seed = atol(argv[iArg]); - ran_start(seed); - } else if (strcmp(argv[iArg],"-top") == 0) { - if(tophitsMult < 0.01) - tophitsMult = 1.0; - } else if (strcmp(argv[iArg],"-notop") == 0) { - tophitsMult = 0.0; - } else if (strcmp(argv[iArg], "-topm") == 0 && iArg < argc-1) { - iArg++; - tophitsMult = atof(argv[iArg]); - } else if (strcmp(argv[iArg], "-close") == 0 && iArg < argc-1) { - iArg++; - tophitsClose = atof(argv[iArg]); - if (tophitsMult <= 0) { - fprintf(stderr, "Cannot use -close unless -top is set above 0\n"); - exit(1); - } - if (tophitsClose <= 0 || tophitsClose >= 1) { - fprintf(stderr, "-close argument must be between 0 and 1\n"); - exit(1); - } - } else if (strcmp(argv[iArg], "-refresh") == 0 && iArg < argc-1) { - iArg++; - tophitsRefresh = atof(argv[iArg]); - if (tophitsMult <= 0) { - fprintf(stderr, "Cannot use -refresh unless -top is set above 0\n"); - exit(1); - } - if (tophitsRefresh <= 0 || tophitsRefresh >= 1) { - fprintf(stderr, "-refresh argument must be between 0 and 1\n"); - exit(1); - } - } else if (strcmp(argv[iArg],"-nni") == 0 && iArg < argc-1) { - iArg++; - nni = atoi(argv[iArg]); - if (nni == 0) - spr = 0; - } else if (strcmp(argv[iArg],"-spr") == 0 && iArg < argc-1) { - iArg++; - spr = atoi(argv[iArg]); - } else if (strcmp(argv[iArg],"-sprlength") == 0 && iArg < argc-1) { - iArg++; - maxSPRLength = atoi(argv[iArg]); - } else if (strcmp(argv[iArg],"-mlnni") == 0 && iArg < argc-1) { - iArg++; - MLnni = atoi(argv[iArg]); - } else if 
(strcmp(argv[iArg],"-noml") == 0) { - MLnni = 0; - } else if (strcmp(argv[iArg],"-mllen") == 0) { - MLnni = 0; - MLlen = true; - } else if (strcmp(argv[iArg],"-nome") == 0) { - spr = 0; - nni = 0; - } else if (strcmp(argv[iArg],"-help") == 0) { - fprintf(stderr,"FastTree %s %s%s:\n%s", FT_VERSION, SSE_STRING, OpenMPString(), usage); - exit(0); - } else if (strcmp(argv[iArg],"-expert") == 0) { - fprintf(stderr, "Detailed usage for FastTree %s %s%s:\n%s", - FT_VERSION, SSE_STRING, OpenMPString(), expertUsage); - exit(0); - } else if (strcmp(argv[iArg],"-pseudo") == 0) { - if (iArg < argc-1 && isdigit(argv[iArg+1][0])) { - iArg++; - pseudoWeight = atof(argv[iArg]); - if (pseudoWeight < 0.0) { - fprintf(stderr,"Illegal argument to -pseudo: %s\n", argv[iArg]); - exit(1); - } - } else { - pseudoWeight = 1.0; - } - } else if (strcmp(argv[iArg],"-constraints") == 0 && iArg < argc-1) { - iArg++; - constraintsFile = argv[iArg]; - } else if (strcmp(argv[iArg],"-constraintWeight") == 0 && iArg < argc-1) { - iArg++; - constraintWeight = atof(argv[iArg]); - if (constraintWeight <= 0.0) { - fprintf(stderr, "Illegal argument to -constraintWeight (must be greater than zero): %s\n", argv[iArg]); - exit(1); - } - } else if (strcmp(argv[iArg],"-mlacc") == 0 && iArg < argc-1) { - iArg++; - mlAccuracy = atoi(argv[iArg]); - if (mlAccuracy < 1) { - fprintf(stderr, "Illlegal -mlacc argument: %s\n", argv[iArg]); - exit(1); - } - } else if (strcmp(argv[iArg],"-exactml") == 0 || strcmp(argv[iArg],"-mlexact") == 0) { - fprintf(stderr,"-exactml is not required -- exact posteriors is the default now\n"); - } else if (strcmp(argv[iArg],"-approxml") == 0 || strcmp(argv[iArg],"-mlapprox") == 0) { - exactML = false; - } else if (strcmp(argv[iArg],"-cat") == 0 && iArg < argc-1) { - iArg++; - nRateCats = atoi(argv[iArg]); - if (nRateCats < 1) { - fprintf(stderr, "Illlegal argument to -ncat (must be greater than zero): %s\n", argv[iArg]); - exit(1); - } - } else if (strcmp(argv[iArg],"-nocat") == 0) { 
- nRateCats = 1; - } else if (strcmp(argv[iArg], "-lg") == 0) { - bUseLg = true; - } else if (strcmp(argv[iArg], "-wag") == 0) { - bUseWag = true; - } else if (strcmp(argv[iArg], "-gtr") == 0) { - bUseGtr = true; - } else if (strcmp(argv[iArg], "-trans") == 0 && iArg < argc-1) { - iArg++; - transitionFile = argv[iArg]; - } else if (strcmp(argv[iArg], "-gtrrates") == 0 && iArg < argc-6) { - bUseGtr = true; - bUseGtrRates = true; - int i; - for (i = 0; i < 6; i++) { - gtrrates[i] = atof(argv[++iArg]); - if (gtrrates[i] < 1e-5) { - fprintf(stderr, "Illegal or too small value of GTR rate: %s\n", argv[iArg]); - exit(1); - } - } - } else if (strcmp(argv[iArg],"-gtrfreq") == 0 && iArg < argc-4) { - bUseGtr = true; - bUseGtrFreq = true; - int i; - double sum = 0; - for (i = 0; i < 4; i++) { - gtrfreq[i] = atof(argv[++iArg]); - sum += gtrfreq[i]; - if (gtrfreq[i] < 1e-5) { - fprintf(stderr, "Illegal or too small value of GTR frequency: %s\n", argv[iArg]); - exit(1); - } - } - if (fabs(1.0-sum) > 0.01) { - fprintf(stderr, "-gtrfreq values do not sum to 1\n"); - exit(1); - } - for (i = 0; i < 4; i++) - gtrfreq[i] /= sum; - } else if (strcmp(argv[iArg],"-log") == 0 && iArg < argc-1) { - iArg++; - logfile = argv[iArg]; - } else if (strcmp(argv[iArg],"-gamma") == 0) { - gammaLogLk = true; - } else if (strcmp(argv[iArg],"-out") == 0 && iArg < argc-1) { - iArg++; - fpOut = fopen(argv[iArg],"w"); - if(fpOut==NULL) { - fprintf(stderr,"Cannot write to %s\n",argv[iArg]); - exit(1); - } - } else if (argv[iArg][0] == '-') { - fprintf(stderr, "Unknown or incorrect use of option %s\n%s", argv[iArg], usage); - exit(1); - } else - break; - } - if(iArg < argc-1) { - fprintf(stderr, "%s", usage); - exit(1); - } - - codesString = nCodes == 20 ? 
codesStringAA : codesStringNT; - if (nCodes == 4 && matrixPrefix == NULL) - useMatrix = false; /* no default nucleotide matrix */ - if (transitionFile && nCodes != 20) { - fprintf(stderr, "The -trans option is only supported for amino acid alignments\n"); - exit(1); - } -#ifndef USE_DOUBLE - if (transitionFile) - fprintf(stderr, - "Warning: custom matrices may create numerical problems for single-precision FastTree.\n" - "You may want to recompile with -DUSE_DOUBLE\n"); -#endif - - char *fileName = iArg == (argc-1) ? argv[argc-1] : NULL; - - if (slow && fastest) { - fprintf(stderr,"Cannot be both slow and fastest\n"); - exit(1); - } - if (slow && tophitsMult > 0) { - tophitsMult = 0.0; - } - - FILE *fpLog = NULL; - if (logfile != NULL) { - fpLog = fopen(logfile, "w"); - if (fpLog == NULL) { - fprintf(stderr, "Cannot write to: %s\n", logfile); - exit(1); - } - fprintf(fpLog, "Command:"); - int i; - for (i=0; i < argc; i++) - fprintf(fpLog, " %s", argv[i]); - fprintf(fpLog,"\n"); - fflush(fpLog); - } - - int i; - FILE *fps[2] = {NULL,NULL}; - int nFPs = 0; - if (verbose) - fps[nFPs++] = stderr; - if (fpLog != NULL) - fps[nFPs++] = fpLog; - - if (!make_matrix) { /* Report settings */ - char tophitString[100] = "no"; - char tophitsCloseStr[100] = "default"; - if(tophitsClose > 0) sprintf(tophitsCloseStr,"%.2f",tophitsClose); - if(tophitsMult>0) sprintf(tophitString,"%.2f*sqrtN close=%s refresh=%.2f", - tophitsMult, tophitsCloseStr, tophitsRefresh); - char supportString[100] = "none"; - if (nBootstrap>0) { - if (MLnni != 0 || MLlen) - sprintf(supportString, "SH-like %d", nBootstrap); - else - sprintf(supportString,"Local boot %d",nBootstrap); - } - char nniString[100] = "(no NNI)"; - if (nni > 0) - sprintf(nniString, "+NNI (%d rounds)", nni); - if (nni == -1) - strcpy(nniString, "+NNI"); - char sprString[100] = "(no SPR)"; - if (spr > 0) - sprintf(sprString, "+SPR (%d rounds range %d)", spr, maxSPRLength); - char mlnniString[100] = "(no ML-NNI)"; - if(MLnni > 0) - 
sprintf(mlnniString, "+ML-NNI (%d rounds)", MLnni); - else if (MLnni == -1) - sprintf(mlnniString, "+ML-NNI"); - else if (MLlen) - sprintf(mlnniString, "+ML branch lengths"); - if ((MLlen || MLnni != 0) && !exactML) - strcat(mlnniString, " approx"); - if (MLnni != 0) - sprintf(mlnniString+strlen(mlnniString), " opt-each=%d",mlAccuracy); - - for (i = 0; i < nFPs; i++) { - FILE *fp = fps[i]; - fprintf(fp,"FastTree Version %s %s%s\nAlignment: %s", - FT_VERSION, SSE_STRING, OpenMPString(), fileName != NULL ? fileName : "standard input"); - if (nAlign>1) - fprintf(fp, " (%d alignments)", nAlign); - fprintf(fp,"\n%s distances: %s Joins: %s Support: %s\n", - nCodes == 20 ? "Amino acid" : "Nucleotide", - matrixPrefix ? matrixPrefix : (useMatrix? "BLOSUM45" - : (nCodes==4 && logdist ? "Jukes-Cantor" : "%different")), - bionj ? "weighted" : "balanced" , - supportString); - if (intreeFile == NULL) - fprintf(fp, "Search: %s%s %s %s %s\nTopHits: %s\n", - slow?"Exhaustive (slow)" : (fastest ? "Fastest" : "Normal"), - useTopHits2nd ? "+2nd" : "", - nniString, sprString, mlnniString, - tophitString); - else - fprintf(fp, "Start at tree from %s %s %s\n", intreeFile, nniString, sprString); - - if (MLnni != 0 || MLlen) { - fprintf(fp, "ML Model: %s,", - (nCodes == 4) ? - (bUseGtr ? "Generalized Time-Reversible" : "Jukes-Cantor") : - (transitionFile ? transitionFile : - (bUseLg ? "Le-Gascuel 2008" : (bUseWag ? 
"Whelan-And-Goldman" : "Jones-Taylor-Thorton")))); - if (nRateCats == 1) - fprintf(fp, " No rate variation across sites"); - else - fprintf(fp, " CAT approximation with %d rate categories", nRateCats); - fprintf(fp, "\n"); - if (nCodes == 4 && bUseGtrRates) - fprintf(fp, "GTR rates(ac ag at cg ct gt) %.4f %.4f %.4f %.4f %.4f %.4f\n", - gtrrates[0],gtrrates[1],gtrrates[2],gtrrates[3],gtrrates[4],gtrrates[5]); - if (nCodes == 4 && bUseGtrFreq) - fprintf(fp, "GTR frequencies(A C G T) %.4f %.4f %.4f %.4f\n", - gtrfreq[0],gtrfreq[1],gtrfreq[2],gtrfreq[3]); - } - if (constraintsFile != NULL) - fprintf(fp, "Constraints: %s Weight: %.3f\n", constraintsFile, constraintWeight); - if (pseudoWeight > 0) - fprintf(fp, "Pseudocount weight for comparing sequences with little overlap: %.3lf\n",pseudoWeight); - fflush(fp); - } - } - if (matrixPrefix != NULL) { - if (!useMatrix) { - fprintf(stderr,"Cannot use both -matrix and -nomatrix arguments!"); - exit(1); - } - distance_matrix = ReadDistanceMatrix(matrixPrefix); - } else if (useMatrix) { /* use default matrix */ - assert(nCodes==20); - distance_matrix = &matrixBLOSUM45; - SetupDistanceMatrix(distance_matrix); - } else { - distance_matrix = NULL; - } - - int iAln; - FILE *fpIn = fileName != NULL ? 
fopen(fileName, "r") : stdin; - if (fpIn == NULL) { - fprintf(stderr, "Cannot read %s\n", fileName); - exit(1); - } - FILE *fpConstraints = NULL; - if (constraintsFile != NULL) { - fpConstraints = fopen(constraintsFile, "r"); - if (fpConstraints == NULL) { - fprintf(stderr, "Cannot read %s\n", constraintsFile); - exit(1); - } - } - - FILE *fpInTree = NULL; - if (intreeFile != NULL) { - fpInTree = fopen(intreeFile,"r"); - if (fpInTree == NULL) { - fprintf(stderr, "Cannot read %s\n", intreeFile); - exit(1); - } - } - - for(iAln = 0; iAln < nAlign; iAln++) { - alignment_t *aln = ReadAlignment(fpIn, bQuote); - if (aln->nSeq < 1) { - fprintf(stderr, "No alignment sequences\n"); - exit(1); - } - if (fpLog) { - fprintf(fpLog, "Read %d sequences, %d positions\n", aln->nSeq, aln->nPos); - fflush(fpLog); - } - - struct timeval clock_start; - gettimeofday(&clock_start,NULL); - ProgressReport("Read alignment",0,0,0,0); - - /* Check that all names in alignment are unique */ - hashstrings_t *hashnames = MakeHashtable(aln->names, aln->nSeq); - int i; - for (i=0; i<aln->nSeq; i++) { - hashiterator_t hi = FindMatch(hashnames,aln->names[i]); - if (HashCount(hashnames,hi) != 1) { - fprintf(stderr,"Non-unique name '%s' in the alignment\n",aln->names[i]); - exit(1); - } - } - - /* Make a list of unique sequences -- note some lists are bigger than required */ - ProgressReport("Hashed the names",0,0,0,0); - if (make_matrix) { - NJ_t *NJ = InitNJ(aln->seqs, aln->nSeq, aln->nPos, - /*constraintSeqs*/NULL, /*nConstraints*/0, - distance_matrix, /*transmat*/NULL); - printf(" %d\n",aln->nSeq); - int i,j; - for(i = 0; i < NJ->nSeq; i++) { - printf("%s",aln->names[i]); - for (j = 0; j < NJ->nSeq; j++) { - besthit_t hit; - SeqDist(NJ->profiles[i]->codes,NJ->profiles[j]->codes,NJ->nPos,NJ->distance_matrix,/*OUT*/&hit); - if (logdist) - hit.dist = LogCorrect(hit.dist); - /* Make sure -0 prints as 0 */ - printf(" %f", hit.dist <= 0.0 ? 
0.0 : hit.dist); - } - printf("\n"); - } - } else { - /* reset counters*/ - profileOps = 0; - outprofileOps = 0; - seqOps = 0; - profileAvgOps = 0; - nHillBetter = 0; - nCloseUsed = 0; - nClose2Used = 0; - nRefreshTopHits = 0; - nVisibleUpdate = 0; - nNNI = 0; - nML_NNI = 0; - nProfileFreqAlloc = 0; - nProfileFreqAvoid = 0; - szAllAlloc = 0; - mymallocUsed = 0; - maxmallocHeap = 0; - nLkCompute = 0; - nPosteriorCompute = 0; - nAAPosteriorExact = 0; - nAAPosteriorRough = 0; - nStarTests = 0; - - uniquify_t *unique = UniquifyAln(aln); - ProgressReport("Identified unique sequences",0,0,0,0); - - /* read constraints */ - alignment_t *constraints = NULL; - char **uniqConstraints = NULL; - if (constraintsFile != NULL) { - constraints = ReadAlignment(fpConstraints, bQuote); - if (constraints->nSeq < 4) { - fprintf(stderr, "Warning: constraints file with less than 4 sequences ignored:\nalignment #%d in %s\n", - iAln+1, constraintsFile); - constraints = FreeAlignment(constraints); - } else { - uniqConstraints = AlnToConstraints(constraints, unique, hashnames); - ProgressReport("Read the constraints",0,0,0,0); - } - } /* end load constraints */ - - transition_matrix_t *transmat = NULL; - if (nCodes == 20) { - transmat = transitionFile? ReadAATransitionMatrix(transitionFile) : - (bUseLg? CreateTransitionMatrix(matrixLG08,statLG08) : - (bUseWag? CreateTransitionMatrix(matrixWAG01,statWAG01) : - CreateTransitionMatrix(matrixJTT92,statJTT92))); - } else if (nCodes == 4 && bUseGtr && (bUseGtrRates || bUseGtrFreq)) { - transmat = CreateGTR(gtrrates,gtrfreq); - } - NJ_t *NJ = InitNJ(unique->uniqueSeq, unique->nUnique, aln->nPos, - uniqConstraints, - uniqConstraints != NULL ? constraints->nPos : 0, /* nConstraints */ - distance_matrix, - transmat); - if (verbose>2) fprintf(stderr, "read %s seqs %d (%d unique) positions %d nameLast %s seqLast %s\n", - fileName ? 
fileName : "standard input", - aln->nSeq, unique->nUnique, aln->nPos, aln->names[aln->nSeq-1], aln->seqs[aln->nSeq-1]); - FreeAlignmentSeqs(/*IN/OUT*/aln); /*no longer needed*/ - if (fpInTree != NULL) { - if (intree1) - fseek(fpInTree, 0L, SEEK_SET); - ReadTree(/*IN/OUT*/NJ, /*IN*/unique, /*IN*/hashnames, /*READ*/fpInTree); - if (verbose > 2) - fprintf(stderr, "Read tree from %s\n", intreeFile); - if (verbose > 2) - PrintNJ(stderr, NJ, aln->names, unique, /*support*/false, bQuote); - } else { - FastNJ(NJ); - } - LogTree("NJ", 0, fpLog, NJ, aln->names, unique, bQuote); - - /* profile-frequencies for the "up-profiles" in ReliabilityNJ take only diameter(Tree)*L*a - space not N*L*a space, because we can free them as we go. - And up-profile by their nature tend to be complicated. - So save the profile-frequency memory allocation counters now to exclude later results. - */ -#ifdef TRACK_MEMORY - long svProfileFreqAlloc = nProfileFreqAlloc; - long svProfileFreqAvoid = nProfileFreqAvoid; -#endif - int nniToDo = nni == -1 ? (int)(0.5 + 4.0 * log(NJ->nSeq)/log(2)) : nni; - int sprRemaining = spr; - int MLnniToDo = (MLnni != -1) ? 
MLnni : (int)(0.5 + 2.0*log(NJ->nSeq)/log(2)); - if(verbose>0) { - if (fpInTree == NULL) - fprintf(stderr, "Initial topology in %.2f seconds\n", clockDiff(&clock_start)); - if (spr > 0 || nniToDo > 0 || MLnniToDo > 0) - fprintf(stderr,"Refining topology: %d rounds ME-NNIs, %d rounds ME-SPRs, %d rounds ML-NNIs\n", nniToDo, spr, MLnniToDo); - } - - if (nniToDo>0) { - int i; - bool bConverged = false; - nni_stats_t *nni_stats = InitNNIStats(NJ); - for (i=0; i < nniToDo; i++) { - double maxDelta; - if (!bConverged) { - int nChange = NNI(/*IN/OUT*/NJ, i, nniToDo, /*use ml*/false, /*IN/OUT*/nni_stats, /*OUT*/&maxDelta); - LogTree("ME_NNI%d",i+1, fpLog, NJ, aln->names, unique, bQuote); - if (nChange == 0) { - bConverged = true; - if (verbose>1) - fprintf(stderr, "Min_evolution NNIs converged at round %d -- skipping some rounds\n", i+1); - if (fpLog) - fprintf(fpLog, "Min_evolution NNIs converged at round %d -- skipping some rounds\n", i+1); - } - } - - /* Interleave SPRs with NNIs (typically 1/3rd NNI, SPR, 1/3rd NNI, SPR, 1/3rd NNI */ - if (sprRemaining > 0 && (nniToDo/(spr+1) > 0 && ((i+1) % (nniToDo/(spr+1))) == 0)) { - SPR(/*IN/OUT*/NJ, maxSPRLength, spr-sprRemaining, spr); - LogTree("ME_SPR%d",spr-sprRemaining+1, fpLog, NJ, aln->names, unique, bQuote); - sprRemaining--; - /* Restart the NNIs -- set all ages to 0, etc. */ - bConverged = false; - nni_stats = FreeNNIStats(nni_stats, NJ); - nni_stats = InitNNIStats(NJ); - } - } - nni_stats = FreeNNIStats(nni_stats, NJ); - } - while(sprRemaining > 0) { /* do any remaining SPR rounds */ - SPR(/*IN/OUT*/NJ, maxSPRLength, spr-sprRemaining, spr); - LogTree("ME_SPR%d",spr-sprRemaining+1, fpLog, NJ, aln->names, unique, bQuote); - sprRemaining--; - } - - /* In minimum-evolution mode, update branch lengths, even if no NNIs or SPRs, - so that they are log-corrected, do not include penalties from constraints, - and avoid errors due to approximation of out-distances. 
- If doing maximum-likelihood NNIs, then we'll also use these - to get estimates of starting distances for quartets, etc. - */ - UpdateBranchLengths(/*IN/OUT*/NJ); - LogTree("ME_Lengths",0, fpLog, NJ, aln->names, unique, bQuote); - - double total_len = 0; - int iNode; - for (iNode = 0; iNode < NJ->maxnode; iNode++) - total_len += fabs(NJ->branchlength[iNode]); - - if (verbose>0) { - fprintf(stderr, "Total branch-length %.3f after %.2f sec\n", - total_len, clockDiff(&clock_start)); - fflush(stderr); - } - if (fpLog) { - fprintf(fpLog, "Total branch-length %.3f after %.2f sec\n", - total_len, clockDiff(&clock_start)); - fflush(stderr); - } - -#ifdef TRACK_MEMORY - if (verbose>1) { - struct mallinfo mi = mallinfo(); - fprintf(stderr, "Memory @ end of ME phase: %.2f MB (%.1f byte/pos) useful %.2f expected %.2f\n", - (mi.arena+mi.hblkhd)/1.0e6, (mi.arena+mi.hblkhd)/(double)(NJ->nSeq*(double)NJ->nPos), - mi.uordblks/1.0e6, mymallocUsed/1e6); - } -#endif - - SplitCount_t splitcount = {0,0,0,0,0.0,0.0}; - - if (MLnniToDo > 0 || MLlen) { - bool warn_len = total_len/NJ->maxnode < 0.001 && MLMinBranchLengthTolerance > 1.0/aln->nPos; - bool warn = warn_len || (total_len/NJ->maxnode < 0.001 && aln->nPos >= 10000); - if (warn) - fprintf(stderr, "\nWARNING! This alignment consists of closely-related and very-long sequences.\n"); - if (warn_len) - fprintf(stderr, - "This version of FastTree may not report reasonable branch lengths!\n" -#ifdef USE_DOUBLE - "Consider changing MLMinBranchLengthTolerance.\n" -#else - "Consider recompiling FastTree with -DUSE_DOUBLE.\n" -#endif - "For more information, visit\n" - "http://www.microbesonline.org/fasttree/#BranchLen\n\n"); - if (warn) - fprintf(stderr, "WARNING! 
FastTree (or other standard maximum-likelihood tools)\n" - "may not be appropriate for aligments of very closely-related sequences\n" - "like this one, as FastTree does not account for recombination or gene conversion\n\n"); - - /* Do maximum-likelihood computations */ - /* Convert profiles to use the transition matrix */ - distance_matrix_t *tmatAsDist = TransMatToDistanceMat(/*OPTIONAL*/NJ->transmat); - RecomputeProfiles(NJ, /*OPTIONAL*/tmatAsDist); - tmatAsDist = myfree(tmatAsDist, sizeof(distance_matrix_t)); - double lastloglk = -1e20; - nni_stats_t *nni_stats = InitNNIStats(NJ); - bool resetGtr = nCodes == 4 && bUseGtr && !bUseGtrRates; - - if (MLlen) { - int iRound; - int maxRound = (int)(0.5 + log(NJ->nSeq)/log(2)); - double dLastLogLk = -1e20; - for (iRound = 1; iRound <= maxRound; iRound++) { - int node; - numeric_t *oldlength = (numeric_t*)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (node = 0; node < NJ->maxnode; node++) - oldlength[node] = NJ->branchlength[node]; - OptimizeAllBranchLengths(/*IN/OUT*/NJ); - LogTree("ML_Lengths",iRound, fpLog, NJ, aln->names, unique, bQuote); - double dMaxChange = 0; /* biggest change in branch length */ - for (node = 0; node < NJ->maxnode; node++) { - double d = fabs(oldlength[node] - NJ->branchlength[node]); - if (dMaxChange < d) - dMaxChange = d; - } - oldlength = myfree(oldlength, sizeof(numeric_t)*NJ->maxnodes); - double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); - bool bConverged = iRound > 1 && (dMaxChange < 0.001 || loglk < (dLastLogLk+treeLogLkDelta)); - if (verbose) - fprintf(stderr, "%d rounds ML lengths: LogLk %s= %.3lf Max-change %.4lf%s Time %.2f\n", - iRound, - exactML || nCodes != 20 ? "" : "~", - loglk, - dMaxChange, - bConverged ? " (converged)" : "", - clockDiff(&clock_start)); - if (fpLog) - fprintf(fpLog, "TreeLogLk\tLength%d\t%.4lf\tMaxChange\t%.4lf\n", - iRound, loglk, dMaxChange); - if (iRound == 1) { - if (resetGtr) - SetMLGtr(/*IN/OUT*/NJ, bUseGtrFreq ? 
gtrfreq : NULL, fpLog); - SetMLRates(/*IN/OUT*/NJ, nRateCats); - LogMLRates(fpLog, NJ); - } - if (bConverged) - break; - } - } - - if (MLnniToDo > 0) { - /* This may help us converge faster, and is fast */ - OptimizeAllBranchLengths(/*IN/OUT*/NJ); - LogTree("ML_Lengths%d",1, fpLog, NJ, aln->names, unique, bQuote); - } - - int iMLnni; - double maxDelta; - bool bConverged = false; - for (iMLnni = 0; iMLnni < MLnniToDo; iMLnni++) { - int changes = NNI(/*IN/OUT*/NJ, iMLnni, MLnniToDo, /*use ml*/true, /*IN/OUT*/nni_stats, /*OUT*/&maxDelta); - LogTree("ML_NNI%d",iMLnni+1, fpLog, NJ, aln->names, unique, bQuote); - double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); - bool bConvergedHere = (iMLnni > 0) && ((loglk < lastloglk + treeLogLkDelta) || maxDelta < treeLogLkDelta); - if (verbose) - fprintf(stderr, "ML-NNI round %d: LogLk %s= %.3f NNIs %d max delta %.2f Time %.2f%s\n", - iMLnni+1, - exactML || nCodes != 20 ? "" : "~", - loglk, changes, maxDelta, clockDiff(&clock_start), - bConverged ? " (final)" : ""); - if (fpLog) - fprintf(fpLog, "TreeLogLk\tML_NNI%d\t%.4lf\tMaxChange\t%.4lf\n", iMLnni+1, loglk, maxDelta); - if (bConverged) - break; /* we did our extra round */ - if (bConvergedHere) - bConverged = true; - if (bConverged || iMLnni == MLnniToDo-2) { - /* last round uses high-accuracy seettings -- reset NNI stats to tone down heuristics */ - nni_stats = FreeNNIStats(nni_stats, NJ); - nni_stats = InitNNIStats(NJ); - if (verbose) - fprintf(stderr, "Turning off heuristics for final round of ML NNIs%s\n", - bConvergedHere? " (converged)" : ""); - if (fpLog) - fprintf(fpLog, "Turning off heuristics for final round of ML NNIs%s\n", - bConvergedHere? " (converged)" : ""); - } - lastloglk = loglk; - if (iMLnni == 0 && NJ->rates.nRateCategories == 1) { - if (resetGtr) - SetMLGtr(/*IN/OUT*/NJ, bUseGtrFreq ? 
gtrfreq : NULL, fpLog); - SetMLRates(/*IN/OUT*/NJ, nRateCats); - LogMLRates(fpLog, NJ); - } - } - nni_stats = FreeNNIStats(nni_stats, NJ); - - /* This does not take long and improves the results */ - if (MLnniToDo > 0) { - OptimizeAllBranchLengths(/*IN/OUT*/NJ); - LogTree("ML_Lengths%d",2, fpLog, NJ, aln->names, unique, bQuote); - if (verbose || fpLog) { - double loglk = TreeLogLk(NJ, /*site_likelihoods*/NULL); - if (verbose) - fprintf(stderr, "Optimize all lengths: LogLk %s= %.3f Time %.2f\n", - exactML || nCodes != 20 ? "" : "~", - loglk, - clockDiff(&clock_start)); - if (fpLog) { - fprintf(fpLog, "TreeLogLk\tML_Lengths%d\t%.4f\n", 2, loglk); - fflush(fpLog); - } - } - } - - /* Count bad splits and compute SH-like supports if desired */ - if ((MLnniToDo > 0 && !fastest) || nBootstrap > 0) - TestSplitsML(NJ, /*OUT*/&splitcount, nBootstrap); - - /* Compute gamma-based likelihood? */ - if (gammaLogLk && nRateCats > 1) { - numeric_t *rates = MLSiteRates(nRateCats); - double *site_loglk = MLSiteLikelihoodsByRate(NJ, rates, nRateCats); - double scale = RescaleGammaLogLk(NJ->nPos, nRateCats, rates, /*IN*/site_loglk, /*OPTIONAL*/fpLog); - rates = myfree(rates, sizeof(numeric_t) * nRateCats); - site_loglk = myfree(site_loglk, sizeof(double) * nRateCats * NJ->nPos); - - for (i = 0; i < NJ->maxnodes; i++) - NJ->branchlength[i] *= scale; - } - } else { - /* Minimum evolution supports */ - TestSplitsMinEvo(NJ, /*OUT*/&splitcount); - if (nBootstrap > 0) - ReliabilityNJ(NJ, nBootstrap); - } - - for (i = 0; i < nFPs; i++) { - FILE *fp = fps[i]; - fprintf(fp, "Total time: %.2f seconds Unique: %d/%d Bad splits: %d/%d", - clockDiff(&clock_start), - NJ->nSeq, aln->nSeq, - splitcount.nBadSplits, splitcount.nSplits); - if (splitcount.dWorstDeltaUnconstrained > 0) - fprintf(fp, " Worst %sdelta-%s %.3f", - uniqConstraints != NULL ? "unconstrained " : "", - (MLnniToDo > 0 || MLlen) ? 
"LogLk" : "Len", - splitcount.dWorstDeltaUnconstrained); - fprintf(fp,"\n"); - if (NJ->nSeq > 3 && NJ->nConstraints > 0) { - fprintf(fp, "Violating constraints: %d both bad: %d", - splitcount.nConstraintViolations, splitcount.nBadBoth); - if (splitcount.dWorstDeltaConstrained > 0) - fprintf(fp, " Worst delta-%s due to constraints: %.3f", - (MLnniToDo > 0 || MLlen) ? "LogLk" : "Len", - splitcount.dWorstDeltaConstrained); - fprintf(fp,"\n"); - } - if (verbose > 1 || fp == fpLog) { - double dN2 = NJ->nSeq*(double)NJ->nSeq; - fprintf(fp, "Dist/N**2: by-profile %.3f (out %.3f) by-leaf %.3f avg-prof %.3f\n", - profileOps/dN2, outprofileOps/dN2, seqOps/dN2, profileAvgOps/dN2); - if (nCloseUsed>0 || nClose2Used > 0 || nRefreshTopHits>0) - fprintf(fp, "Top hits: close neighbors %ld/%d 2nd-level %ld refreshes %ld", - nCloseUsed, NJ->nSeq, nClose2Used, nRefreshTopHits); - if(!slow) fprintf(fp, " Hill-climb: %ld Update-best: %ld\n", nHillBetter, nVisibleUpdate); - if (nniToDo > 0 || spr > 0 || MLnniToDo > 0) - fprintf(fp, "NNI: %ld SPR: %ld ML-NNI: %ld\n", nNNI, nSPR, nML_NNI); - if (MLnniToDo > 0) { - fprintf(fp, "Max-lk operations: lk %ld posterior %ld", nLkCompute, nPosteriorCompute); - if (nAAPosteriorExact > 0 || nAAPosteriorRough > 0) - fprintf(fp, " approximate-posteriors %.2f%%", - (100.0*nAAPosteriorRough)/(double)(nAAPosteriorExact+nAAPosteriorRough)); - if (mlAccuracy < 2) - fprintf(fp, " star-only %ld", nStarTests); - fprintf(fp, "\n"); - } - } -#ifdef TRACK_MEMORY - fprintf(fp, "Memory: %.2f MB (%.1f byte/pos) ", - maxmallocHeap/1.0e6, maxmallocHeap/(double)(aln->nSeq*(double)aln->nPos)); - /* Only report numbers from before we do reliability estimates */ - fprintf(fp, "profile-freq-alloc %ld avoided %.2f%%\n", - svProfileFreqAlloc, - svProfileFreqAvoid > 0 ? 
- 100.0*svProfileFreqAvoid/(double)(svProfileFreqAlloc+svProfileFreqAvoid) - : 0); -#endif - fflush(fp); - } - PrintNJ(fpOut, NJ, aln->names, unique, /*support*/nBootstrap > 0, bQuote); - fflush(fpOut); - if (fpLog) { - fprintf(fpLog,"TreeCompleted\n"); - fflush(fpLog); - } - FreeNJ(NJ); - if (uniqConstraints != NULL) - uniqConstraints = myfree(uniqConstraints, sizeof(char*) * unique->nUnique); - constraints = FreeAlignment(constraints); - unique = FreeUniquify(unique); - } /* end build tree */ - hashnames = FreeHashtable(hashnames); - aln = FreeAlignment(aln); - } /* end loop over alignments */ - if (fpLog != NULL) - fclose(fpLog); - if (fpOut != stdout) fclose(fpOut); - exit(0); -} - -void ProgressReport(char *format, int i1, int i2, int i3, int i4) { - static bool time_set = false; - static struct timeval time_last; - static struct timeval time_begin; - - if (!showProgress) - return; - - static struct timeval time_now; - gettimeofday(&time_now,NULL); - if (!time_set) { - time_begin = time_last = time_now; - time_set = true; - } - static struct timeval elapsed; - timeval_subtract(&elapsed,&time_now,&time_last); - - if (elapsed.tv_sec > 1 || elapsed.tv_usec > 100*1000 || verbose > 1) { - timeval_subtract(&elapsed,&time_now,&time_begin); - fprintf(stderr, "%7i.%2.2i seconds: ", (int)elapsed.tv_sec, (int)(elapsed.tv_usec/10000)); - fprintf(stderr, format, i1, i2, i3, i4); - if (verbose > 1 || !isatty(STDERR_FILENO)) { - fprintf(stderr, "\n"); - } else { - fprintf(stderr, " \r"); - } - fflush(stderr); - time_last = time_now; - } -} - -void LogMLRates(/*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ) { - if (fpLog != NULL) { - rates_t *rates = &NJ->rates; - fprintf(fpLog, "NCategories\t%d\nRates",rates->nRateCategories); - assert(rates->nRateCategories > 0); - int iRate; - for (iRate = 0; iRate < rates->nRateCategories; iRate++) - fprintf(fpLog, " %f", rates->rates[iRate]); - fprintf(fpLog,"\nSiteCategories"); - int iPos; - for (iPos = 0; iPos < NJ->nPos; iPos++) { - iRate = 
rates->ratecat[iPos]; - fprintf(fpLog," %d",iRate+1); - } - fprintf(fpLog,"\n"); - fflush(fpLog); - } -} - -void LogTree(char *format, int i, /*OPTIONAL WRITE*/FILE *fpLog, NJ_t *NJ, char **names, uniquify_t *unique, bool bQuote) { - if(fpLog != NULL) { - fprintf(fpLog, format, i); - fprintf(fpLog, "\t"); - PrintNJ(fpLog, NJ, names, unique, /*support*/false, bQuote); - fflush(fpLog); - } -} - -NJ_t *InitNJ(char **sequences, int nSeq, int nPos, - /*OPTIONAL*/char **constraintSeqs, int nConstraints, - /*OPTIONAL*/distance_matrix_t *distance_matrix, - /*OPTIONAL*/transition_matrix_t *transmat) { - int iNode; - - NJ_t *NJ = (NJ_t*)mymalloc(sizeof(NJ_t)); - NJ->root = -1; /* set at end of FastNJ() */ - NJ->maxnode = NJ->nSeq = nSeq; - NJ->nPos = nPos; - NJ->maxnodes = 2*nSeq; - NJ->seqs = sequences; - NJ->distance_matrix = distance_matrix; - NJ->transmat = transmat; - NJ->nConstraints = nConstraints; - NJ->constraintSeqs = constraintSeqs; - - NJ->profiles = (profile_t **)mymalloc(sizeof(profile_t*) * NJ->maxnodes); - - unsigned long counts[256]; - int i; - for (i = 0; i < 256; i++) - counts[i] = 0; - for (iNode = 0; iNode < NJ->nSeq; iNode++) { - NJ->profiles[iNode] = SeqToProfile(NJ, NJ->seqs[iNode], nPos, - constraintSeqs != NULL ? constraintSeqs[iNode] : NULL, - nConstraints, - iNode, - /*IN/OUT*/counts); - } - unsigned long totCount = 0; - for (i = 0; i < 256; i++) - totCount += counts[i]; - - /* warnings about unknown characters */ - for (i = 0; i < 256; i++) { - if (counts[i] == 0 || i == '.' 
|| i == '-') - continue; - unsigned char *codesP; - bool bMatched = false; - for (codesP = codesString; *codesP != '\0'; codesP++) { - if (*codesP == i || tolower(*codesP) == i) { - bMatched = true; - break; - } - } - if (!bMatched) - fprintf(stderr, "Ignored unknown character %c (seen %lu times)\n", i, counts[i]); - } - - - /* warnings about the counts */ - double fACGTUN = (counts['A'] + counts['C'] + counts['G'] + counts['T'] + counts['U'] + counts['N'] - + counts['a'] + counts['c'] + counts['g'] + counts['t'] + counts['u'] + counts['n']) - / (double)(totCount - counts['-'] - counts['.']); - if (nCodes == 4 && fACGTUN < 0.9) - fprintf(stderr, "WARNING! ONLY %.1f%% NUCLEOTIDE CHARACTERS -- IS THIS REALLY A NUCLEOTIDE ALIGNMENT?\n", - 100.0 * fACGTUN); - else if (nCodes == 20 && fACGTUN >= 0.9) - fprintf(stderr, "WARNING! %.1f%% NUCLEOTIDE CHARACTERS -- IS THIS REALLY A PROTEIN ALIGNMENT?\n", - 100.0 * fACGTUN); - - if(verbose>10) fprintf(stderr,"Made sequence profiles\n"); - for (iNode = NJ->nSeq; iNode < NJ->maxnodes; iNode++) - NJ->profiles[iNode] = NULL; /* not yet exists */ - - NJ->outprofile = OutProfile(NJ->profiles, NJ->nSeq, - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix); - if(verbose>10) fprintf(stderr,"Made out-profile\n"); - - NJ->totdiam = 0.0; - - NJ->diameter = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->diameter[iNode] = 0; - - NJ->varDiameter = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->varDiameter[iNode] = 0; - - NJ->selfdist = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->selfdist[iNode] = 0; - - NJ->selfweight = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->nSeq; iNode++) - NJ->selfweight[iNode] = NJ->nPos - NGaps(NJ,iNode); - - NJ->outDistances = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - 
NJ->nOutDistActive = (int *)mymalloc(sizeof(int)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) - NJ->nOutDistActive[iNode] = NJ->nSeq * 10; /* unreasonably high value */ - NJ->parent = NULL; /* so SetOutDistance ignores it */ - for (iNode = 0; iNode < NJ->nSeq; iNode++) - SetOutDistance(/*IN/UPDATE*/NJ, iNode, /*nActive*/NJ->nSeq); - - if (verbose>2) { - for (iNode = 0; iNode < 4 && iNode < NJ->nSeq; iNode++) - fprintf(stderr, "Node %d outdist %f\n", iNode, NJ->outDistances[iNode]); - } - - NJ->parent = (int *)mymalloc(sizeof(int)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->parent[iNode] = -1; - - NJ->branchlength = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); /* distance to parent */ - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->branchlength[iNode] = 0; - - NJ->support = (numeric_t *)mymalloc(sizeof(numeric_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->maxnodes; iNode++) NJ->support[iNode] = -1.0; - - NJ->child = (children_t*)mymalloc(sizeof(children_t)*NJ->maxnodes); - for (iNode= 0; iNode < NJ->maxnode; iNode++) NJ->child[iNode].nChild = 0; - - NJ->rates.nRateCategories = 0; - NJ->rates.rates = NULL; - NJ->rates.ratecat = NULL; - AllocRateCategories(&NJ->rates, 1, NJ->nPos); - return(NJ); -} - -NJ_t *FreeNJ(NJ_t *NJ) { - if (NJ==NULL) - return(NJ); - - int i; - for (i=0; i < NJ->maxnode; i++) - NJ->profiles[i] = FreeProfile(NJ->profiles[i], NJ->nPos, NJ->nConstraints); - NJ->profiles = myfree(NJ->profiles, sizeof(profile_t*) * NJ->maxnodes); - NJ->outprofile = FreeProfile(NJ->outprofile, NJ->nPos, NJ->nConstraints); - NJ->diameter = myfree(NJ->diameter, sizeof(numeric_t)*NJ->maxnodes); - NJ->varDiameter = myfree(NJ->varDiameter, sizeof(numeric_t)*NJ->maxnodes); - NJ->selfdist = myfree(NJ->selfdist, sizeof(numeric_t)*NJ->maxnodes); - NJ->selfweight = myfree(NJ->selfweight, sizeof(numeric_t)*NJ->maxnodes); - NJ->outDistances = myfree(NJ->outDistances, sizeof(numeric_t)*NJ->maxnodes); - NJ->nOutDistActive 
= myfree(NJ->nOutDistActive, sizeof(int)*NJ->maxnodes); - NJ->parent = myfree(NJ->parent, sizeof(int)*NJ->maxnodes); - NJ->branchlength = myfree(NJ->branchlength, sizeof(numeric_t)*NJ->maxnodes); - NJ->support = myfree(NJ->support, sizeof(numeric_t)*NJ->maxnodes); - NJ->child = myfree(NJ->child, sizeof(children_t)*NJ->maxnodes); - NJ->transmat = myfree(NJ->transmat, sizeof(transition_matrix_t)); - AllocRateCategories(&NJ->rates, 0, NJ->nPos); - return(myfree(NJ, sizeof(NJ_t))); -} - -/* Allocate or reallocate the rate categories, and set every position - to category 0 and every category's rate to 1.0 - If nRateCategories=0, just deallocate -*/ -void AllocRateCategories(/*IN/OUT*/rates_t *rates, int nRateCategories, int nPos) { - assert(nRateCategories >= 0); - rates->rates = myfree(rates->rates, sizeof(numeric_t)*rates->nRateCategories); - rates->ratecat = myfree(rates->ratecat, sizeof(unsigned int)*nPos); - rates->nRateCategories = nRateCategories; - if (rates->nRateCategories > 0) { - rates->rates = (numeric_t*)mymalloc(sizeof(numeric_t)*rates->nRateCategories); - int i; - for (i = 0; i < nRateCategories; i++) - rates->rates[i] = 1.0; - rates->ratecat = (unsigned int *)mymalloc(sizeof(unsigned int)*nPos); - for (i = 0; i < nPos; i++) - rates->ratecat[i] = 0; - } -} - -void FastNJ(NJ_t *NJ) { - int iNode; - - assert(NJ->nSeq >= 1); - if (NJ->nSeq < 3) { - NJ->root = NJ->maxnode++; - NJ->child[NJ->root].nChild = NJ->nSeq; - for (iNode = 0; iNode < NJ->nSeq; iNode++) { - NJ->parent[iNode] = NJ->root; - NJ->child[NJ->root].child[iNode] = iNode; - } - if (NJ->nSeq == 1) { - NJ->branchlength[0] = 0; - } else { - assert (NJ->nSeq == 2); - besthit_t hit; - SeqDist(NJ->profiles[0]->codes,NJ->profiles[1]->codes,NJ->nPos,NJ->distance_matrix,/*OUT*/&hit); - NJ->branchlength[0] = hit.dist/2.0; - NJ->branchlength[1] = hit.dist/2.0; - } - return; - } - - /* else 3 or more sequences */ - - /* The visible set stores the best hit of each node (unless using top hits, in which case 
- it is handled by the top hits routines) */ - besthit_t *visible = NULL; /* Not used if doing top hits */ - besthit_t *besthitNew = NULL; /* All hits of new node -- not used if doing top-hits */ - - /* The top-hits lists, with the key parameter m = length of each top-hit list */ - top_hits_t *tophits = NULL; - int m = 0; /* maximum length of a top-hits list */ - if (tophitsMult > 0) { - m = (int)(0.5 + tophitsMult*sqrt(NJ->nSeq)); - if(m<4 || 2*m >= NJ->nSeq) { - m=0; - if(verbose>1) fprintf(stderr,"Too few leaves, turning off top-hits\n"); - } else { - if(verbose>2) fprintf(stderr,"Top-hit-list size = %d of %d\n", m, NJ->nSeq); - } - } - assert(!(slow && m>0)); - - /* Initialize top-hits or visible set */ - if (m>0) { - tophits = InitTopHits(NJ, m); - SetAllLeafTopHits(/*IN/UPDATE*/NJ, /*OUT*/tophits); - ResetTopVisible(/*IN/UPDATE*/NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/tophits); - } else if (!slow) { - visible = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnodes); - besthitNew = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnodes); - for (iNode = 0; iNode < NJ->nSeq; iNode++) - SetBestHit(iNode, NJ, /*nActive*/NJ->nSeq, /*OUT*/&visible[iNode], /*OUT IGNORED*/NULL); - } - - /* Iterate over joins */ - int nActiveOutProfileReset = NJ->nSeq; - int nActive; - for (nActive = NJ->nSeq; nActive > 3; nActive--) { - int nJoinsDone = NJ->nSeq - nActive; - if (nJoinsDone > 0 && (nJoinsDone % 100) == 0) - ProgressReport("Joined %6d of %6d", nJoinsDone, NJ->nSeq-3, 0, 0); - - besthit_t join; /* the join to do */ - if (slow) { - ExhaustiveNJSearch(NJ,nActive,/*OUT*/&join); - } else if (m>0) { - TopHitNJSearch(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, /*OUT*/&join); - } else { - FastNJSearch(NJ, nActive, /*IN/OUT*/visible, /*OUT*/&join); - } - - if (verbose>2) { - double penalty = constraintWeight - * (double)JoinConstraintPenalty(NJ, join.i, join.j); - if (penalty > 0.001) { - fprintf(stderr, "Constraint violation during neighbor-joining %d %d into %d penalty %.3f\n", - 
join.i, join.j, NJ->maxnode, penalty); - int iC; - for (iC = 0; iC < NJ->nConstraints; iC++) { - int local = JoinConstraintPenaltyPiece(NJ, join.i, join.j, iC); - if (local > 0) - fprintf(stderr, "Constraint %d piece %d %d/%d %d/%d %d/%d\n", iC, local, - NJ->profiles[join.i]->nOn[iC], - NJ->profiles[join.i]->nOff[iC], - NJ->profiles[join.j]->nOn[iC], - NJ->profiles[join.j]->nOff[iC], - NJ->outprofile->nOn[iC] - NJ->profiles[join.i]->nOn[iC] - NJ->profiles[join.j]->nOn[iC], - NJ->outprofile->nOff[iC] - NJ->profiles[join.i]->nOff[iC] - NJ->profiles[join.j]->nOff[iC]); - } - } - } - - /* because of the stale out-distance heuristic, make sure that these are up-to-date */ - SetOutDistance(NJ, join.i, nActive); - SetOutDistance(NJ, join.j, nActive); - /* Make sure weight is set and criterion is up to date */ - SetDistCriterion(NJ, nActive, /*IN/OUT*/&join); - assert(NJ->nOutDistActive[join.i] == nActive); - assert(NJ->nOutDistActive[join.j] == nActive); - - int newnode = NJ->maxnode++; - NJ->parent[join.i] = newnode; - NJ->parent[join.j] = newnode; - NJ->child[newnode].nChild = 2; - NJ->child[newnode].child[0] = join.i < join.j ? join.i : join.j; - NJ->child[newnode].child[1] = join.i > join.j ? 
join.i : join.j; - - double rawIJ = join.dist + NJ->diameter[join.i] + NJ->diameter[join.j]; - double distIJ = join.dist; - - double deltaDist = (NJ->outDistances[join.i]-NJ->outDistances[join.j])/(double)(nActive-2); - NJ->branchlength[join.i] = (distIJ + deltaDist)/2; - NJ->branchlength[join.j] = (distIJ - deltaDist)/2; - - double bionjWeight = 0.5; /* IJ = bionjWeight*I + (1-bionjWeight)*J */ - double varIJ = rawIJ - NJ->varDiameter[join.i] - NJ->varDiameter[join.j]; - - if (bionj && join.weight > 0.01 && varIJ > 0.001) { - /* Set bionjWeight according to the BIONJ formula, where - the variance matrix is approximated by - - Vij = ProfileVar(i,j) - varDiameter(i) - varDiameter(j) - ProfileVar(i,j) = distance(i,j) = top(i,j)/weight(i,j) - - (The node's distance diameter does not affect the variances.) - - The BIONJ formula is equation 9 from Gascuel 1997: - - bionjWeight = 1/2 + sum(k!=i,j) (Vjk - Vik) / ((nActive-2)*Vij) - sum(k!=i,j) (Vjk - Vik) = sum(k!=i,j) Vik - varDiameter(j) + varDiameter(i) - = sum(k!=i,j) ProfileVar(j,k) - sum(k!=i,j) ProfileVar(i,k) + (nActive-2)*(varDiameter(i)-varDiameter(j)) - - sum(k!=i,j) ProfileVar(i,k) - ~= (sum(k!=i,j) distance(i,k) * weight(i,k))/(mean(k!=i,j) weight(i,k)) - ~= (N-2) * top(i, Out-i-j) / weight(i, Out-i-j) - - weight(i, Out-i-j) = N*weight(i,Out) - weight(i,i) - weight(i,j) - top(i, Out-i-j) = N*top(i,Out) - top(i,i) - top(i,j) - */ - besthit_t outI; - besthit_t outJ; - ProfileDist(NJ->profiles[join.i],NJ->outprofile,NJ->nPos,NJ->distance_matrix,/*OUT*/&outI); - ProfileDist(NJ->profiles[join.j],NJ->outprofile,NJ->nPos,NJ->distance_matrix,/*OUT*/&outJ); - outprofileOps += 2; - - double varIWeight = (nActive * outI.weight - NJ->selfweight[join.i] - join.weight); - double varJWeight = (nActive * outJ.weight - NJ->selfweight[join.j] - join.weight); - - double varITop = outI.dist * outI.weight * nActive - - NJ->selfdist[join.i] * NJ->selfweight[join.i] - rawIJ * join.weight; - double varJTop = outJ.dist * outJ.weight 
* nActive - - NJ->selfdist[join.j] * NJ->selfweight[join.j] - rawIJ * join.weight; - - double deltaProfileVarOut = (nActive-2) * (varJTop/varJWeight - varITop/varIWeight); - double deltaVarDiam = (nActive-2)*(NJ->varDiameter[join.i] - NJ->varDiameter[join.j]); - if (varJWeight > 0.01 && varIWeight > 0.01) - bionjWeight = 0.5 + (deltaProfileVarOut+deltaVarDiam)/(2*(nActive-2)*varIJ); - if(bionjWeight<0) bionjWeight=0; - if(bionjWeight>1) bionjWeight=1; - if (verbose>2) fprintf(stderr,"dVarO %f dVarDiam %f varIJ %f from dist %f weight %f (pos %d) bionjWeight %f %f\n", - deltaProfileVarOut, deltaVarDiam, - varIJ, join.dist, join.weight, NJ->nPos, - bionjWeight, 1-bionjWeight); - if (verbose>3 && (newnode%5) == 0) { - /* Compare weight estimated from outprofiles from weight made by summing over other nodes */ - double deltaProfileVarTot = 0; - for (iNode = 0; iNode < newnode; iNode++) { - if (NJ->parent[iNode] < 0) { /* excludes join.i, join.j */ - besthit_t di, dj; - ProfileDist(NJ->profiles[join.i],NJ->profiles[iNode],NJ->nPos,NJ->distance_matrix,/*OUT*/&di); - ProfileDist(NJ->profiles[join.j],NJ->profiles[iNode],NJ->nPos,NJ->distance_matrix,/*OUT*/&dj); - deltaProfileVarTot += dj.dist - di.dist; - } - } - double lambdaTot = 0.5 + (deltaProfileVarTot+deltaVarDiam)/(2*(nActive-2)*varIJ); - if (lambdaTot < 0) lambdaTot = 0; - if (lambdaTot > 1) lambdaTot = 1; - if (fabs(bionjWeight-lambdaTot) > 0.01 || verbose > 4) - fprintf(stderr, "deltaProfileVar actual %.6f estimated %.6f lambda actual %.3f estimated %.3f\n", - deltaProfileVarTot,deltaProfileVarOut,lambdaTot,bionjWeight); - } - } - if (verbose > 2) fprintf(stderr, "Join\t%d\t%d\t%.6f\tlambda\t%.6f\tselfw\t%.3f\t%.3f\tnew\t%d\n", - join.i < join.j ? join.i : join.j, - join.i < join.j ? join.j : join.i, - join.criterion, bionjWeight, - NJ->selfweight[join.i < join.j ? join.i : join.j], - NJ->selfweight[join.i < join.j ? 
join.j : join.i], - newnode); - - NJ->diameter[newnode] = bionjWeight * (NJ->branchlength[join.i] + NJ->diameter[join.i]) - + (1-bionjWeight) * (NJ->branchlength[join.j] + NJ->diameter[join.j]); - NJ->varDiameter[newnode] = bionjWeight * NJ->varDiameter[join.i] - + (1-bionjWeight) * NJ->varDiameter[join.j] - + bionjWeight * (1-bionjWeight) * varIJ; - - NJ->profiles[newnode] = AverageProfile(NJ->profiles[join.i],NJ->profiles[join.j], - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix, - bionj ? bionjWeight : /*noweight*/-1.0); - - /* Update out-distances and total diameters */ - int changedActiveOutProfile = nActiveOutProfileReset - (nActive-1); - if (changedActiveOutProfile >= nResetOutProfile - && changedActiveOutProfile >= fResetOutProfile * nActiveOutProfileReset) { - /* Recompute the outprofile from scratch to avoid roundoff error */ - profile_t **activeProfiles = (profile_t**)mymalloc(sizeof(profile_t*)*(nActive-1)); - int nSaved = 0; - NJ->totdiam = 0; - for (iNode=0;iNode<NJ->maxnode;iNode++) { - if (NJ->parent[iNode]<0) { - assert(nSaved < nActive-1); - activeProfiles[nSaved++] = NJ->profiles[iNode]; - NJ->totdiam += NJ->diameter[iNode]; - } - } - assert(nSaved==nActive-1); - FreeProfile(NJ->outprofile, NJ->nPos, NJ->nConstraints); - if(verbose>2) fprintf(stderr,"Recomputing outprofile %d %d\n",nActiveOutProfileReset,nActive-1); - NJ->outprofile = OutProfile(activeProfiles, nSaved, - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix); - activeProfiles = myfree(activeProfiles, sizeof(profile_t*)*(nActive-1)); - nActiveOutProfileReset = nActive-1; - } else { - UpdateOutProfile(/*OUT*/NJ->outprofile, - NJ->profiles[join.i], NJ->profiles[join.j], NJ->profiles[newnode], - nActive, - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix); - NJ->totdiam += NJ->diameter[newnode] - NJ->diameter[join.i] - NJ->diameter[join.j]; - } - - /* Store self-dist for use in other computations */ - besthit_t selfdist; - 
ProfileDist(NJ->profiles[newnode],NJ->profiles[newnode],NJ->nPos,NJ->distance_matrix,/*OUT*/&selfdist); - NJ->selfdist[newnode] = selfdist.dist; - NJ->selfweight[newnode] = selfdist.weight; - - /* Find the best hit of the joined node IJ */ - if (m>0) { - TopHitJoin(newnode, /*IN/UPDATE*/NJ, nActive-1, /*IN/OUT*/tophits); - } else { - /* Not using top-hits, so we update all out-distances */ - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - if (NJ->parent[iNode] < 0) { - /* True nActive is now nActive-1 */ - SetOutDistance(/*IN/UPDATE*/NJ, iNode, nActive-1); - } - } - - if(visible != NULL) { - SetBestHit(newnode, NJ, nActive-1, /*OUT*/&visible[newnode], /*OUT OPTIONAL*/besthitNew); - if (verbose>2) - fprintf(stderr,"Visible %d %d %f %f\n", - visible[newnode].i, visible[newnode].j, - visible[newnode].dist, visible[newnode].criterion); - if (besthitNew != NULL) { - /* Use distances to new node to update visible set entries that are non-optimal */ - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - if (NJ->parent[iNode] >= 0 || iNode == newnode) - continue; - int iOldVisible = visible[iNode].j; - assert(iOldVisible>=0); - assert(visible[iNode].i == iNode); - - /* Update the criterion; use nActive-1 because haven't decremented nActive yet */ - if (NJ->parent[iOldVisible] < 0) - SetCriterion(/*IN/OUT*/NJ, nActive-1, &visible[iNode]); - - if (NJ->parent[iOldVisible] >= 0 - || besthitNew[iNode].criterion < visible[iNode].criterion) { - if(verbose>3) fprintf(stderr,"Visible %d reset from %d to %d (%f vs. 
%f)\n", - iNode, iOldVisible, - newnode, visible[iNode].criterion, besthitNew[iNode].criterion); - if(NJ->parent[iOldVisible] < 0) nVisibleUpdate++; - visible[iNode].j = newnode; - visible[iNode].dist = besthitNew[iNode].dist; - visible[iNode].criterion = besthitNew[iNode].criterion; - } - } /* end loop over all nodes */ - } /* end if recording all hits of new node */ - } /* end if keeping a visible set */ - } /* end else (m==0) */ - } /* end loop over nActive */ - -#ifdef TRACK_MEMORY - if (verbose>1) { - struct mallinfo mi = mallinfo(); - fprintf(stderr, "Memory @ end of FastNJ(): %.2f MB (%.1f byte/pos) useful %.2f expected %.2f\n", - (mi.arena+mi.hblkhd)/1.0e6, (mi.arena+mi.hblkhd)/(double)(NJ->nSeq*(double)NJ->nPos), - mi.uordblks/1.0e6, mymallocUsed/1e6); - } -#endif - - /* We no longer need the tophits, visible set, etc. */ - if (visible != NULL) visible = myfree(visible,sizeof(besthit_t)*NJ->maxnodes); - if (besthitNew != NULL) besthitNew = myfree(besthitNew,sizeof(besthit_t)*NJ->maxnodes); - tophits = FreeTopHits(tophits); - - /* Add a root for the 3 remaining nodes */ - int top[3]; - int nTop = 0; - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - if (NJ->parent[iNode] < 0) { - assert(nTop <= 2); - top[nTop++] = iNode; - } - } - assert(nTop==3); - - NJ->root = NJ->maxnode++; - NJ->child[NJ->root].nChild = 3; - for (nTop = 0; nTop < 3; nTop++) { - NJ->parent[top[nTop]] = NJ->root; - NJ->child[NJ->root].child[nTop] = top[nTop]; - } - - besthit_t dist01, dist02, dist12; - ProfileDist(NJ->profiles[top[0]], NJ->profiles[top[1]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist01); - ProfileDist(NJ->profiles[top[0]], NJ->profiles[top[2]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist02); - ProfileDist(NJ->profiles[top[1]], NJ->profiles[top[2]], NJ->nPos, NJ->distance_matrix, /*OUT*/&dist12); - - double d01 = dist01.dist - NJ->diameter[top[0]] - NJ->diameter[top[1]]; - double d02 = dist02.dist - NJ->diameter[top[0]] - NJ->diameter[top[2]]; - double d12 = dist12.dist - 
NJ->diameter[top[1]] - NJ->diameter[top[2]]; - NJ->branchlength[top[0]] = (d01 + d02 - d12)/2; - NJ->branchlength[top[1]] = (d01 + d12 - d02)/2; - NJ->branchlength[top[2]] = (d02 + d12 - d01)/2; - - /* Check how accurate the outprofile is */ - if (verbose>2) { - profile_t *p[3] = {NJ->profiles[top[0]], NJ->profiles[top[1]], NJ->profiles[top[2]]}; - profile_t *out = OutProfile(p, 3, NJ->nPos, NJ->nConstraints, NJ->distance_matrix); - int i; - double freqerror = 0; - double weighterror = 0; - for (i=0;i<NJ->nPos;i++) { - weighterror += fabs(out->weights[i] - NJ->outprofile->weights[i]); - int k; - for(k=0;k<nCodes;k++) - freqerror += fabs(out->vectors[nCodes*i+k] - NJ->outprofile->vectors[nCodes*i+k]); - } - fprintf(stderr,"Roundoff error in outprofile@end: WeightError %f FreqError %f\n", weighterror, freqerror); - FreeProfile(out, NJ->nPos, NJ->nConstraints); - } - return; -} - -void ExhaustiveNJSearch(NJ_t *NJ, int nActive, /*OUT*/besthit_t *join) { - join->i = -1; - join->j = -1; - join->weight = 0; - join->dist = 1e20; - join->criterion = 1e20; - double bestCriterion = 1e20; - - int i, j; - for (i = 0; i < NJ->maxnode-1; i++) { - if (NJ->parent[i] < 0) { - for (j = i+1; j < NJ->maxnode; j++) { - if (NJ->parent[j] < 0) { - besthit_t hit; - hit.i = i; - hit.j = j; - SetDistCriterion(NJ, nActive, /*IN/OUT*/&hit); - if (hit.criterion < bestCriterion) { - *join = hit; - bestCriterion = hit.criterion; - } - } - } - } - } - assert (join->i >= 0 && join->j >= 0); -} - -void FastNJSearch(NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *besthits, /*OUT*/besthit_t *join) { - join->i = -1; - join->j = -1; - join->dist = 1e20; - join->weight = 0; - join->criterion = 1e20; - int iNode; - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - int jNode = besthits[iNode].j; - if (NJ->parent[iNode] < 0 && NJ->parent[jNode] < 0) { /* both i and j still active */ - /* recompute criterion to reflect the current out-distances */ - SetCriterion(NJ, nActive, /*IN/OUT*/&besthits[iNode]); - if 
(besthits[iNode].criterion < join->criterion) - *join = besthits[iNode]; - } - } - - if(!fastest) { - int changed; - do { - changed = 0; - assert(join->i >= 0 && join->j >= 0); - SetBestHit(join->i, NJ, nActive, /*OUT*/&besthits[join->i], /*OUT IGNORED*/NULL); - if (besthits[join->i].j != join->j) { - changed = 1; - if (verbose>2) - fprintf(stderr,"BetterI\t%d\t%d\t%d\t%d\t%f\t%f\n", - join->i,join->j,besthits[join->i].i,besthits[join->i].j, - join->criterion,besthits[join->i].criterion); - } - - /* Save the best hit either way, because the out-distance has probably changed - since we started the computation. */ - join->j = besthits[join->i].j; - join->weight = besthits[join->i].weight; - join->dist = besthits[join->i].dist; - join->criterion = besthits[join->i].criterion; - - SetBestHit(join->j, NJ, nActive, /*OUT*/&besthits[join->j], /*OUT IGNORE*/NULL); - if (besthits[join->j].j != join->i) { - changed = 1; - if (verbose>2) - fprintf(stderr,"BetterJ\t%d\t%d\t%d\t%d\t%f\t%f\n", - join->i,join->j,besthits[join->j].i,besthits[join->j].j, - join->criterion,besthits[join->j].criterion); - join->i = besthits[join->j].j; - join->weight = besthits[join->j].weight; - join->dist = besthits[join->j].dist; - join->criterion = besthits[join->j].criterion; - } - if(changed) nHillBetter++; - } while(changed); - } -} - -/* A token is one of ():;, or an alphanumeric string without whitespace - Any whitespace between tokens is ignored */ -char *ReadTreeToken(FILE *fp) { - static char buf[BUFFER_SIZE]; - int len = 0; - int c; - for (c = fgetc(fp); c != EOF; c = fgetc(fp)) { - if (c == '(' || c == ')' || c == ':' || c == ';' || c == ',') { - /* standalone token */ - if (len == 0) { - buf[len++] = c; - buf[len] = '\0'; - return(buf); - } else { - ungetc(c, fp); - buf[len] = '\0'; - return(buf); - } - } else if (isspace(c)) { - if (len > 0) { - buf[len] = '\0'; - return(buf); - } - /* else ignore whitespace at beginning of token */ - } else { - /* not whitespace or standalone token 
*/ - buf[len++] = c; - if (len >= BUFFER_SIZE) { - buf[BUFFER_SIZE-1] = '\0'; - fprintf(stderr, "Token too long in tree file, token begins with\n%s\n", buf); - exit(1); - } - } - } - if (len > 0) { - /* return the token we have so far */ - buf[len] = '\0'; - return(buf); - } - /* else */ - return(NULL); -} - -void ReadTreeError(char *err, char *token) { - fprintf(stderr, "Tree parse error: unexpected token '%s' -- %s\n", - token == NULL ? "(End of file)" : token, - err); - exit(1); -} - -void ReadTreeAddChild(int parent, int child, /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children) { - assert(parent >= 0); - assert(child >= 0); - assert(parents[child] < 0); - assert(children[parent].nChild < 3); - parents[child] = parent; - children[parent].child[children[parent].nChild++] = child; -} - -void ReadTreeMaybeAddLeaf(int parent, char *name, - hashstrings_t *hashnames, uniquify_t *unique, - /*IN/OUT*/int *parents, /*IN/OUT*/children_t *children) { - hashiterator_t hi = FindMatch(hashnames,name); - if (HashCount(hashnames,hi) != 1) - ReadTreeError("not recognized as a sequence name", name); - - int iSeqNonunique = HashFirst(hashnames,hi); - assert(iSeqNonunique >= 0 && iSeqNonunique < unique->nSeq); - int iSeqUnique = unique->alnToUniq[iSeqNonunique]; - assert(iSeqUnique >= 0 && iSeqUnique < unique->nUnique); - /* Either record this leaves' parent (if it is -1) or ignore this leaf (if already seen) */ - if (parents[iSeqUnique] < 0) { - ReadTreeAddChild(parent, iSeqUnique, /*IN/OUT*/parents, /*IN/OUT*/children); - if(verbose > 5) - fprintf(stderr, "Found leaf uniq%d name %s child of %d\n", iSeqUnique, name, parent); - } else { - if (verbose > 5) - fprintf(stderr, "Skipped redundant leaf uniq%d name %s\n", iSeqUnique, name); - } -} - -void ReadTreeRemove(/*IN/OUT*/int *parents, /*IN/OUT*/children_t *children, int node) { - if(verbose > 5) - fprintf(stderr,"Removing node %d parent %d\n", node, parents[node]); - assert(parents[node] >= 0); - int parent = parents[node]; 
- parents[node] = -1; - children_t *pc = &children[parent]; - int oldn; - for (oldn = 0; oldn < pc->nChild; oldn++) { - if (pc->child[oldn] == node) - break; - } - assert(oldn < pc->nChild); - - /* move successor nodes back in child list and shorten list */ - int i; - for (i = oldn; i < pc->nChild-1; i++) - pc->child[i] = pc->child[i+1]; - pc->nChild--; - - /* add its children to parent's child list */ - children_t *nc = &children[node]; - if (nc->nChild > 0) { - assert(nc->nChild<=2); - assert(pc->nChild < 3); - assert(pc->nChild + nc->nChild <= 3); - int j; - for (j = 0; j < nc->nChild; j++) { - if(verbose > 5) - fprintf(stderr,"Repointing parent %d to child %d\n", parent, nc->child[j]); - pc->child[pc->nChild++] = nc->child[j]; - parents[nc->child[j]] = parent; - } - nc->nChild = 0; - } -} - -void ReadTree(/*IN/OUT*/NJ_t *NJ, - /*IN*/uniquify_t *unique, - /*IN*/hashstrings_t *hashnames, - /*READ*/FILE *fpInTree) { - assert(NJ->nSeq == unique->nUnique); - /* First, do a preliminary parse of the tree to with non-unique leaves ignored - We need to store this separately from NJ because it may have too many internal nodes - (matching sequences show up once in the NJ but could be in multiple places in the tree) - Will use iUnique as the index of nodes, as in the NJ structure - */ - int maxnodes = unique->nSeq*2; - int maxnode = unique->nSeq; - int *parent = (int*)mymalloc(sizeof(int)*maxnodes); - children_t *children = (children_t *)mymalloc(sizeof(children_t)*maxnodes); - int root = maxnode++; - int i; - for (i = 0; i < maxnodes; i++) { - parent[i] = -1; - children[i].nChild = 0; - } - - /* The stack is the current path to the root, with the root at the first (top) position */ - int stack_size = 1; - int *stack = (int*)mymalloc(sizeof(int)*maxnodes); - stack[0] = root; - int nDown = 0; - int nUp = 0; - - char *token; - token = ReadTreeToken(fpInTree); - if (token == NULL || *token != '(') - ReadTreeError("No '(' at start", token); - /* nDown is still 0 because we 
have created the root */ - - while ((token = ReadTreeToken(fpInTree)) != NULL) { - if (nDown > 0) { /* In a stream of parentheses */ - if (*token == '(') - nDown++; - else if (*token == ',' || *token == ';' || *token == ':' || *token == ')') - ReadTreeError("while reading parentheses", token); - else { - /* Add intermediate nodes if nDown was > 1 (for nDown=1, the only new node is the leaf) */ - while (nDown-- > 0) { - int new = maxnode++; - assert(new < maxnodes); - ReadTreeAddChild(stack[stack_size-1], new, /*IN/OUT*/parent, /*IN/OUT*/children); - if(verbose > 5) - fprintf(stderr, "Added internal child %d of %d, stack size increase to %d\n", - new, stack[stack_size-1],stack_size+1); - stack[stack_size++] = new; - assert(stack_size < maxnodes); - } - ReadTreeMaybeAddLeaf(stack[stack_size-1], token, - hashnames, unique, - /*IN/OUT*/parent, /*IN/OUT*/children); - } - } else if (nUp > 0) { - if (*token == ';') { /* end the tree? */ - if (nUp != stack_size) - ReadTreeError("unbalanced parentheses", token); - else - break; - } else if (*token == ')') - nUp++; - else if (*token == '(') - ReadTreeError("unexpected '(' after ')'", token); - else if (*token == ':') { - token = ReadTreeToken(fpInTree); - /* Read the branch length and ignore it */ - if (token == NULL || (*token != '-' && !isdigit(*token))) - ReadTreeError("not recognized as a branch length", token); - } else if (*token == ',') { - /* Go back up the stack the correct #times */ - while (nUp-- > 0) { - stack_size--; - if(verbose > 5) - fprintf(stderr, "Up to nUp=%d stack size %d at %d\n", - nUp, stack_size, stack[stack_size-1]); - if (stack_size <= 0) - ReadTreeError("too many ')'", token); - } - nUp = 0; - } else if (*token == '-' || isdigit(*token)) - ; /* ignore bootstrap value */ - else - fprintf(stderr, "Warning while parsing tree: non-numeric label %s for internal node\n", - token); - } else if (*token == '(') { - nDown = 1; - } else if (*token == ')') { - nUp = 1; - } else if (*token == ':') { - token = 
ReadTreeToken(fpInTree); - if (token == NULL || (*token != '-' && !isdigit(*token))) - ReadTreeError("not recognized as a branch length", token); - } else if (*token == ',') { - ; /* do nothing */ - } else if (*token == ';') - ReadTreeError("unexpected token", token); - else - ReadTreeMaybeAddLeaf(stack[stack_size-1], token, - hashnames, unique, - /*IN/OUT*/parent, /*IN/OUT*/children); - } - - /* Verify that all sequences were seen */ - for (i = 0; i < unique->nUnique; i++) { - if (parent[i] < 0) { - fprintf(stderr, "Alignment sequence %d (unique %d) absent from input tree\n" - "The starting tree (the argument to -intree) must include all sequences in the alignment!\n", - unique->uniqueFirst[i], i); - exit(1); - } - } - - /* Simplify the tree -- remove all internal nodes with < 2 children - Keep trying until no nodes get removed - */ - int nRemoved; - do { - nRemoved = 0; - /* Here stack is the list of nodes we haven't visited yet while doing - a tree traversal */ - stack_size = 1; - stack[0] = root; - while (stack_size > 0) { - int node = stack[--stack_size]; - if (node >= unique->nUnique) { /* internal node */ - if (children[node].nChild <= 1) { - if (node != root) { - ReadTreeRemove(/*IN/OUT*/parent,/*IN/OUT*/children,node); - nRemoved++; - } else if (node == root && children[node].nChild == 1) { - int newroot = children[node].child[0]; - parent[newroot] = -1; - children[root].nChild = 0; - nRemoved++; - if(verbose > 5) - fprintf(stderr,"Changed root from %d to %d\n",root,newroot); - root = newroot; - stack[stack_size++] = newroot; - } - } else { - int j; - for (j = 0; j < children[node].nChild; j++) { - assert(stack_size < maxnodes); - stack[stack_size++] = children[node].child[j]; - if(verbose > 5) - fprintf(stderr,"Added %d to stack\n", stack[stack_size-1]); - } - } - } - } - } while (nRemoved > 0); - - /* Simplify the root node to 3 children if it has 2 */ - if (children[root].nChild == 2) { - for (i = 0; i < 2; i++) { - int child = children[root].child[i]; 
- assert(child >= 0 && child < maxnodes); - if (children[child].nChild == 2) { - ReadTreeRemove(parent,children,child); /* replace root -> child -> A,B with root->A,B */ - break; - } - } - } - - for (i = 0; i < maxnodes; i++) - if(verbose > 5) - fprintf(stderr,"Simplfied node %d has parent %d nchild %d\n", - i, parent[i], children[i].nChild); - - /* Map the remaining internal nodes to NJ nodes */ - int *map = (int*)mymalloc(sizeof(int)*maxnodes); - for (i = 0; i < unique->nUnique; i++) - map[i] = i; - for (i = unique->nUnique; i < maxnodes; i++) - map[i] = -1; - stack_size = 1; - stack[0] = root; - while (stack_size > 0) { - int node = stack[--stack_size]; - if (node >= unique->nUnique) { /* internal node */ - assert(node == root || children[node].nChild > 1); - map[node] = NJ->maxnode++; - for (i = 0; i < children[node].nChild; i++) { - assert(stack_size < maxnodes); - stack[stack_size++] = children[node].child[i]; - } - } - } - for (i = 0; i < maxnodes; i++) - if(verbose > 5) - fprintf(stderr,"Map %d to %d (parent %d nchild %d)\n", - i, map[i], parent[i], children[i].nChild); - - /* Set NJ->parent, NJ->children, NJ->root */ - NJ->root = map[root]; - int node; - for (node = 0; node < maxnodes; node++) { - int njnode = map[node]; - if (njnode >= 0) { - NJ->child[njnode].nChild = children[node].nChild; - for (i = 0; i < children[node].nChild; i++) { - assert(children[node].child[i] >= 0 && children[node].child[i] < maxnodes); - NJ->child[njnode].child[i] = map[children[node].child[i]]; - } - if (parent[node] >= 0) - NJ->parent[njnode] = map[parent[node]]; - } - } - - /* Make sure that parent/child relationships match */ - for (i = 0; i < NJ->maxnode; i++) { - children_t *c = &NJ->child[i]; - int j; - for (j = 0; j < c->nChild;j++) - assert(c->child[j] >= 0 && c->child[j] < NJ->maxnode && NJ->parent[c->child[j]] == i); - } - assert(NJ->parent[NJ->root] < 0); - - map = myfree(map,sizeof(int)*maxnodes); - stack = myfree(stack,sizeof(int)*maxnodes); - children = 
myfree(children,sizeof(children_t)*maxnodes); - parent = myfree(parent,sizeof(int)*maxnodes); - - /* Compute profiles as balanced -- the NNI stage will recompute these - profiles anyway - */ - traversal_t traversal = InitTraversal(NJ); - node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (node >= NJ->nSeq && node != NJ->root) - SetProfile(/*IN/OUT*/NJ, node, /*noweight*/-1.0); - } - traversal = FreeTraversal(traversal,NJ); -} - -/* Print topology using node indices as node names */ -void PrintNJInternal(FILE *fp, NJ_t *NJ, bool useLen) { - if (NJ->nSeq < 4) { - return; - } - typedef struct { int node; int end; } stack_t; - stack_t *stack = (stack_t *)mymalloc(sizeof(stack_t)*NJ->maxnodes); - int stackSize = 1; - stack[0].node = NJ->root; - stack[0].end = 0; - - while(stackSize>0) { - stack_t *last = &stack[stackSize-1]; - stackSize--; - /* Save last, as we are about to overwrite it */ - int node = last->node; - int end = last->end; - - if (node < NJ->nSeq) { - if (NJ->child[NJ->parent[node]].child[0] != node) fputs(",",fp); - fprintf(fp, "%d", node); - if (useLen) - fprintf(fp, ":%.4f", NJ->branchlength[node]); - } else if (end) { - fprintf(fp, ")%d", node); - if (useLen) - fprintf(fp, ":%.4f", NJ->branchlength[node]); - } else { - if (node != NJ->root && NJ->child[NJ->parent[node]].child[0] != node) fprintf(fp, ","); - fprintf(fp, "("); - stackSize++; - stack[stackSize-1].node = node; - stack[stackSize-1].end = 1; - children_t *c = &NJ->child[node]; - /* put children on in reverse order because we use the last one first */ - int i; - for (i = c->nChild-1; i >=0; i--) { - stackSize++; - stack[stackSize-1].node = c->child[i]; - stack[stackSize-1].end = 0; - } - } - } - fprintf(fp, ";\n"); - stack = myfree(stack, sizeof(stack_t)*NJ->maxnodes); -} - -void PrintNJ(FILE *fp, NJ_t *NJ, char **names, uniquify_t *unique, bool bShowSupport, bool bQuote) { - /* And print the tree: depth first search - * The stack 
contains - * list of remaining children with their depth - * parent node, with a flag of -1 so I know to print right-paren - */ - if (NJ->nSeq==1 && unique->alnNext[unique->uniqueFirst[0]] >= 0) { - /* Special case -- otherwise we end up with double parens */ - int first = unique->uniqueFirst[0]; - assert(first >= 0 && first < unique->nSeq); - fprintf(fp, bQuote ? "('%s':0.0" : "(%s:0.0", names[first]); - int iName = unique->alnNext[first]; - while (iName >= 0) { - assert(iName < unique->nSeq); - fprintf(fp, bQuote ? ",'%s':0.0" : ",%s:0.0", names[iName]); - iName = unique->alnNext[iName]; - } - fprintf(fp,");\n"); - return; - } - - typedef struct { int node; int end; } stack_t; - stack_t *stack = (stack_t *)mymalloc(sizeof(stack_t)*NJ->maxnodes); - int stackSize = 1; - stack[0].node = NJ->root; - stack[0].end = 0; - - while(stackSize>0) { - stack_t *last = &stack[stackSize-1]; - stackSize--; - /* Save last, as we are about to overwrite it */ - int node = last->node; - int end = last->end; - - if (node < NJ->nSeq) { - if (NJ->child[NJ->parent[node]].child[0] != node) fputs(",",fp); - int first = unique->uniqueFirst[node]; - assert(first >= 0 && first < unique->nSeq); - /* Print the name, or the subtree of duplicate names */ - if (unique->alnNext[first] == -1) { - fprintf(fp, bQuote ? "'%s'" : "%s", names[first]); - } else { - fprintf(fp, bQuote ? "('%s':0.0" : "(%s:0.0", names[first]); - int iName = unique->alnNext[first]; - while (iName >= 0) { - assert(iName < unique->nSeq); - fprintf(fp, bQuote ? 
",'%s':0.0" : ",%s:0.0", names[iName]); - iName = unique->alnNext[iName]; - } - fprintf(fp,")"); - } - /* Print the branch length */ -#ifdef USE_DOUBLE -#define FP_FORMAT "%.9f" -#else -#define FP_FORMAT "%.5f" -#endif - fprintf(fp, ":" FP_FORMAT, NJ->branchlength[node]); - } else if (end) { - if (node == NJ->root) - fprintf(fp, ")"); - else if (bShowSupport) - fprintf(fp, ")%.3f:" FP_FORMAT, NJ->support[node], NJ->branchlength[node]); - else - fprintf(fp, "):" FP_FORMAT, NJ->branchlength[node]); - } else { - if (node != NJ->root && NJ->child[NJ->parent[node]].child[0] != node) fprintf(fp, ","); - fprintf(fp, "("); - stackSize++; - stack[stackSize-1].node = node; - stack[stackSize-1].end = 1; - children_t *c = &NJ->child[node]; - /* put children on in reverse order because we use the last one first */ - int i; - for (i = c->nChild-1; i >=0; i--) { - stackSize++; - stack[stackSize-1].node = c->child[i]; - stack[stackSize-1].end = 0; - } - } - } - fprintf(fp, ";\n"); - stack = myfree(stack, sizeof(stack_t)*NJ->maxnodes); -} - -alignment_t *ReadAlignment(/*IN*/FILE *fp, bool bQuote) { - /* bQuote supports the -quote option */ - int nSeq = 0; - int nPos = 0; - char **names = NULL; - char **seqs = NULL; - char buf[BUFFER_SIZE] = ""; - if (fgets(buf,sizeof(buf),fp) == NULL) { - fprintf(stderr, "Error reading header line\n"); - exit(1); - } - int nSaved = 100; - if (buf[0] == '>') { - /* FASTA, truncate names at any of these */ - char *nameStop = bQuote ? 
"'\t\r\n" : "(),: \t\r\n"; - char *seqSkip = " \t\r\n"; /* skip these characters in the sequence */ - seqs = (char**)mymalloc(sizeof(char*) * nSaved); - names = (char**)mymalloc(sizeof(char*) * nSaved); - - do { - /* loop over lines */ - if (buf[0] == '>') { - /* truncate the name */ - char *p, *q; - for (p = buf+1; *p != '\0'; p++) { - for (q = nameStop; *q != '\0'; q++) { - if (*p == *q) { - *p = '\0'; - break; - } - } - if (*p == '\0') break; - } - - /* allocate space for another sequence */ - nSeq++; - if (nSeq > nSaved) { - int nNewSaved = nSaved*2; - seqs = myrealloc(seqs,sizeof(char*)*nSaved,sizeof(char*)*nNewSaved, /*copy*/false); - names = myrealloc(names,sizeof(char*)*nSaved,sizeof(char*)*nNewSaved, /*copy*/false); - nSaved = nNewSaved; - } - names[nSeq-1] = (char*)mymemdup(buf+1,strlen(buf)); - seqs[nSeq-1] = NULL; - } else { - /* count non-space characters and append to sequence */ - int nKeep = 0; - char *p, *q; - for (p=buf; *p != '\0'; p++) { - for (q=seqSkip; *q != '\0'; q++) { - if (*p == *q) - break; - } - if (*p != *q) - nKeep++; - } - int nOld = (seqs[nSeq-1] == NULL) ? 
0 : strlen(seqs[nSeq-1]); - seqs[nSeq-1] = (char*)myrealloc(seqs[nSeq-1], nOld, nOld+nKeep+1, /*copy*/false); - if (nOld+nKeep > nPos) - nPos = nOld + nKeep; - char *out = seqs[nSeq-1] + nOld; - for (p=buf; *p != '\0'; p++) { - for (q=seqSkip; *q != '\0'; q++) { - if (*p == *q) - break; - } - if (*p != *q) { - *out = *p; - out++; - } - } - assert(out-seqs[nSeq-1] == nKeep + nOld); - *out = '\0'; - } - } while(fgets(buf,sizeof(buf),fp) != NULL); - - if (seqs[nSeq-1] == NULL) { - fprintf(stderr, "No sequence data for last entry %s\n",names[nSeq-1]); - exit(1); - } - names = myrealloc(names,sizeof(char*)*nSaved,sizeof(char*)*nSeq, /*copy*/false); - seqs = myrealloc(seqs,sizeof(char*)*nSaved,sizeof(char*)*nSeq, /*copy*/false); - } else { - /* PHYLIP interleaved-like format - Allow arbitrary length names, require spaces between names and sequences - Allow multiple alignments, either separated by a single empty line (e.g. seqboot output) - or not. - */ - if (buf[0] == '\n' || buf[0] == '\r') { - if (fgets(buf,sizeof(buf),fp) == NULL) { - fprintf(stderr, "Empty header line followed by EOF\n"); - exit(1); - } - } - if (sscanf(buf, "%d%d", &nSeq, &nPos) != 2 - || nSeq < 1 || nPos < 1) { - fprintf(stderr, "Error parsing header line:%s\n", buf); - exit(1); - } - names = (char **)mymalloc(sizeof(char*) * nSeq); - seqs = (char **)mymalloc(sizeof(char*) * nSeq); - nSaved = nSeq; - - int i; - for (i = 0; i < nSeq; i++) { - names[i] = NULL; - seqs[i] = (char *)mymalloc(nPos+1); /* null-terminate */ - seqs[i][0] = '\0'; - } - int iSeq = 0; - - while(fgets(buf,sizeof(buf),fp)) { - if ((buf[0] == '\n' || buf[0] == '\r') && (iSeq == nSeq || iSeq == 0)) { - iSeq = 0; - } else { - int j = 0; /* character just past end of name */ - if (buf[0] == ' ') { - if (names[iSeq] == NULL) { - fprintf(stderr, "No name in phylip line %s", buf); - exit(1); - } - } else { - while (buf[j] != '\n' && buf[j] != '\0' && buf[j] != ' ') - j++; - if (buf[j] != ' ' || j == 0) { - fprintf(stderr, "No sequence 
in phylip line %s", buf); - exit(1); - } - if (iSeq >= nSeq) { - fprintf(stderr, "No empty line between sequence blocks (is the sequence count wrong?)\n"); - exit(1); - } - if (names[iSeq] == NULL) { - /* save the name */ - names[iSeq] = (char *)mymalloc(j+1); - int k; - for (k = 0; k < j; k++) names[iSeq][k] = buf[k]; - names[iSeq][j] = '\0'; - } else { - /* check the name */ - int k; - int match = 1; - for (k = 0; k < j; k++) { - if (names[iSeq][k] != buf[k]) { - match = 0; - break; - } - } - if (!match || names[iSeq][j] != '\0') { - fprintf(stderr, "Wrong name in phylip line %s\nExpected %s\n", buf, names[iSeq]); - exit(1); - } - } - } - int seqlen = strlen(seqs[iSeq]); - for (; buf[j] != '\n' && buf[j] != '\0'; j++) { - if (buf[j] != ' ') { - if (seqlen >= nPos) { - fprintf(stderr, "Too many characters (expected %d) for sequence named %s\nSo far have:\n%s\n", - nPos, names[iSeq], seqs[iSeq]); - exit(1); - } - seqs[iSeq][seqlen++] = toupper(buf[j]); - } - } - seqs[iSeq][seqlen] = '\0'; /* null-terminate */ - if(verbose>10) fprintf(stderr,"Read iSeq %d name %s seqsofar %s\n", iSeq, names[iSeq], seqs[iSeq]); - iSeq++; - if (iSeq == nSeq && strlen(seqs[0]) == nPos) - break; /* finished alignment */ - } /* end else non-empty phylip line */ - } - if (iSeq != nSeq && iSeq != 0) { - fprintf(stderr, "Wrong number of sequences: expected %d\n", nSeq); - exit(1); - } - } - /* Check lengths of sequences */ - int i; - for (i = 0; i < nSeq; i++) { - int seqlen = strlen(seqs[i]); - if (seqlen != nPos) { - fprintf(stderr, "Wrong number of characters for %s: expected %d but have %d instead.\n" - "This sequence may be truncated, or another sequence may be too long.\n", - names[i], nPos, seqlen); - exit(1); - } - } - /* Replace "." 
with "-" and warn if we find any */ - /* If nucleotide sequences, replace U with T and N with X */ - bool findDot = false; - for (i = 0; i < nSeq; i++) { - char *p; - for (p = seqs[i]; *p != '\0'; p++) { - if (*p == '.') { - findDot = true; - *p = '-'; - } - if (nCodes == 4 && *p == 'U') - *p = 'T'; - if (nCodes == 4 && *p == 'N') - *p = 'X'; - } - } - if (findDot) - fprintf(stderr, "Warning! Found \".\" character(s). These are treated as gaps\n"); - - if (ferror(fp)) { - fprintf(stderr, "Error reading input file\n"); - exit(1); - } - - alignment_t *align = (alignment_t*)mymalloc(sizeof(alignment_t)); - align->nSeq = nSeq; - align->nPos = nPos; - align->names = names; - align->seqs = seqs; - align->nSaved = nSaved; - return(align); -} - -void FreeAlignmentSeqs(/*IN/OUT*/alignment_t *aln) { - assert(aln != NULL); - int i; - for (i = 0; i < aln->nSeq; i++) - aln->seqs[i] = myfree(aln->seqs[i], aln->nPos+1); -} - -alignment_t *FreeAlignment(alignment_t *aln) { - if(aln==NULL) - return(NULL); - int i; - for (i = 0; i < aln->nSeq; i++) { - aln->names[i] = myfree(aln->names[i],strlen(aln->names[i])+1); - aln->seqs[i] = myfree(aln->seqs[i], aln->nPos+1); - } - aln->names = myfree(aln->names, sizeof(char*)*aln->nSaved); - aln->seqs = myfree(aln->seqs, sizeof(char*)*aln->nSaved); - myfree(aln, sizeof(alignment_t)); - return(NULL); -} - -char **AlnToConstraints(alignment_t *constraints, uniquify_t *unique, hashstrings_t *hashnames) { - /* look up constraints as names and map to unique-space */ - char ** uniqConstraints = (char**)mymalloc(sizeof(char*) * unique->nUnique); - int i; - for (i = 0; i < unique->nUnique; i++) - uniqConstraints[i] = NULL; - for (i = 0; i < constraints->nSeq; i++) { - char *name = constraints->names[i]; - char *constraintSeq = constraints->seqs[i]; - hashiterator_t hi = FindMatch(hashnames,name); - if (HashCount(hashnames,hi) != 1) { - fprintf(stderr, "Sequence %s from constraints file is not in the alignment\n", name); - exit(1); - } - int 
iSeqNonunique = HashFirst(hashnames,hi); - assert(iSeqNonunique >= 0 && iSeqNonunique < unique->nSeq); - int iSeqUnique = unique->alnToUniq[iSeqNonunique]; - assert(iSeqUnique >= 0 && iSeqUnique < unique->nUnique); - if (uniqConstraints[iSeqUnique] != NULL) { - /* Already set a constraint for this group of sequences! - Warn that we are ignoring this one unless the constraints match */ - if (strcmp(uniqConstraints[iSeqUnique],constraintSeq) != 0) { - fprintf(stderr, - "Warning: ignoring constraints for %s:\n%s\n" - "Another sequence has the same sequence but different constraints\n", - name, constraintSeq); - } - } else { - uniqConstraints[iSeqUnique] = constraintSeq; - } - } - return(uniqConstraints); -} - - -profile_t *SeqToProfile(/*IN/OUT*/NJ_t *NJ, - char *seq, int nPos, - /*OPTIONAL*/char *constraintSeq, int nConstraints, - int iNode, - unsigned long counts[256]) { - static unsigned char charToCode[256]; - static int codeSet = 0; - int c, i; - - if (!codeSet) { - for (c = 0; c < 256; c++) { - charToCode[c] = nCodes; - } - for (i = 0; codesString[i]; i++) { - charToCode[codesString[i]] = i; - charToCode[tolower(codesString[i])] = i; - } - charToCode['-'] = NOCODE; - codeSet=1; - } - - assert(strlen(seq) == nPos); - profile_t *profile = NewProfile(nPos,nConstraints); - - for (i = 0; i < nPos; i++) { - unsigned int character = (unsigned int) seq[i]; - counts[character]++; - c = charToCode[character]; - if(verbose>10 && i < 2) fprintf(stderr,"pos %d char %c code %d\n", i, seq[i], c); - /* treat unknowns as gaps */ - if (c == nCodes || c == NOCODE) { - profile->codes[i] = NOCODE; - profile->weights[i] = 0.0; - } else { - profile->codes[i] = c; - profile->weights[i] = 1.0; - } - } - if (nConstraints > 0) { - for (i = 0; i < nConstraints; i++) { - profile->nOn[i] = 0; - profile->nOff[i] = 0; - } - bool bWarn = false; - if (constraintSeq != NULL) { - assert(strlen(constraintSeq) == nConstraints); - for (i = 0; i < nConstraints; i++) { - if (constraintSeq[i] == '1') { 
- profile->nOn[i] = 1; - } else if (constraintSeq[i] == '0') { - profile->nOff[i] = 1; - } else if (constraintSeq[i] != '-') { - if (!bWarn) { - fprintf(stderr, "Constraint characters in unique sequence %d replaced with gap:", iNode+1); - bWarn = true; - } - fprintf(stderr, " %c%d", constraintSeq[i], i+1); - /* For the benefit of ConstraintSequencePenalty -- this is a bit of a hack, as - this modifies the value read from the alignment - */ - constraintSeq[i] = '-'; - } - } - if (bWarn) - fprintf(stderr, "\n"); - } - } - return profile; -} - -void SeqDist(unsigned char *codes1, unsigned char *codes2, int nPos, - distance_matrix_t *dmat, - /*OUT*/besthit_t *hit) { - double top = 0; /* summed over positions */ - int nUse = 0; - int i; - if (dmat==NULL) { - int nDiff = 0; - for (i = 0; i < nPos; i++) { - if (codes1[i] != NOCODE && codes2[i] != NOCODE) { - nUse++; - if (codes1[i] != codes2[i]) nDiff++; - } - } - top = (double)nDiff; - } else { - for (i = 0; i < nPos; i++) { - if (codes1[i] != NOCODE && codes2[i] != NOCODE) { - nUse++; - top += dmat->distances[(unsigned int)codes1[i]][(unsigned int)codes2[i]]; - } - } - } - hit->weight = (double)nUse; - hit->dist = nUse > 0 ? 
top/(double)nUse : 1.0; - seqOps++; -} - -void CorrectedPairDistances(profile_t **profiles, int nProfiles, - /*OPTIONAL*/distance_matrix_t *distance_matrix, - int nPos, - /*OUT*/double *distances) { - assert(distances != NULL); - assert(profiles != NULL); - assert(nProfiles>1 && nProfiles <= 4); - besthit_t hit[6]; - int iHit,i,j; - - for (iHit=0, i=0; i < nProfiles; i++) { - for (j=i+1; j < nProfiles; j++, iHit++) { - ProfileDist(profiles[i],profiles[j],nPos,distance_matrix,/*OUT*/&hit[iHit]); - distances[iHit] = hit[iHit].dist; - } - } - if (pseudoWeight > 0) { - /* Estimate the prior distance */ - double dTop = 0; - double dBottom = 0; - for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) { - dTop += hit[iHit].dist * hit[iHit].weight; - dBottom += hit[iHit].weight; - } - double prior = (dBottom > 0.01) ? dTop/dBottom : 3.0; - for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) - distances[iHit] = (distances[iHit] * hit[iHit].weight + prior * pseudoWeight) - / (hit[iHit].weight + pseudoWeight); - } - if (logdist) { - for (iHit=0; iHit < (nProfiles*(nProfiles-1))/2; iHit++) - distances[iHit] = LogCorrect(distances[iHit]); - } -} - -/* During the neighbor-joining phase, a join only violates our constraints if - node1, node2, and other are all represented in the constraint - and if one of the 3 is split and the other two do not agree - */ -int JoinConstraintPenalty(/*IN*/NJ_t *NJ, int node1, int node2) { - if (NJ->nConstraints == 0) - return(0.0); - int penalty = 0; - int iC; - for (iC = 0; iC < NJ->nConstraints; iC++) - penalty += JoinConstraintPenaltyPiece(NJ, node1, node2, iC); - return(penalty); -} - -int JoinConstraintPenaltyPiece(NJ_t *NJ, int node1, int node2, int iC) { - profile_t *pOut = NJ->outprofile; - profile_t *p1 = NJ->profiles[node1]; - profile_t *p2 = NJ->profiles[node2]; - int nOn1 = p1->nOn[iC]; - int nOff1 = p1->nOff[iC]; - int nOn2 = p2->nOn[iC]; - int nOff2 = p2->nOff[iC]; - int nOnOut = pOut->nOn[iC] - nOn1 - nOn2; - int nOffOut = 
pOut->nOff[iC] - nOff1 - nOff2; - - if ((nOn1+nOff1) > 0 && (nOn2+nOff2) > 0 && (nOnOut+nOffOut) > 0) { - /* code is -1 for split, 0 for off, 1 for on */ - int code1 = (nOn1 > 0 && nOff1 > 0) ? -1 : (nOn1 > 0 ? 1 : 0); - int code2 = (nOn2 > 0 && nOff2 > 0) ? -1 : (nOn2 > 0 ? 1 : 0); - int code3 = (nOnOut > 0 && nOffOut) > 0 ? -1 : (nOnOut > 0 ? 1 : 0); - int nSplit = (code1 == -1 ? 1 : 0) + (code2 == -1 ? 1 : 0) + (code3 == -1 ? 1 : 0); - int nOn = (code1 == 1 ? 1 : 0) + (code2 == 1 ? 1 : 0) + (code3 == 1 ? 1 : 0); - if (nSplit == 1 && nOn == 1) - return(SplitConstraintPenalty(nOn1+nOn2, nOff1+nOff2, nOnOut, nOffOut)); - } - /* else */ - return(0); -} - -void QuartetConstraintPenalties(profile_t *profiles[4], int nConstraints, /*OUT*/double penalty[3]) { - int i; - for (i=0; i < 3; i++) - penalty[i] = 0.0; - if(nConstraints == 0) - return; - int iC; - for (iC = 0; iC < nConstraints; iC++) { - double part[3]; - if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/part)) { - for (i=0;i<3;i++) - penalty[i] += part[i]; - - if (verbose>2 - && (fabs(part[ABvsCD]-part[ACvsBD]) > 0.001 || fabs(part[ABvsCD]-part[ADvsBC]) > 0.001)) - fprintf(stderr, "Constraint Penalties at %d: ABvsCD %.3f ACvsBD %.3f ADvsBC %.3f %d/%d %d/%d %d/%d %d/%d\n", - iC, part[ABvsCD], part[ACvsBD], part[ADvsBC], - profiles[0]->nOn[iC], profiles[0]->nOff[iC], - profiles[1]->nOn[iC], profiles[1]->nOff[iC], - profiles[2]->nOn[iC], profiles[2]->nOff[iC], - profiles[3]->nOn[iC], profiles[3]->nOff[iC]); - } - } - if (verbose>2) - fprintf(stderr, "Total Constraint Penalties: ABvsCD %.3f ACvsBD %.3f ADvsBC %.3f\n", - penalty[ABvsCD], penalty[ACvsBD], penalty[ADvsBC]); -} - -double PairConstraintDistance(int nOn1, int nOff1, int nOn2, int nOff2) { - double f1 = nOn1/(double)(nOn1+nOff1); - double f2 = nOn2/(double)(nOn2+nOff2); - /* 1 - f1 * f2 - (1-f1)*(1-f2) = 1 - f1 * f2 - 1 + f1 + f2 - f1 * f2 */ - return(f1 + f2 - 2.0 * f1 * f2); -} - -bool QuartetConstraintPenaltiesPiece(profile_t *profiles[4], int 
/* Penalty for a split whose two sides hold (nOn1, nOff1) and (nOn2, nOff2)
   constrained leaves: the minimum number of constrained leaves that need to
   be moved to satisfy the constraint (0 if it is already satisfied).
   Defining it this way should ensure that SPR moves that break constraints
   get a penalty.

   Two pairings are possible -- "on" leaves of side 1 with "off" leaves of
   side 2, or the reverse.  The pairing with the strictly smaller total is
   used (ties go to the second pairing) and its smaller member is returned.
*/
int SplitConstraintPenalty(int nOn1, int nOff1, int nOn2, int nOff2) {
  int first, second;

  if (nOn1 + nOff2 < nOn2 + nOff1) {
    first = nOn1;
    second = nOff2;
  } else {
    first = nOn2;
    second = nOff1;
  }
  if (first < second)
    return first;
  return second;
}
/* E.g. GET_FREQ(profile,iPos,iVector)
   Evaluates to the frequency vector stored for position I of profile P and
   post-increments IVECTOR, or evaluates to NULL (leaving IVECTOR untouched)
   if no vector is stored there.  A vector is stored exactly when the
   position has weight > 0 and code NOCODE.
   P and I are evaluated more than once, so they must be free of side
   effects; IVECTOR must be a modifiable integer lvalue.
   Every parameter is parenthesized so that arguments such as &profile or
   i+1 expand as intended.
*/
#define GET_FREQ(P,I,IVECTOR) \
((P)->weights[(I)] > 0 && (P)->codes[(I)] == NOCODE ? &(P)->vectors[nCodes*((IVECTOR)++)] : NULL)
denom : 0.01; /* 0.01 is an arbitrarily low value of weight (normally >>1) */ - hit->dist = denom > 0 ? top/denom : 1; - profileOps++; -} - -/* This should not be called if the update weight is 0, as - in that case code==NOCODE and in=NULL is possible, and then - it will fail. -*/ -void AddToFreq(/*IN/OUT*/numeric_t *fOut, - double weight, - unsigned int codeIn, /*OPTIONAL*/numeric_t *fIn, - /*OPTIONAL*/distance_matrix_t *dmat) { - assert(fOut != NULL); - if (fIn != NULL) { - vector_add_mult(fOut, fIn, weight, nCodes); - } else if (dmat) { - assert(codeIn != NOCODE); - vector_add_mult(fOut, dmat->codeFreq[codeIn], weight, nCodes); - } else { - assert(codeIn != NOCODE); - fOut[codeIn] += weight; - } -} - -void SetProfile(/*IN/OUT*/NJ_t *NJ, int node, double weight1) { - children_t *c = &NJ->child[node]; - assert(c->nChild == 2); - assert(NJ->profiles[c->child[0]] != NULL); - assert(NJ->profiles[c->child[1]] != NULL); - if (NJ->profiles[node] != NULL) - FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); - NJ->profiles[node] = AverageProfile(NJ->profiles[c->child[0]], - NJ->profiles[c->child[1]], - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix, - weight1); -} - -/* bionjWeight is the weight of the first sequence (between 0 and 1), - or -1 to do the average. 
- */ -profile_t *AverageProfile(profile_t *profile1, profile_t *profile2, - int nPos, int nConstraints, - distance_matrix_t *dmat, - double bionjWeight) { - int i; - if (bionjWeight < 0) { - bionjWeight = 0.5; - } - - /* First, set codes and weights and see how big vectors will be */ - profile_t *out = NewProfile(nPos, nConstraints); - - for (i = 0; i < nPos; i++) { - out->weights[i] = bionjWeight * profile1->weights[i] - + (1-bionjWeight) * profile2->weights[i]; - out->codes[i] = NOCODE; - if (out->weights[i] > 0) { - if (profile1->weights[i] > 0 && profile1->codes[i] != NOCODE - && (profile2->weights[i] <= 0 || profile1->codes[i] == profile2->codes[i])) { - out->codes[i] = profile1->codes[i]; - } else if (profile1->weights[i] <= 0 - && profile2->weights[i] > 0 - && profile2->codes[i] != NOCODE) { - out->codes[i] = profile2->codes[i]; - } - if (out->codes[i] == NOCODE) out->nVectors++; - } - } - - /* Allocate and set the vectors */ - out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); - for (i = 0; i < nCodes * out->nVectors; i++) out->vectors[i] = 0; - nProfileFreqAlloc += out->nVectors; - nProfileFreqAvoid += nPos - out->nVectors; - int iFreqOut = 0; - int iFreq1 = 0; - int iFreq2 = 0; - for (i=0; i < nPos; i++) { - numeric_t *f = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - numeric_t *f1 = GET_FREQ(profile1,i,/*IN/OUT*/iFreq1); - numeric_t *f2 = GET_FREQ(profile2,i,/*IN/OUT*/iFreq2); - if (f != NULL) { - if (profile1->weights[i] > 0) - AddToFreq(/*IN/OUT*/f, profile1->weights[i] * bionjWeight, - profile1->codes[i], f1, dmat); - if (profile2->weights[i] > 0) - AddToFreq(/*IN/OUT*/f, profile2->weights[i] * (1.0-bionjWeight), - profile2->codes[i], f2, dmat); - NormalizeFreq(/*IN/OUT*/f, dmat); - } /* end if computing f */ - if (verbose > 10 && i < 5) { - fprintf(stderr,"Average profiles: pos %d in-w1 %f in-w2 %f bionjWeight %f to weight %f code %d\n", - i, profile1->weights[i], profile2->weights[i], bionjWeight, - out->weights[i], out->codes[i]); 
- if (f!= NULL) { - int k; - for (k = 0; k < nCodes; k++) - fprintf(stderr, "\t%c:%f", codesString[k], f ? f[k] : -1.0); - fprintf(stderr,"\n"); - } - } - } /* end loop over positions */ - assert(iFreq1 == profile1->nVectors); - assert(iFreq2 == profile2->nVectors); - assert(iFreqOut == out->nVectors); - - /* compute total constraints */ - for (i = 0; i < nConstraints; i++) { - out->nOn[i] = profile1->nOn[i] + profile2->nOn[i]; - out->nOff[i] = profile1->nOff[i] + profile2->nOff[i]; - } - profileAvgOps++; - return(out); -} - -/* Make the (unrotated) frequencies sum to 1 - Simply dividing by total_weight is not ideal because of roundoff error - So compute total_freq instead -*/ -void NormalizeFreq(/*IN/OUT*/numeric_t *freq, distance_matrix_t *dmat) { - double total_freq = 0; - int k; - if (dmat != NULL) { - /* The total frequency is dot_product(true_frequencies, 1) - So we rotate the 1 vector by eigeninv (stored in eigentot) - */ - total_freq = vector_multiply_sum(freq, dmat->eigentot, nCodes); - } else { - for (k = 0; k < nCodes; k++) - total_freq += freq[k]; - } - if (total_freq > fPostTotalTolerance) { - numeric_t inverse_weight = 1.0/total_freq; - vector_multiply_by(/*IN/OUT*/freq, inverse_weight, nCodes); - } else { - /* This can happen if we are in a very low-weight region, e.g. 
if a mostly-gap position gets weighted down - repeatedly; just set them all to arbitrary but legal values */ - if (dmat == NULL) { - for (k = 0; k < nCodes; k++) - freq[k] = 1.0/nCodes; - } else { - for (k = 0; k < nCodes; k++) - freq[k] = dmat->codeFreq[0][k]; - } - } -} - -/* OutProfile() computes the out-profile */ -profile_t *OutProfile(profile_t **profiles, int nProfiles, - int nPos, int nConstraints, - distance_matrix_t *dmat) { - int i; /* position */ - int in; /* profile */ - profile_t *out = NewProfile(nPos, nConstraints); - - double inweight = 1.0/(double)nProfiles; /* The maximal output weight is 1.0 */ - - /* First, set weights -- code is always NOCODE, prevent weight=0 */ - for (i = 0; i < nPos; i++) { - out->weights[i] = 0; - for (in = 0; in < nProfiles; in++) - out->weights[i] += profiles[in]->weights[i] * inweight; - if (out->weights[i] <= 0) out->weights[i] = 1e-20; /* always store a vector */ - out->nVectors++; - out->codes[i] = NOCODE; /* outprofile is normally complicated */ - } - - /* Initialize the frequencies to 0 */ - out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); - for (i = 0; i < nCodes*out->nVectors; i++) - out->vectors[i] = 0; - - /* Add up the weights, going through each sequence in turn */ - for (in = 0; in < nProfiles; in++) { - int iFreqOut = 0; - int iFreqIn = 0; - for (i = 0; i < nPos; i++) { - numeric_t *fIn = GET_FREQ(profiles[in],i,/*IN/OUT*/iFreqIn); - numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - if (profiles[in]->weights[i] > 0) - AddToFreq(/*IN/OUT*/fOut, profiles[in]->weights[i], - profiles[in]->codes[i], fIn, dmat); - } - assert(iFreqOut == out->nVectors); - assert(iFreqIn == profiles[in]->nVectors); - } - - /* And normalize the frequencies to sum to 1 */ - int iFreqOut = 0; - for (i = 0; i < nPos; i++) { - numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - if (fOut) - NormalizeFreq(/*IN/OUT*/fOut, dmat); - } - assert(iFreqOut == out->nVectors); - if (verbose > 10) 
fprintf(stderr,"Average %d profiles\n", nProfiles); - if(dmat) - SetCodeDist(/*IN/OUT*/out, nPos, dmat); - - /* Compute constraints */ - for (i = 0; i < nConstraints; i++) { - out->nOn[i] = 0; - out->nOff[i] = 0; - for (in = 0; in < nProfiles; in++) { - out->nOn[i] += profiles[in]->nOn[i]; - out->nOff[i] += profiles[in]->nOff[i]; - } - } - return(out); -} - -void UpdateOutProfile(/*IN/OUT*/profile_t *out, profile_t *old1, profile_t *old2, - profile_t *new, int nActiveOld, - int nPos, int nConstraints, - distance_matrix_t *dmat) { - int i, k; - int iFreqOut = 0; - int iFreq1 = 0; - int iFreq2 = 0; - int iFreqNew = 0; - assert(nActiveOld > 0); - - for (i = 0; i < nPos; i++) { - numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - numeric_t *fOld1 = GET_FREQ(old1,i,/*IN/OUT*/iFreq1); - numeric_t *fOld2 = GET_FREQ(old2,i,/*IN/OUT*/iFreq2); - numeric_t *fNew = GET_FREQ(new,i,/*IN/OUT*/iFreqNew); - - assert(out->codes[i] == NOCODE && fOut != NULL); /* No no-vector optimization for outprofiles */ - if (verbose > 3 && i < 3) { - fprintf(stderr,"Updating out-profile position %d weight %f (mult %f)\n", - i, out->weights[i], out->weights[i]*nActiveOld); - } - double originalMult = out->weights[i]*nActiveOld; - double newMult = originalMult + new->weights[i] - old1->weights[i] - old2->weights[i]; - out->weights[i] = newMult/(nActiveOld-1); - if (out->weights[i] <= 0) out->weights[i] = 1e-20; /* always use the vector */ - - for (k = 0; k < nCodes; k++) fOut[k] *= originalMult; - - if (old1->weights[i] > 0) - AddToFreq(/*IN/OUT*/fOut, -old1->weights[i], old1->codes[i], fOld1, dmat); - if (old2->weights[i] > 0) - AddToFreq(/*IN/OUT*/fOut, -old2->weights[i], old2->codes[i], fOld2, dmat); - if (new->weights[i] > 0) - AddToFreq(/*IN/OUT*/fOut, new->weights[i], new->codes[i], fNew, dmat); - - /* And renormalize */ - NormalizeFreq(/*IN/OUT*/fOut, dmat); - - if (verbose > 2 && i < 3) { - fprintf(stderr,"Updated out-profile position %d weight %f (mult %f)", - i, out->weights[i], 
out->weights[i]*nActiveOld); - if(out->weights[i] > 0) - for (k=0;k<nCodes;k++) - fprintf(stderr, " %c:%f", dmat?'?':codesString[k], fOut[k]); - fprintf(stderr,"\n"); - } - } - assert(iFreqOut == out->nVectors); - assert(iFreq1 == old1->nVectors); - assert(iFreq2 == old2->nVectors); - assert(iFreqNew == new->nVectors); - if(dmat) - SetCodeDist(/*IN/OUT*/out,nPos,dmat); - - /* update constraints -- note in practice this should be a no-op */ - for (i = 0; i < nConstraints; i++) { - out->nOn[i] += new->nOn[i] - old1->nOn[i] - old2->nOn[i]; - out->nOff[i] += new->nOff[i] - old1->nOff[i] - old2->nOff[i]; - } -} - -void SetCodeDist(/*IN/OUT*/profile_t *profile, int nPos, - distance_matrix_t *dmat) { - if (profile->codeDist == NULL) - profile->codeDist = (numeric_t*)mymalloc(sizeof(numeric_t)*nPos*nCodes); - int i; - int iFreq = 0; - for (i = 0; i < nPos; i++) { - numeric_t *f = GET_FREQ(profile,i,/*IN/OUT*/iFreq); - - int k; - for (k = 0; k < nCodes; k++) - profile->codeDist[i*nCodes+k] = ProfileDistPiece(/*code1*/profile->codes[i], /*code2*/k, - /*f1*/f, /*f2*/NULL, - dmat, NULL); - } - assert(iFreq==profile->nVectors); -} - - -void SetBestHit(int node, NJ_t *NJ, int nActive, - /*OUT*/besthit_t *bestjoin, /*OUT OPTIONAL*/besthit_t *allhits) { - assert(NJ->parent[node] < 0); - - bestjoin->i = node; - bestjoin->j = -1; - bestjoin->dist = 1e20; - bestjoin->criterion = 1e20; - - int j; - besthit_t tmp; - -#ifdef OPENMP - /* Note -- if we are already in a parallel region, this will be ignored */ - #pragma omp parallel for schedule(dynamic, 50) -#endif - for (j = 0; j < NJ->maxnode; j++) { - besthit_t *sv = allhits != NULL ? 
&allhits[j] : &tmp; - sv->i = node; - sv->j = j; - if (NJ->parent[j] >= 0) { - sv->i = -1; /* illegal/empty join */ - sv->weight = 0.0; - sv->criterion = sv->dist = 1e20; - continue; - } - /* Note that we compute self-distances (allow j==node) because the top-hit heuristic - expects self to be within its top hits, but we exclude those from the bestjoin - that we return... - */ - SetDistCriterion(NJ, nActive, /*IN/OUT*/sv); - if (sv->criterion < bestjoin->criterion && node != j) - *bestjoin = *sv; - } - if (verbose>5) { - fprintf(stderr, "SetBestHit %d %d %f %f\n", bestjoin->i, bestjoin->j, bestjoin->dist, bestjoin->criterion); - } -} - -void ReadMatrix(char *filename, /*OUT*/numeric_t codes[MAXCODES][MAXCODES], bool checkCodes) { - char buf[BUFFER_SIZE] = ""; - FILE *fp = fopen(filename, "r"); - if (fp == NULL) { - fprintf(stderr, "Cannot read %s\n",filename); - exit(1); - } - if (fgets(buf,sizeof(buf),fp) == NULL) { - fprintf(stderr, "Error reading header line for %s:\n%s\n", filename, buf); - exit(1); - } - if (checkCodes) { - int i; - int iBufPos; - for (iBufPos=0,i=0;i<nCodes;i++,iBufPos++) { - if(buf[iBufPos] != codesString[i]) { - fprintf(stderr,"Header line\n%s\nin file %s does not have expected code %c # %d in %s\n", - buf, filename, codesString[i], i, codesString); - exit(1); - } - iBufPos++; - if(buf[iBufPos] != '\n' && buf[iBufPos] != '\r' && buf[iBufPos] != '\0' && buf[iBufPos] != '\t') { - fprintf(stderr, "Header line in %s should be tab-delimited\n", filename); - exit(1); - } - if (buf[iBufPos] == '\0' && i < nCodes-1) { - fprintf(stderr, "Header line in %s ends prematurely\n",filename); - exit(1); - } - } /* end loop over codes */ - /* Should be at end, but allow \n because of potential DOS \r\n */ - if(buf[iBufPos] != '\0' && buf[iBufPos] != '\n' && buf[iBufPos] != '\r') { - fprintf(stderr, "Header line in %s has too many entries\n", filename); - exit(1); - } - } - int iLine; - for (iLine = 0; iLine < nCodes; iLine++) { - buf[0] = '\0'; - if 
(fgets(buf,sizeof(buf),fp) == NULL) { - fprintf(stderr, "Cannot read line %d from file %s\n", iLine+2, filename); - exit(1); - } - char *field = strtok(buf,"\t\r\n"); - field = strtok(NULL, "\t"); /* ignore first column */ - int iColumn; - for (iColumn = 0; iColumn < nCodes && field != NULL; iColumn++, field = strtok(NULL,"\t")) { - if(sscanf(field,ScanNumericSpec,&codes[iLine][iColumn]) != 1) { - fprintf(stderr,"Cannot parse field %s in file %s\n", field, filename); - exit(1); - } - } - } -} - -void ReadVector(char *filename, /*OUT*/numeric_t codes[MAXCODES]) { - FILE *fp = fopen(filename,"r"); - if (fp == NULL) { - fprintf(stderr, "Cannot read %s\n",filename); - exit(1); - } - int i; - for (i = 0; i < nCodes; i++) { - if (fscanf(fp,ScanNumericSpec,&codes[i]) != 1) { - fprintf(stderr,"Cannot read %d entry of %s\n",i+1,filename); - exit(1); - } - } - if (fclose(fp) != 0) { - fprintf(stderr, "Error reading %s\n",filename); - exit(1); - } -} - -distance_matrix_t *ReadDistanceMatrix(char *prefix) { - char buffer[BUFFER_SIZE]; - distance_matrix_t *dmat = (distance_matrix_t*)mymalloc(sizeof(distance_matrix_t)); - - if(strlen(prefix) > BUFFER_SIZE-20) { - fprintf(stderr,"Filename %s too long\n", prefix); - exit(1); - } - - strcpy(buffer, prefix); - strcat(buffer, ".distances"); - ReadMatrix(buffer, /*OUT*/dmat->distances, /*checkCodes*/true); - - strcpy(buffer, prefix); - strcat(buffer, ".inverses"); - ReadMatrix(buffer, /*OUT*/dmat->eigeninv, /*checkCodes*/false); - - strcpy(buffer, prefix); - strcat(buffer, ".eigenvalues"); - ReadVector(buffer, /*OUT*/dmat->eigenval); - - if(verbose>1) fprintf(stderr, "Read distance matrix from %s\n",prefix); - SetupDistanceMatrix(/*IN/OUT*/dmat); - return(dmat); -} - -void SetupDistanceMatrix(/*IN/OUT*/distance_matrix_t *dmat) { - /* Check that the eigenvalues and eigen-inverse are consistent with the - distance matrix and that the matrix is symmetric */ - int i,j,k; - for (i = 0; i < nCodes; i++) { - for (j = 0; j < nCodes; j++) { - 
if(fabs(dmat->distances[i][j]-dmat->distances[j][i]) > 1e-6) { - fprintf(stderr,"Distance matrix not symmetric for %d,%d: %f vs %f\n", - i+1,j+1, - dmat->distances[i][j], - dmat->distances[j][i]); - exit(1); - } - double total = 0.0; - for (k = 0; k < nCodes; k++) - total += dmat->eigenval[k] * dmat->eigeninv[k][i] * dmat->eigeninv[k][j]; - if(fabs(total - dmat->distances[i][j]) > 1e-6) { - fprintf(stderr,"Distance matrix entry %d,%d should be %f but eigen-representation gives %f\n", - i+1,j+1,dmat->distances[i][j],total); - exit(1); - } - } - } - - /* And compute eigentot */ - for (k = 0; k < nCodes; k++) { - dmat->eigentot[k] = 0.; - int j; - for (j = 0; j < nCodes; j++) - dmat->eigentot[k] += dmat->eigeninv[k][j]; - } - - /* And compute codeFreq */ - int code; - for(code = 0; code < nCodes; code++) { - for (k = 0; k < nCodes; k++) { - dmat->codeFreq[code][k] = dmat->eigeninv[k][code]; - } - } - /* And gapFreq */ - for(code = 0; code < nCodes; code++) { - double gapFreq = 0.0; - for (k = 0; k < nCodes; k++) - gapFreq += dmat->codeFreq[k][code]; - dmat->gapFreq[code] = gapFreq / nCodes; - } - - if(verbose>10) fprintf(stderr, "Made codeFreq\n"); -} - -nni_t ChooseNNI(profile_t *profiles[4], - /*OPTIONAL*/distance_matrix_t *dmat, - int nPos, int nConstraints, - /*OUT*/double criteria[3]) { - double d[6]; - CorrectedPairDistances(profiles, 4, dmat, nPos, /*OUT*/d); - double penalty[3]; /* indexed as nni_t */ - QuartetConstraintPenalties(profiles, nConstraints, /*OUT*/penalty); - criteria[ABvsCD] = d[qAB] + d[qCD] + penalty[ABvsCD]; - criteria[ACvsBD] = d[qAC] + d[qBD] + penalty[ACvsBD]; - criteria[ADvsBC] = d[qAD] + d[qBC] + penalty[ADvsBC]; - - nni_t choice = ABvsCD; - if (criteria[ACvsBD] < criteria[ABvsCD] && criteria[ACvsBD] <= criteria[ADvsBC]) { - choice = ACvsBD; - } else if (criteria[ADvsBC] < criteria[ABvsCD] && criteria[ADvsBC] <= criteria[ACvsBD]) { - choice = ADvsBC; - } - if (verbose > 1 && penalty[choice] > penalty[ABvsCD] + 1e-6) { - fprintf(stderr, 
"Worsen constraint: from %.3f to %.3f distance %.3f to %.3f: ", - penalty[ABvsCD], penalty[choice], - criteria[ABvsCD], choice == ACvsBD ? criteria[ACvsBD] : criteria[ADvsBC]); - int iC; - for (iC = 0; iC < nConstraints; iC++) { - double ppart[3]; - if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/ppart)) { - double old_penalty = ppart[ABvsCD]; - double new_penalty = ppart[choice]; - if (new_penalty > old_penalty + 1e-6) - fprintf(stderr, " %d (%d/%d %d/%d %d/%d %d/%d)", iC, - profiles[0]->nOn[iC], profiles[0]->nOff[iC], - profiles[1]->nOn[iC], profiles[1]->nOff[iC], - profiles[2]->nOn[iC], profiles[2]->nOff[iC], - profiles[3]->nOn[iC], profiles[3]->nOff[iC]); - } - } - fprintf(stderr,"\n"); - } - if (verbose > 3) - fprintf(stderr, "NNI scores ABvsCD %.5f ACvsBD %.5f ADvsBC %.5f choice %s\n", - criteria[ABvsCD], criteria[ACvsBD], criteria[ADvsBC], - choice == ABvsCD ? "AB|CD" : (choice == ACvsBD ? "AC|BD" : "AD|BC")); - return(choice); -} - -profile_t *PosteriorProfile(profile_t *p1, profile_t *p2, - double len1, double len2, - /*OPTIONAL*/transition_matrix_t *transmat, - rates_t *rates, - int nPos, int nConstraints) { - if (len1 < MLMinBranchLength) - len1 = MLMinBranchLength; - if (len2 < MLMinBranchLength) - len2 = MLMinBranchLength; - - int i,j,k; - profile_t *out = NewProfile(nPos, nConstraints); - for (i = 0; i < nPos; i++) { - out->codes[i] = NOCODE; - out->weights[i] = 1.0; - } - out->nVectors = nPos; - out->vectors = (numeric_t*)mymalloc(sizeof(numeric_t)*nCodes*out->nVectors); - for (i = 0; i < nCodes * out->nVectors; i++) out->vectors[i] = 0; - int iFreqOut = 0; - int iFreq1 = 0; - int iFreq2 = 0; - numeric_t *expeigenRates1 = NULL, *expeigenRates2 = NULL; - - if (transmat != NULL) { - expeigenRates1 = ExpEigenRates(len1, transmat, rates); - expeigenRates2 = ExpEigenRates(len2, transmat, rates); - } - - if (transmat == NULL) { /* Jukes-Cantor */ - assert(nCodes == 4); - - double *PSame1 = PSameVector(len1, rates); - double *PDiff1 = 
PDiffVector(PSame1, rates); - double *PSame2 = PSameVector(len2, rates); - double *PDiff2 = PDiffVector(PSame2, rates); - - numeric_t mix1[4], mix2[4]; - - for (i=0; i < nPos; i++) { - int iRate = rates->ratecat[i]; - double w1 = p1->weights[i]; - double w2 = p2->weights[i]; - int code1 = p1->codes[i]; - int code2 = p2->codes[i]; - numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); - numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); - - /* First try to store a simple profile */ - if (f1 == NULL && f2 == NULL) { - if (code1 == NOCODE && code2 == NOCODE) { - out->codes[i] = NOCODE; - out->weights[i] = 0.0; - continue; - } else if (code1 == NOCODE) { - /* Posterior(parent | character & gap, len1, len2) = Posterior(parent | character, len1) - = PSame() for matching characters and 1-PSame() for the rest - = (pSame - pDiff) * character + (1-(pSame-pDiff)) * gap - */ - out->codes[i] = code2; - out->weights[i] = w2 * (PSame2[iRate] - PDiff2[iRate]); - continue; - } else if (code2 == NOCODE) { - out->codes[i] = code1; - out->weights[i] = w1 * (PSame1[iRate] - PDiff1[iRate]); - continue; - } else if (code1 == code2) { - out->codes[i] = code1; - double f12code = (w1*PSame1[iRate] + (1-w1)*0.25) * (w2*PSame2[iRate] + (1-w2)*0.25); - double f12other = (w1*PDiff1[iRate] + (1-w1)*0.25) * (w2*PDiff2[iRate] + (1-w2)*0.25); - /* posterior probability of code1/code2 after scaling */ - double pcode = f12code/(f12code+3*f12other); - /* Now f = w * (code ? 
1 : 0) + (1-w) * 0.25, so to get pcode we need - fcode = 1/4 + w1*3/4 or w = (f-1/4)*4/3 - */ - out->weights[i] = (pcode - 0.25) * 4.0/3.0; - /* This can be zero because of numerical problems, I think */ - if (out->weights[i] < 1e-6) { - if (verbose > 1) - fprintf(stderr, "Replaced weight %f with %f from w1 %f w2 %f PSame %f %f f12code %f f12other %f\n", - out->weights[i], 1e-6, - w1, w2, - PSame1[iRate], PSame2[iRate], - f12code, f12other); - out->weights[i] = 1e-6; - } - continue; - } - } - /* if we did not compute a simple profile, then do the full computation and - store the full vector - */ - if (f1 == NULL) { - for (j = 0; j < 4; j++) - mix1[j] = (1-w1)*0.25; - if(code1 != NOCODE) - mix1[code1] += w1; - f1 = mix1; - } - if (f2 == NULL) { - for (j = 0; j < 4; j++) - mix2[j] = (1-w2)*0.25; - if(code2 != NOCODE) - mix2[code2] += w2; - f2 = mix2; - } - out->codes[i] = NOCODE; - out->weights[i] = 1.0; - numeric_t *f = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - double lkAB = 0; - for (j = 0; j < 4; j++) { - f[j] = (f1[j] * PSame1[iRate] + (1.0-f1[j]) * PDiff1[iRate]) - * (f2[j] * PSame2[iRate] + (1.0-f2[j]) * PDiff2[iRate]); - lkAB += f[j]; - } - double lkABInv = 1.0/lkAB; - for (j = 0; j < 4; j++) - f[j] *= lkABInv; - } - PSame1 = myfree(PSame1, sizeof(double) * rates->nRateCategories); - PSame2 = myfree(PSame2, sizeof(double) * rates->nRateCategories); - PDiff1 = myfree(PDiff1, sizeof(double) * rates->nRateCategories); - PDiff2 = myfree(PDiff2, sizeof(double) * rates->nRateCategories); - } else if (nCodes == 4) { /* matrix model on nucleotides */ - numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; - numeric_t f1mix[4], f2mix[4]; - - for (i=0; i < nPos; i++) { - if (p1->codes[i] == NOCODE && p2->codes[i] == NOCODE - && p1->weights[i] == 0 && p2->weights[i] == 0) { - /* aligning gap with gap -- just output a gap - out->codes[i] is already set to NOCODE so need not set that */ - out->weights[i] = 0; - continue; - } - int iRate = rates->ratecat[i]; - numeric_t *expeigen1 
= &expeigenRates1[iRate*4]; - numeric_t *expeigen2 = &expeigenRates2[iRate*4]; - numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); - numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); - numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - assert(fOut != NULL); - - if (f1 == NULL) { - f1 = &transmat->codeFreq[p1->codes[i]][0]; /* codeFreq includes an entry for NOCODE */ - double w = p1->weights[i]; - if (w > 0.0 && w < 1.0) { - for (j = 0; j < 4; j++) - f1mix[j] = w * f1[j] + (1.0-w) * fGap[j]; - f1 = f1mix; - } - } - if (f2 == NULL) { - f2 = &transmat->codeFreq[p2->codes[i]][0]; - double w = p2->weights[i]; - if (w > 0.0 && w < 1.0) { - for (j = 0; j < 4; j++) - f2mix[j] = w * f2[j] + (1.0-w) * fGap[j]; - f2 = f2mix; - } - } - numeric_t fMult1[4] ALIGNED; /* rotated1 * expeigen1 */ - numeric_t fMult2[4] ALIGNED; /* rotated2 * expeigen2 */ -#if 0 /* SSE3 is slower */ - vector_multiply(f1, expeigen1, 4, /*OUT*/fMult1); - vector_multiply(f2, expeigen2, 4, /*OUT*/fMult2); -#else - for (j = 0; j < 4; j++) { - fMult1[j] = f1[j]*expeigen1[j]; - fMult2[j] = f2[j]*expeigen2[j]; - } -#endif - numeric_t fPost[4] ALIGNED; /* in unrotated space */ - for (j = 0; j < 4; j++) { -#if 0 /* SSE3 is slower */ - fPost[j] = vector_dot_product_rot(fMult1, fMult2, &transmat->codeFreq[j][0], 4) - * transmat->statinv[j]; */ -#else - double out1 = 0; - double out2 = 0; - for (k = 0; k < 4; k++) { - out1 += fMult1[k] * transmat->codeFreq[j][k]; - out2 += fMult2[k] * transmat->codeFreq[j][k]; - } - fPost[j] = out1*out2*transmat->statinv[j]; -#endif - } - double fPostTot = 0; - for (j = 0; j < 4; j++) - fPostTot += fPost[j]; - assert(fPostTot > fPostTotalTolerance); - double fPostInv = 1.0/fPostTot; -#if 0 /* SSE3 is slower */ - vector_multiply_by(fPost, fPostInv, 4); -#else - for (j = 0; j < 4; j++) - fPost[j] *= fPostInv; -#endif - - /* and finally, divide by stat again & rotate to give the new frequencies */ - matrixt_by_vector4(transmat->eigeninvT, fPost, /*OUT*/fOut); - } /* end loop over 
position i */ - } else if (nCodes == 20) { /* matrix model on amino acids */ - numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; - numeric_t f1mix[20] ALIGNED; - numeric_t f2mix[20] ALIGNED; - - for (i=0; i < nPos; i++) { - if (p1->codes[i] == NOCODE && p2->codes[i] == NOCODE - && p1->weights[i] == 0 && p2->weights[i] == 0) { - /* aligning gap with gap -- just output a gap - out->codes[i] is already set to NOCODE so need not set that */ - out->weights[i] = 0; - continue; - } - int iRate = rates->ratecat[i]; - numeric_t *expeigen1 = &expeigenRates1[iRate*20]; - numeric_t *expeigen2 = &expeigenRates2[iRate*20]; - numeric_t *f1 = GET_FREQ(p1,i,/*IN/OUT*/iFreq1); - numeric_t *f2 = GET_FREQ(p2,i,/*IN/OUT*/iFreq2); - numeric_t *fOut = GET_FREQ(out,i,/*IN/OUT*/iFreqOut); - assert(fOut != NULL); - - if (f1 == NULL) { - f1 = &transmat->codeFreq[p1->codes[i]][0]; /* codeFreq includes an entry for NOCODE */ - double w = p1->weights[i]; - if (w > 0.0 && w < 1.0) { - for (j = 0; j < 20; j++) - f1mix[j] = w * f1[j] + (1.0-w) * fGap[j]; - f1 = f1mix; - } - } - if (f2 == NULL) { - f2 = &transmat->codeFreq[p2->codes[i]][0]; - double w = p2->weights[i]; - if (w > 0.0 && w < 1.0) { - for (j = 0; j < 20; j++) - f2mix[j] = w * f2[j] + (1.0-w) * fGap[j]; - f2 = f2mix; - } - } - numeric_t fMult1[20] ALIGNED; /* rotated1 * expeigen1 */ - numeric_t fMult2[20] ALIGNED; /* rotated2 * expeigen2 */ - vector_multiply(f1, expeigen1, 20, /*OUT*/fMult1); - vector_multiply(f2, expeigen2, 20, /*OUT*/fMult2); - numeric_t fPost[20] ALIGNED; /* in unrotated space */ - for (j = 0; j < 20; j++) { - numeric_t value = vector_dot_product_rot(fMult1, fMult2, &transmat->codeFreq[j][0], 20) - * transmat->statinv[j]; - /* Added this logic try to avoid rare numerical problems */ - fPost[j] = value >= 0 ? 
value : 0; - } - double fPostTot = vector_sum(fPost, 20); - assert(fPostTot > fPostTotalTolerance); - double fPostInv = 1.0/fPostTot; - vector_multiply_by(/*IN/OUT*/fPost, fPostInv, 20); - int ch = -1; /* the dominant character, if any */ - if (!exactML) { - for (j = 0; j < 20; j++) { - if (fPost[j] >= approxMLminf) { - ch = j; - break; - } - } - } - - /* now, see if we can use the approximation - fPost ~= (1 or 0) * w + nearP * (1-w) - to avoid rotating */ - double w = 0; - if (ch >= 0) { - w = (fPost[ch] - transmat->nearP[ch][ch]) / (1.0 - transmat->nearP[ch][ch]); - for (j = 0; j < 20; j++) { - if (j != ch) { - double fRough = (1.0-w) * transmat->nearP[ch][j]; - if (fRough < fPost[j] * approxMLminratio) { - ch = -1; /* give up on the approximation */ - break; - } - } - } - } - if (ch >= 0) { - nAAPosteriorRough++; - double wInvStat = w * transmat->statinv[ch]; - for (j = 0; j < 20; j++) - fOut[j] = wInvStat * transmat->codeFreq[ch][j] + (1.0-w) * transmat->nearFreq[ch][j]; - } else { - /* and finally, divide by stat again & rotate to give the new frequencies */ - nAAPosteriorExact++; - for (j = 0; j < 20; j++) - fOut[j] = vector_multiply_sum(fPost, &transmat->eigeninv[j][0], 20); - } - } /* end loop over position i */ - } else { - assert(0); /* illegal nCodes */ - } - - if (transmat != NULL) { - expeigenRates1 = myfree(expeigenRates1, sizeof(numeric_t) * rates->nRateCategories * nCodes); - expeigenRates2 = myfree(expeigenRates2, sizeof(numeric_t) * rates->nRateCategories * nCodes); - } - - /* Reallocate out->vectors to be the right size */ - out->nVectors = iFreqOut; - if (out->nVectors == 0) - out->vectors = (numeric_t*)myfree(out->vectors, sizeof(numeric_t)*nCodes*nPos); - else - out->vectors = (numeric_t*)myrealloc(out->vectors, - /*OLDSIZE*/sizeof(numeric_t)*nCodes*nPos, - /*NEWSIZE*/sizeof(numeric_t)*nCodes*out->nVectors, - /*copy*/true); /* try to save space */ - nProfileFreqAlloc += out->nVectors; - nProfileFreqAvoid += nPos - out->nVectors; - - /* 
compute total constraints */ - for (i = 0; i < nConstraints; i++) { - out->nOn[i] = p1->nOn[i] + p2->nOn[i]; - out->nOff[i] = p1->nOff[i] + p2->nOff[i]; - } - nPosteriorCompute++; - return(out); -} - -double *PSameVector(double length, rates_t *rates) { - double *pSame = mymalloc(sizeof(double) * rates->nRateCategories); - int iRate; - for (iRate = 0; iRate < rates->nRateCategories; iRate++) - pSame[iRate] = 0.25 + 0.75 * exp((-4.0/3.0) * fabs(length*rates->rates[iRate])); - return(pSame); -} - -double *PDiffVector(double *pSame, rates_t *rates) { - double *pDiff = mymalloc(sizeof(double) * rates->nRateCategories); - int iRate; - for (iRate = 0; iRate < rates->nRateCategories; iRate++) - pDiff[iRate] = (1.0 - pSame[iRate])/3.0; - return(pDiff); -} - -numeric_t *ExpEigenRates(double length, transition_matrix_t *transmat, rates_t *rates) { - numeric_t *expeigen = mymalloc(sizeof(numeric_t) * nCodes * rates->nRateCategories); - int iRate, j; - for (iRate = 0; iRate < rates->nRateCategories; iRate++) { - for (j = 0; j < nCodes; j++) { - double relLen = length * rates->rates[iRate]; - /* very short branch lengths lead to numerical problems so prevent them */ - if (relLen < MLMinRelBranchLength) - relLen = MLMinRelBranchLength; - expeigen[iRate*nCodes + j] = exp(relLen * transmat->eigenval[j]); - } - } - return(expeigen); -} - -double PairLogLk(profile_t *pA, profile_t *pB, double length, int nPos, - /*OPTIONAL*/transition_matrix_t *transmat, - rates_t *rates, - /*OPTIONAL IN/OUT*/double *site_likelihoods) { - double lk = 1.0; - double loglk = 0.0; /* stores underflow of lk during the loop over positions */ - int i,j; - assert(rates != NULL && rates->nRateCategories > 0); - numeric_t *expeigenRates = NULL; - if (transmat != NULL) - expeigenRates = ExpEigenRates(length, transmat, rates); - - if (transmat == NULL) { /* Jukes-Cantor */ - assert (nCodes == 4); - double *pSame = PSameVector(length, rates); - double *pDiff = PDiffVector(pSame, rates); - - int iFreqA = 0; - int 
iFreqB = 0; - for (i = 0; i < nPos; i++) { - int iRate = rates->ratecat[i]; - double wA = pA->weights[i]; - double wB = pB->weights[i]; - int codeA = pA->codes[i]; - int codeB = pB->codes[i]; - numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); - numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); - double lkAB = 0; - - if (fA == NULL && fB == NULL) { - if (codeA == NOCODE) { /* A is all gaps */ - /* gap to gap is sum(j) 0.25 * (0.25 * pSame + 0.75 * pDiff) = sum(i) 0.25*0.25 = 0.25 - gap to any character gives the same result - */ - lkAB = 0.25; - } else if (codeB == NOCODE) { /* B is all gaps */ - lkAB = 0.25; - } else if (codeA == codeB) { /* A and B match */ - lkAB = pSame[iRate] * wA*wB + 0.25 * (1-wA*wB); - } else { /* codeA != codeB */ - lkAB = pDiff[iRate] * wA*wB + 0.25 * (1-wA*wB); - } - } else if (fA == NULL) { - /* Compare codeA to profile of B */ - if (codeA == NOCODE) - lkAB = 0.25; - else - lkAB = wA * (pDiff[iRate] + fB[codeA] * (pSame[iRate]-pDiff[iRate])) + (1.0-wA) * 0.25; - /* because lkAB = wA * P(codeA->B) + (1-wA) * 0.25 - P(codeA -> B) = sum(j) P(B==j) * (j==codeA ? 
pSame : pDiff) - = sum(j) P(B==j) * pDiff + - = pDiff + P(B==codeA) * (pSame-pDiff) - */ - } else if (fB == NULL) { /* Compare codeB to profile of A */ - if (codeB == NOCODE) - lkAB = 0.25; - else - lkAB = wB * (pDiff[iRate] + fA[codeB] * (pSame[iRate]-pDiff[iRate])) + (1.0-wB) * 0.25; - } else { /* both are full profiles */ - for (j = 0; j < 4; j++) - lkAB += fB[j] * (fA[j] * pSame[iRate] + (1-fA[j])* pDiff[iRate]); /* P(A|B) */ - } - assert(lkAB > 0); - lk *= lkAB; - while (lk < LkUnderflow) { - lk *= LkUnderflowInv; - loglk -= LogLkUnderflow; - } - if (site_likelihoods != NULL) - site_likelihoods[i] *= lkAB; - } - pSame = myfree(pSame, sizeof(double) * rates->nRateCategories); - pDiff = myfree(pDiff, sizeof(double) * rates->nRateCategories); - } else if (nCodes == 4) { /* matrix model on nucleotides */ - int iFreqA = 0; - int iFreqB = 0; - numeric_t fAmix[4], fBmix[4]; - numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; - - for (i = 0; i < nPos; i++) { - int iRate = rates->ratecat[i]; - numeric_t *expeigen = &expeigenRates[iRate*4]; - double wA = pA->weights[i]; - double wB = pB->weights[i]; - if (wA == 0 && wB == 0 && pA->codes[i] == NOCODE && pB->codes[i] == NOCODE) { - /* Likelihood of A vs B is 1, so nothing changes - Do not need to advance iFreqA or iFreqB */ - continue; - } - numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); - numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); - if (fA == NULL) - fA = &transmat->codeFreq[pA->codes[i]][0]; - if (wA > 0.0 && wA < 1.0) { - for (j = 0; j < 4; j++) - fAmix[j] = wA*fA[j] + (1.0-wA)*fGap[j]; - fA = fAmix; - } - if (fB == NULL) - fB = &transmat->codeFreq[pB->codes[i]][0]; - if (wB > 0.0 && wB < 1.0) { - for (j = 0; j < 4; j++) - fBmix[j] = wB*fB[j] + (1.0-wB)*fGap[j]; - fB = fBmix; - } - /* SSE3 instructions do not speed this step up: - numeric_t lkAB = vector_multiply3_sum(expeigen, fA, fB); */ - // dsp this is where check for <=0 was added in 2.1.1.LG - double lkAB = 0; - for (j = 0; j < 4; j++) - lkAB += 
expeigen[j]*fA[j]*fB[j]; - assert(lkAB > 0); - if (site_likelihoods != NULL) - site_likelihoods[i] *= lkAB; - lk *= lkAB; - while (lk < LkUnderflow) { - lk *= LkUnderflowInv; - loglk -= LogLkUnderflow; - } - while (lk > LkUnderflowInv) { - lk *= LkUnderflow; - loglk += LogLkUnderflow; - } - } - } else if (nCodes == 20) { /* matrix model on amino acids */ - int iFreqA = 0; - int iFreqB = 0; - numeric_t fAmix[20], fBmix[20]; - numeric_t *fGap = &transmat->codeFreq[NOCODE][0]; - - for (i = 0; i < nPos; i++) { - int iRate = rates->ratecat[i]; - numeric_t *expeigen = &expeigenRates[iRate*20]; - double wA = pA->weights[i]; - double wB = pB->weights[i]; - if (wA == 0 && wB == 0 && pA->codes[i] == NOCODE && pB->codes[i] == NOCODE) { - /* Likelihood of A vs B is 1, so nothing changes - Do not need to advance iFreqA or iFreqB */ - continue; - } - numeric_t *fA = GET_FREQ(pA,i,/*IN/OUT*/iFreqA); - numeric_t *fB = GET_FREQ(pB,i,/*IN/OUT*/iFreqB); - if (fA == NULL) - fA = &transmat->codeFreq[pA->codes[i]][0]; - if (wA > 0.0 && wA < 1.0) { - for (j = 0; j < 20; j++) - fAmix[j] = wA*fA[j] + (1.0-wA)*fGap[j]; - fA = fAmix; - } - if (fB == NULL) - fB = &transmat->codeFreq[pB->codes[i]][0]; - if (wB > 0.0 && wB < 1.0) { - for (j = 0; j < 20; j++) - fBmix[j] = wB*fB[j] + (1.0-wB)*fGap[j]; - fB = fBmix; - } - numeric_t lkAB = vector_multiply3_sum(expeigen, fA, fB, 20); - if (!(lkAB > 0)) { - /* If this happens, it indicates a numerical problem that needs to be addressed elsewhere, - so report all the details */ - fprintf(stderr, "# FastTree.c::PairLogLk -- numerical problem!\n"); - fprintf(stderr, "# This block is intended for loading into R\n"); - - fprintf(stderr, "lkAB = %.8g\n", lkAB); - fprintf(stderr, "Branch_length= %.8g\nalignment_position=%d\nnCodes=%d\nrate_category=%d\nrate=%.8g\n", - length, i, nCodes, iRate, rates->rates[iRate]); - fprintf(stderr, "wA=%.8g\nwB=%.8g\n", wA, wB); - fprintf(stderr, "codeA = %d\ncodeB = %d\n", pA->codes[i], pB->codes[i]); - - fprintf(stderr, 
"fA = c("); - for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", fA[j]); - fprintf(stderr,")\n"); - - fprintf(stderr, "fB = c("); - for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", fB[j]); - fprintf(stderr,")\n"); - - fprintf(stderr, "stat = c("); - for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", transmat->stat[j]); - fprintf(stderr,")\n"); - - fprintf(stderr, "eigenval = c("); - for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", transmat->eigenval[j]); - fprintf(stderr,")\n"); - - fprintf(stderr, "expeigen = c("); - for (j = 0; j < nCodes; j++) fprintf(stderr, "%s %.8g", j==0?"":",", expeigen[j]); - fprintf(stderr,")\n"); - - int k; - fprintf(stderr, "codeFreq = c("); - for (j = 0; j < nCodes; j++) for(k = 0; k < nCodes; k++) fprintf(stderr, "%s %.8g", j==0 && k==0?"":",", - transmat->codeFreq[j][k]); - fprintf(stderr,")\n"); - - fprintf(stderr, "eigeninv = c("); - for (j = 0; j < nCodes; j++) for(k = 0; k < nCodes; k++) fprintf(stderr, "%s %.8g", j==0 && k==0?"":",", - transmat->eigeninv[j][k]); - fprintf(stderr,")\n"); - - fprintf(stderr, "# Transform into matrices and compute un-rotated vectors for profiles A and B\n"); - fprintf(stderr, "codeFreq = matrix(codeFreq,nrow=20);\n"); - fprintf(stderr, "eigeninv = matrix(eigeninv,nrow=20);\n"); - fputs("unrotA = stat * (eigeninv %*% fA)\n", stderr); - fputs("unrotB = stat * (eigeninv %*% fB)\n", stderr); - fprintf(stderr,"# End of R block\n"); - } - assert(lkAB > 0); - if (site_likelihoods != NULL) - site_likelihoods[i] *= lkAB; - lk *= lkAB; - while (lk < LkUnderflow) { - lk *= LkUnderflowInv; - loglk -= LogLkUnderflow; - } - while (lk > LkUnderflowInv) { - lk *= LkUnderflow; - loglk += LogLkUnderflow; - } - } - } else { - assert(0); /* illegal nCodes */ - } - if (transmat != NULL) - expeigenRates = myfree(expeigenRates, sizeof(numeric_t) * rates->nRateCategories * 20); - loglk += log(lk); - nLkCompute++; - return(loglk); -} - -double 
MLQuartetLogLk(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN*/double branch_lengths[5], - /*OPTIONAL OUT*/double *site_likelihoods) { - profile_t *pAB = PosteriorProfile(pA, pB, - branch_lengths[0], branch_lengths[1], - transmat, - rates, - nPos, /*nConstraints*/0); - profile_t *pCD = PosteriorProfile(pC, pD, - branch_lengths[2], branch_lengths[3], - transmat, - rates, - nPos, /*nConstraints*/0); - if (site_likelihoods != NULL) { - int i; - for (i = 0; i < nPos; i++) - site_likelihoods[i] = 1.0; - } - /* Roughly, P(A,B,C,D) = P(A) P(B|A) P(D|C) P(AB | CD) */ - double loglk = PairLogLk(pA, pB, branch_lengths[0]+branch_lengths[1], - nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods) - + PairLogLk(pC, pD, branch_lengths[2]+branch_lengths[3], - nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods) - + PairLogLk(pAB, pCD, branch_lengths[4], - nPos, transmat, rates, /*OPTIONAL IN/OUT*/site_likelihoods); - pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); - pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); - return(loglk); -} - -double PairNegLogLk(double x, void *data) { - quartet_opt_t *qo = (quartet_opt_t *)data; - assert(qo != NULL); - assert(qo->pair1 != NULL && qo->pair2 != NULL); - qo->nEval++; - double loglk = PairLogLk(qo->pair1, qo->pair2, x, qo->nPos, qo->transmat, qo->rates, /*site_lk*/NULL); - assert(loglk < 1e100); - if (verbose > 5) - fprintf(stderr, "PairLogLk(%.4f) = %.4f\n", x, loglk); - return(-loglk); -} - -double MLQuartetOptimize(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN/OUT*/double branch_lengths[5], - /*OPTIONAL OUT*/bool *pStarTest, - /*OPTIONAL OUT*/double *site_likelihoods) { - int j; - double start_length[5]; - for (j = 0; j < 5; j++) { - start_length[j] = branch_lengths[j]; - if (branch_lengths[j] < MLMinBranchLength) - branch_lengths[j] = 
MLMinBranchLength; - } - quartet_opt_t qopt = { nPos, transmat, rates, /*nEval*/0, - /*pair1*/NULL, /*pair2*/NULL }; - double f2x, negloglk; - - if (pStarTest != NULL) - *pStarTest = false; - - /* First optimize internal branch, then branch to A, B, C, D, in turn - May use star test to quit after internal branch - */ - profile_t *pAB = PosteriorProfile(pA, pB, - branch_lengths[LEN_A], branch_lengths[LEN_B], - transmat, rates, nPos, /*nConstraints*/0); - profile_t *pCD = PosteriorProfile(pC, pD, - branch_lengths[LEN_C], branch_lengths[LEN_D], - transmat, rates, nPos, /*nConstraints*/0); - qopt.pair1 = pAB; - qopt.pair2 = pCD; - branch_lengths[LEN_I] = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/branch_lengths[LEN_I], - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - - if (pStarTest != NULL) { - assert(site_likelihoods == NULL); - double loglkStar = -PairNegLogLk(MLMinBranchLength, &qopt); - if (loglkStar < -negloglk - closeLogLkLimit) { - *pStarTest = true; - double off = PairLogLk(pA, pB, - branch_lengths[LEN_A] + branch_lengths[LEN_B], - qopt.nPos, qopt.transmat, qopt.rates, /*site_lk*/NULL) - + PairLogLk(pC, pD, - branch_lengths[LEN_C] + branch_lengths[LEN_D], - qopt.nPos, qopt.transmat, qopt.rates, /*site_lk*/NULL); - pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); - pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); - return (-negloglk + off); - } - } - pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); - profile_t *pBCD = PosteriorProfile(pB, pCD, - branch_lengths[LEN_B], branch_lengths[LEN_I], - transmat, rates, nPos, /*nConstraints*/0); - qopt.pair1 = pA; - qopt.pair2 = pBCD; - branch_lengths[LEN_A] = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/branch_lengths[LEN_A], - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - pBCD = FreeProfile(pBCD, 
nPos, /*nConstraints*/0); - profile_t *pACD = PosteriorProfile(pA, pCD, - branch_lengths[LEN_A], branch_lengths[LEN_I], - transmat, rates, nPos, /*nConstraints*/0); - qopt.pair1 = pB; - qopt.pair2 = pACD; - branch_lengths[LEN_B] = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/branch_lengths[LEN_B], - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - pACD = FreeProfile(pACD, nPos, /*nConstraints*/0); - pCD = FreeProfile(pCD, nPos, /*nConstraints*/0); - pAB = PosteriorProfile(pA, pB, - branch_lengths[LEN_A], branch_lengths[LEN_B], - transmat, rates, nPos, /*nConstraints*/0); - profile_t *pABD = PosteriorProfile(pAB, pD, - branch_lengths[LEN_I], branch_lengths[LEN_D], - transmat, rates, nPos, /*nConstraints*/0); - qopt.pair1 = pC; - qopt.pair2 = pABD; - branch_lengths[LEN_C] = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/branch_lengths[LEN_C], - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - pABD = FreeProfile(pABD, nPos, /*nConstraints*/0); - profile_t *pABC = PosteriorProfile(pAB, pC, - branch_lengths[LEN_I], branch_lengths[LEN_C], - transmat, rates, nPos, /*nConstraints*/0); - qopt.pair1 = pD; - qopt.pair2 = pABC; - branch_lengths[LEN_D] = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/branch_lengths[LEN_D], - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - - /* Compute the total quartet likelihood - PairLogLk(ABC,D) + PairLogLk(AB,C) + PairLogLk(A,B) - */ - double loglkABCvsD = -negloglk; - if (site_likelihoods) { - for (j = 0; j < nPos; j++) - site_likelihoods[j] = 1.0; - PairLogLk(pABC, pD, branch_lengths[LEN_D], - qopt.nPos, qopt.transmat, qopt.rates, /*IN/OUT*/site_likelihoods); - } - double quartetloglk = loglkABCvsD - + PairLogLk(pAB, pC, 
branch_lengths[LEN_I] + branch_lengths[LEN_C], - qopt.nPos, qopt.transmat, qopt.rates, - /*IN/OUT*/site_likelihoods) - + PairLogLk(pA, pB, branch_lengths[LEN_A] + branch_lengths[LEN_B], - qopt.nPos, qopt.transmat, qopt.rates, - /*IN/OUT*/site_likelihoods); - - pABC = FreeProfile(pABC, nPos, /*nConstraints*/0); - pAB = FreeProfile(pAB, nPos, /*nConstraints*/0); - - if (verbose > 3) { - double loglkStart = MLQuartetLogLk(pA, pB, pC, pD, nPos, transmat, rates, start_length, /*site_lk*/NULL); - fprintf(stderr, "Optimize loglk from %.5f to %.5f eval %d lengths from\n" - " %.5f %.5f %.5f %.5f %.5f to\n" - " %.5f %.5f %.5f %.5f %.5f\n", - loglkStart, quartetloglk, qopt.nEval, - start_length[0], start_length[1], start_length[2], start_length[3], start_length[4], - branch_lengths[0], branch_lengths[1], branch_lengths[2], branch_lengths[3], branch_lengths[4]); - } - return(quartetloglk); -} - -nni_t MLQuartetNNI(profile_t *profiles[4], - /*OPTIONAL*/transition_matrix_t *transmat, - rates_t *rates, - int nPos, int nConstraints, - /*OUT*/double criteria[3], /* The three potential quartet log-likelihoods */ - /*IN/OUT*/numeric_t len[5], - bool bFast) -{ - int i; - double lenABvsCD[5] = {len[LEN_A], len[LEN_B], len[LEN_C], len[LEN_D], len[LEN_I]}; - double lenACvsBD[5] = {len[LEN_A], len[LEN_C], len[LEN_B], len[LEN_D], len[LEN_I]}; /* Swap B & C */ - double lenADvsBC[5] = {len[LEN_A], len[LEN_D], len[LEN_C], len[LEN_B], len[LEN_I]}; /* Swap B & D */ - bool bConsiderAC = true; - bool bConsiderAD = true; - int iRound; - int nRounds = mlAccuracy < 2 ? 
2 : mlAccuracy; - double penalty[3]; - QuartetConstraintPenalties(profiles, nConstraints, /*OUT*/penalty); - if (penalty[ABvsCD] > penalty[ACvsBD] || penalty[ABvsCD] > penalty[ADvsBC]) - bFast = false; -#ifdef OPENMP - bFast = false; /* turn off star topology test */ -#endif - - for (iRound = 0; iRound < nRounds; iRound++) { - bool bStarTest = false; - { -#ifdef OPENMP - #pragma omp parallel - #pragma omp sections -#endif - { -#ifdef OPENMP - #pragma omp section -#endif - { - criteria[ABvsCD] = MLQuartetOptimize(profiles[0], profiles[1], profiles[2], profiles[3], - nPos, transmat, rates, - /*IN/OUT*/lenABvsCD, - bFast ? &bStarTest : NULL, - /*site_likelihoods*/NULL) - - penalty[ABvsCD]; /* subtract penalty b/c we are trying to maximize log lk */ - } - -#ifdef OPENMP - #pragma omp section -#else - if (bStarTest) { - nStarTests++; - criteria[ACvsBD] = -1e20; - criteria[ADvsBC] = -1e20; - len[LEN_I] = lenABvsCD[LEN_I]; - return(ABvsCD); - } -#endif - { - if (bConsiderAC) - criteria[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], - nPos, transmat, rates, - /*IN/OUT*/lenACvsBD, NULL, /*site_likelihoods*/NULL) - - penalty[ACvsBD]; - } - -#ifdef OPENMP - #pragma omp section -#endif - { - if (bConsiderAD) - criteria[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], - nPos, transmat, rates, - /*IN/OUT*/lenADvsBC, NULL, /*site_likelihoods*/NULL) - - penalty[ADvsBC]; - } - } - } /* end parallel sections */ - if (mlAccuracy < 2) { - /* If clearly worse then ABvsCD, or have short internal branch length and worse, then - give up */ - if (criteria[ACvsBD] < criteria[ABvsCD] - closeLogLkLimit - || (lenACvsBD[LEN_I] <= 2.0*MLMinBranchLength && criteria[ACvsBD] < criteria[ABvsCD])) - bConsiderAC = false; - if (criteria[ADvsBC] < criteria[ABvsCD] - closeLogLkLimit - || (lenADvsBC[LEN_I] <= 2.0*MLMinBranchLength && criteria[ADvsBC] < criteria[ABvsCD])) - bConsiderAD = false; - if (!bConsiderAC && !bConsiderAD) - break; - /* 
If clearly better than either alternative, then give up - (Comparison is probably biased in favor of ABvsCD anyway) */ - if (criteria[ACvsBD] > criteria[ABvsCD] + closeLogLkLimit - && criteria[ACvsBD] > criteria[ADvsBC] + closeLogLkLimit) - break; - if (criteria[ADvsBC] > criteria[ABvsCD] + closeLogLkLimit - && criteria[ADvsBC] > criteria[ACvsBD] + closeLogLkLimit) - break; - } - } /* end loop over rounds */ - - if (verbose > 2) { - fprintf(stderr, "Optimized quartet for %d rounds: ABvsCD %.5f ACvsBD %.5f ADvsBC %.5f\n", - iRound, criteria[ABvsCD], criteria[ACvsBD], criteria[ADvsBC]); - } - if (criteria[ACvsBD] > criteria[ABvsCD] && criteria[ACvsBD] > criteria[ADvsBC]) { - for (i = 0; i < 5; i++) len[i] = lenACvsBD[i]; - return(ACvsBD); - } else if (criteria[ADvsBC] > criteria[ABvsCD] && criteria[ADvsBC] > criteria[ACvsBD]) { - for (i = 0; i < 5; i++) len[i] = lenADvsBC[i]; - return(ADvsBC); - } else { - for (i = 0; i < 5; i++) len[i] = lenABvsCD[i]; - return(ABvsCD); - } -} - -double TreeLength(/*IN/OUT*/NJ_t *NJ, bool recomputeProfiles) { - if (recomputeProfiles) { - traversal_t traversal2 = InitTraversal(NJ); - int j = NJ->root; - while((j = TraversePostorder(j, NJ, /*IN/OUT*/traversal2, /*pUp*/NULL)) >= 0) { - /* nothing to do for leaves or root */ - if (j >= NJ->nSeq && j != NJ->root) - SetProfile(/*IN/OUT*/NJ, j, /*noweight*/-1.0); - } - traversal2 = FreeTraversal(traversal2,NJ); - } - UpdateBranchLengths(/*IN/OUT*/NJ); - double total_len = 0; - int iNode; - for (iNode = 0; iNode < NJ->maxnode; iNode++) - total_len += NJ->branchlength[iNode]; - return(total_len); -} - -double TreeLogLk(/*IN*/NJ_t *NJ, /*OPTIONAL OUT*/double *site_loglk) { - int i; - if (NJ->nSeq < 2) - return(0.0); - double loglk = 0.0; - double *site_likelihood = NULL; - if (site_loglk != NULL) { - site_likelihood = mymalloc(sizeof(double)*NJ->nPos); - for (i = 0; i < NJ->nPos; i++) { - site_likelihood[i] = 1.0; - site_loglk[i] = 0.0; - } - } - traversal_t traversal = InitTraversal(NJ); - 
int node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - int nChild = NJ->child[node].nChild; - if (nChild == 0) - continue; - assert(nChild >= 2); - int *children = NJ->child[node].child; - double loglkchild = PairLogLk(NJ->profiles[children[0]], NJ->profiles[children[1]], - NJ->branchlength[children[0]]+NJ->branchlength[children[1]], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/site_likelihood); - loglk += loglkchild; - if (site_likelihood != NULL) { - /* prevent underflows */ - for (i = 0; i < NJ->nPos; i++) { - while(site_likelihood[i] < LkUnderflow) { - site_likelihood[i] *= LkUnderflowInv; - site_loglk[i] -= LogLkUnderflow; - } - } - } - if (verbose > 2) - fprintf(stderr, "At %d: LogLk(%d:%.4f,%d:%.4f) = %.3f\n", - node, - children[0], NJ->branchlength[children[0]], - children[1], NJ->branchlength[children[1]], - loglkchild); - if (NJ->child[node].nChild == 3) { - assert(node == NJ->root); - /* Infer the common parent of the 1st two to define the third... 
*/ - profile_t *pAB = PosteriorProfile(NJ->profiles[children[0]], - NJ->profiles[children[1]], - NJ->branchlength[children[0]], - NJ->branchlength[children[1]], - NJ->transmat, &NJ->rates, - NJ->nPos, /*nConstraints*/0); - double loglkup = PairLogLk(pAB, NJ->profiles[children[2]], - NJ->branchlength[children[2]], - NJ->nPos, NJ->transmat, &NJ->rates, - /*IN/OUT*/site_likelihood); - loglk += loglkup; - if (verbose > 2) - fprintf(stderr, "At root %d: LogLk((%d/%d),%d:%.3f) = %.3f\n", - node, children[0], children[1], children[2], - NJ->branchlength[children[2]], - loglkup); - pAB = FreeProfile(pAB, NJ->nPos, NJ->nConstraints); - } - } - traversal = FreeTraversal(traversal,NJ); - if (site_likelihood != NULL) { - for (i = 0; i < NJ->nPos; i++) { - site_loglk[i] += log(site_likelihood[i]); - } - site_likelihood = myfree(site_likelihood, sizeof(double)*NJ->nPos); - } - - /* For Jukes-Cantor, with a tree of size 4, if the children of the root are - (A,B), C, and D, then - P(ABCD) = P(A) P(B|A) P(C|AB) P(D|ABC) - - Above we compute P(B|A) P(C|AB) P(D|ABC) -- note P(B|A) is at the child of root - and P(C|AB) P(D|ABC) is at root. - - Similarly if the children of the root are C, D, and (A,B), then - P(ABCD) = P(C|D) P(A|B) P(AB|CD) P(D), and above we compute that except for P(D) - - So we need to multiply by P(A) = 0.25, so we pay log(4) at each position - (if ungapped). Each gapped position in any sequence reduces the payment by log(4) - - For JTT or GTR, we are computing P(A & B) and the posterior profiles are scaled to take - the prior into account, so we do not need any correction. - codeFreq[NOCODE] is scaled x higher so that P(-) = 1 not P(-)=1/nCodes, so gaps - do not need to be corrected either. 
- */ - - if (nCodes == 4 && NJ->transmat == NULL) { - int nGaps = 0; - double logNCodes = log((double)nCodes); - for (i = 0; i < NJ->nPos; i++) { - int nGapsThisPos = 0; - for (node = 0; node < NJ->nSeq; node++) { - unsigned char *codes = NJ->profiles[node]->codes; - if (codes[i] == NOCODE) - nGapsThisPos++; - } - nGaps += nGapsThisPos; - if (site_loglk != NULL) { - site_loglk[i] += nGapsThisPos * logNCodes; - if (nCodes == 4 && NJ->transmat == NULL) - site_loglk[i] -= logNCodes; - } - } - loglk -= NJ->nPos * logNCodes; - loglk += nGaps * logNCodes; /* do not pay for gaps -- only Jukes-Cantor */ - } - return(loglk); -} - -void SetMLGtr(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL IN*/double *freq_in, /*OPTIONAL WRITE*/FILE *fpLog) { - int i; - assert(nCodes==4); - gtr_opt_t gtr; - gtr.NJ = NJ; - gtr.fpLog = fpLog; - if (freq_in != NULL) { - for (i=0; i<4; i++) - gtr.freq[i]=freq_in[i]; - } else { - /* n[] and sum were int in FastTree 2.1.9 and earlier -- this - caused gtr analyses to fail on analyses with >2e9 positions */ - long n[4] = {1,1,1,1}; /* pseudocounts */ - for (i=0; i<NJ->nSeq; i++) { - unsigned char *codes = NJ->profiles[i]->codes; - int iPos; - for (iPos=0; iPos<NJ->nPos; iPos++) - if (codes[iPos] < 4) - n[codes[iPos]]++; - } - long sum = n[0]+n[1]+n[2]+n[3]; - for (i=0; i<4; i++) - gtr.freq[i] = n[i]/(double)sum; - } - for (i=0; i<6; i++) - gtr.rates[i] = 1.0; - int nRounds = mlAccuracy < 2 ? 
2 : mlAccuracy; - for (i = 0; i < nRounds; i++) { - for (gtr.iRate = 0; gtr.iRate < 6; gtr.iRate++) { - ProgressReport("Optimizing GTR model, step %d of %d", i*6+gtr.iRate+1, 12, 0, 0); - double negloglk, f2x; - gtr.rates[gtr.iRate] = onedimenmin(/*xmin*/0.05, - /*xguess*/gtr.rates[gtr.iRate], - /*xmax*/20.0, - GTRNegLogLk, - /*data*/>r, - /*ftol*/0.001, - /*atol*/0.0001, - /*OUT*/&negloglk, - /*OUT*/&f2x); - } - } - /* normalize gtr so last rate is 1 -- specifying that rate separately is useful for optimization only */ - for (i = 0; i < 5; i++) - gtr.rates[i] /= gtr.rates[5]; - gtr.rates[5] = 1.0; - if (verbose) { - fprintf(stderr, "GTR Frequencies: %.4f %.4f %.4f %.4f\n", gtr.freq[0], gtr.freq[1], gtr.freq[2], gtr.freq[3]); - fprintf(stderr, "GTR rates(ac ag at cg ct gt) %.4f %.4f %.4f %.4f %.4f %.4f\n", - gtr.rates[0],gtr.rates[1],gtr.rates[2],gtr.rates[3],gtr.rates[4],gtr.rates[5]); - } - if (fpLog != NULL) { - fprintf(fpLog, "GTRFreq\t%.4f\t%.4f\t%.4f\t%.4f\n", gtr.freq[0], gtr.freq[1], gtr.freq[2], gtr.freq[3]); - fprintf(fpLog, "GTRRates\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n", - gtr.rates[0],gtr.rates[1],gtr.rates[2],gtr.rates[3],gtr.rates[4],gtr.rates[5]); - } - myfree(NJ->transmat, sizeof(transition_matrix_t)); - NJ->transmat = CreateGTR(gtr.rates, gtr.freq); - RecomputeMLProfiles(/*IN/OUT*/NJ); - OptimizeAllBranchLengths(/*IN/OUT*/NJ); -} - -double GTRNegLogLk(double x, void *data) { - - gtr_opt_t *gtr = (gtr_opt_t*)data; - assert(nCodes == 4); - assert(gtr->NJ != NULL); - assert(gtr->iRate >= 0 && gtr->iRate < 6); - assert(x > 0); - transition_matrix_t *old = gtr->NJ->transmat; - double rates[6]; - int i; - for (i = 0; i < 6; i++) - rates[i] = gtr->rates[i]; - rates[gtr->iRate] = x; - - FILE *fpLog = gtr->fpLog; - if (fpLog) - fprintf(fpLog, "GTR_Opt\tfreq %.5f %.5f %.5f %.5f rates %.5f %.5f %.5f %.5f %.5f %.5f\n", - gtr->freq[0], gtr->freq[1], gtr->freq[2], gtr->freq[3], - rates[0], rates[1], rates[2], rates[3], rates[4], rates[5]); - - 
gtr->NJ->transmat = CreateGTR(rates, gtr->freq); - RecomputeMLProfiles(/*IN/OUT*/gtr->NJ); - double loglk = TreeLogLk(gtr->NJ, /*site_loglk*/NULL); - myfree(gtr->NJ->transmat, sizeof(transition_matrix_t)); - gtr->NJ->transmat = old; - /* Do not recompute profiles -- assume the caller will do that */ - if (verbose > 2) - fprintf(stderr, "GTR LogLk(%.5f %.5f %.5f %.5f %.5f %.5f) = %f\n", - rates[0], rates[1], rates[2], rates[3], rates[4], rates[5], loglk); - if (fpLog) - fprintf(fpLog, "GTR_Opt\tGTR LogLk(%.5f %.5f %.5f %.5f %.5f %.5f) = %f\n", - rates[0], rates[1], rates[2], rates[3], rates[4], rates[5], loglk); - return(-loglk); -} - -/* Caller must free the resulting vector of n rates */ -numeric_t *MLSiteRates(int nRateCategories) { - /* Even spacing from 1/nRate to nRate */ - double logNCat = log((double)nRateCategories); - double logMinRate = -logNCat; - double logMaxRate = logNCat; - double logd = (logMaxRate-logMinRate)/(double)(nRateCategories-1); - - numeric_t *rates = mymalloc(sizeof(numeric_t)*nRateCategories); - int i; - for (i = 0; i < nRateCategories; i++) - rates[i] = exp(logMinRate + logd*(double)i); - return(rates); -} - -double *MLSiteLikelihoodsByRate(/*IN*/NJ_t *NJ, /*IN*/numeric_t *rates, int nRateCategories) { - double *site_loglk = mymalloc(sizeof(double)*NJ->nPos*nRateCategories); - - /* save the original rates */ - assert(NJ->rates.nRateCategories > 0); - numeric_t *oldRates = NJ->rates.rates; - NJ->rates.rates = mymalloc(sizeof(numeric_t) * NJ->rates.nRateCategories); - - /* Compute site likelihood for each rate */ - int iPos; - int iRate; - for (iRate = 0; iRate < nRateCategories; iRate++) { - int i; - for (i = 0; i < NJ->rates.nRateCategories; i++) - NJ->rates.rates[i] = rates[iRate]; - RecomputeMLProfiles(/*IN/OUT*/NJ); - double loglk = TreeLogLk(NJ, /*OUT*/&site_loglk[NJ->nPos*iRate]); - ProgressReport("Site likelihoods with rate category %d of %d", iRate+1, nRateCategories, 0, 0); - if(verbose > 2) { - fprintf(stderr, "Rate %.3f Loglk 
%.3f SiteLogLk", rates[iRate], loglk); - for (iPos = 0; iPos < NJ->nPos; iPos++) - fprintf(stderr,"\t%.3f", site_loglk[NJ->nPos*iRate + iPos]); - fprintf(stderr,"\n"); - } - } - - /* restore original rates and profiles */ - myfree(NJ->rates.rates, sizeof(numeric_t) * NJ->rates.nRateCategories); - NJ->rates.rates = oldRates; - RecomputeMLProfiles(/*IN/OUT*/NJ); - - return(site_loglk); -} - -void SetMLRates(/*IN/OUT*/NJ_t *NJ, int nRateCategories) { - assert(nRateCategories > 0); - AllocRateCategories(/*IN/OUT*/&NJ->rates, 1, NJ->nPos); /* set to 1 category of rate 1 */ - if (nRateCategories == 1) { - RecomputeMLProfiles(/*IN/OUT*/NJ); - return; - } - numeric_t *rates = MLSiteRates(nRateCategories); - double *site_loglk = MLSiteLikelihoodsByRate(/*IN*/NJ, /*IN*/rates, nRateCategories); - - /* Select best rate for each site, correcting for the prior - For a prior, use a gamma distribution with shape parameter 3, scale 1/3, so - Prior(rate) ~ rate**2 * exp(-3*rate) - log Prior(rate) = C + 2 * log(rate) - 3 * rate - */ - double sumRates = 0; - int iPos; - int iRate; - for (iPos = 0; iPos < NJ->nPos; iPos++) { - int iBest = -1; - double dBest = -1e20; - for (iRate = 0; iRate < nRateCategories; iRate++) { - double site_loglk_with_prior = site_loglk[NJ->nPos*iRate + iPos] - + 2.0 * log(rates[iRate]) - 3.0 * rates[iRate]; - if (site_loglk_with_prior > dBest) { - iBest = iRate; - dBest = site_loglk_with_prior; - } - } - if (verbose > 2) - fprintf(stderr, "Selected rate category %d rate %.3f for position %d\n", - iBest, rates[iBest], iPos+1); - NJ->rates.ratecat[iPos] = iBest; - sumRates += rates[iBest]; - } - site_loglk = myfree(site_loglk, sizeof(double)*NJ->nPos*nRateCategories); - - /* Force the rates to average to 1 */ - double avgRate = sumRates/NJ->nPos; - for (iRate = 0; iRate < nRateCategories; iRate++) - rates[iRate] /= avgRate; - - /* Save the rates */ - NJ->rates.rates = myfree(NJ->rates.rates, sizeof(numeric_t) * NJ->rates.nRateCategories); - NJ->rates.rates = 
rates; - NJ->rates.nRateCategories = nRateCategories; - - /* Update profiles based on rates */ - RecomputeMLProfiles(/*IN/OUT*/NJ); - - if (verbose) { - fprintf(stderr, "Switched to using %d rate categories (CAT approximation)\n", nRateCategories); - fprintf(stderr, "Rate categories were divided by %.3f so that average rate = 1.0\n", avgRate); - fprintf(stderr, "CAT-based log-likelihoods may not be comparable across runs\n"); - if (!gammaLogLk) - fprintf(stderr, "Use -gamma for approximate but comparable Gamma(20) log-likelihoods\n"); - } -} - -double GammaLogLk(/*IN*/siteratelk_t *s, /*OPTIONAL OUT*/double *gamma_loglk_sites) { - int iRate, iPos; - double *dRate = mymalloc(sizeof(double) * s->nRateCats); - for (iRate = 0; iRate < s->nRateCats; iRate++) { - /* The probability density for each rate is approximated by the total - density between the midpoints */ - double pMin = iRate == 0 ? 0.0 : - PGamma(s->mult * (s->rates[iRate-1] + s->rates[iRate])/2.0, s->alpha); - double pMax = iRate == s->nRateCats-1 ? 
1.0 : - PGamma(s->mult * (s->rates[iRate]+s->rates[iRate+1])/2.0, s->alpha); - dRate[iRate] = pMax-pMin; - } - - double loglk = 0.0; - for (iPos = 0; iPos < s->nPos; iPos++) { - /* Prevent underflow on large trees by comparing to maximum loglk */ - double maxloglk = -1e20; - for (iRate = 0; iRate < s->nRateCats; iRate++) { - double site_loglk = s->site_loglk[s->nPos*iRate + iPos]; - if (site_loglk > maxloglk) - maxloglk = site_loglk; - } - double rellk = 0; /* likelihood scaled by exp(maxloglk) */ - for (iRate = 0; iRate < s->nRateCats; iRate++) { - double lk = exp(s->site_loglk[s->nPos*iRate + iPos] - maxloglk); - rellk += lk * dRate[iRate]; - } - double loglk_site = maxloglk + log(rellk); - loglk += loglk_site; - if (gamma_loglk_sites != NULL) - gamma_loglk_sites[iPos] = loglk_site; - } - dRate = myfree(dRate, sizeof(double)*s->nRateCats); - return(loglk); -} - -double OptAlpha(double alpha, void *data) { - siteratelk_t *s = (siteratelk_t *)data; - s->alpha = alpha; - return(-GammaLogLk(s, NULL)); -} - -double OptMult(double mult, void *data) { - siteratelk_t *s = (siteratelk_t *)data; - s->mult = mult; - return(-GammaLogLk(s, NULL)); -} - -/* Input site_loglk must be for each rate */ -double RescaleGammaLogLk(int nPos, int nRateCats, /*IN*/numeric_t *rates, /*IN*/double *site_loglk, - /*OPTIONAL*/FILE *fpLog) { - siteratelk_t s = { /*mult*/1.0, /*alpha*/1.0, nPos, nRateCats, rates, site_loglk }; - double fx, f2x; - int i; - fx = -GammaLogLk(&s, NULL); - if (verbose>2) - fprintf(stderr, "Optimizing alpha, starting at loglk %.3f\n", -fx); - for (i = 0; i < 10; i++) { - ProgressReport("Optimizing alpha round %d", i+1, 0, 0, 0); - double start = fx; - s.alpha = onedimenmin(0.01, s.alpha, 10.0, OptAlpha, &s, 0.001, 0.001, &fx, &f2x); - if (verbose>2) - fprintf(stderr, "Optimize alpha round %d to %.3f lk %.3f\n", i+1, s.alpha, -fx); - s.mult = onedimenmin(0.01, s.mult, 10.0, OptMult, &s, 0.001, 0.001, &fx, &f2x); - if (verbose>2) - fprintf(stderr, "Optimize mult round 
%d to %.3f lk %.3f\n", i+1, s.mult, -fx); - if (fx > start - 0.001) { - if (verbose>2) - fprintf(stderr, "Optimizing alpha & mult converged\n"); - break; - } - } - - double *gamma_loglk_sites = mymalloc(sizeof(double) * nPos); - double gammaLogLk = GammaLogLk(&s, /*OUT*/gamma_loglk_sites); - if (verbose > 0) - fprintf(stderr, "Gamma(%d) LogLk = %.3f alpha = %.3f rescaling lengths by %.3f\n", - nRateCats, gammaLogLk, s.alpha, 1/s.mult); - if (fpLog) { - int iPos; - int iRate; - fprintf(fpLog, "Gamma%dLogLk\t%.3f\tApproximate\tAlpha\t%.3f\tRescale\t%.3f\n", - nRateCats, gammaLogLk, s.alpha, 1/s.mult); - fprintf(fpLog, "Gamma%d\tSite\tLogLk", nRateCats); - for (iRate = 0; iRate < nRateCats; iRate++) - fprintf(fpLog, "\tr=%.3f", rates[iRate]/s.mult); - fprintf(fpLog,"\n"); - for (iPos = 0; iPos < nPos; iPos++) { - fprintf(fpLog, "Gamma%d\t%d\t%.3f", nRateCats, iPos, gamma_loglk_sites[iPos]); - for (iRate = 0; iRate < nRateCats; iRate++) - fprintf(fpLog, "\t%.3f", site_loglk[nPos*iRate + iPos]); - fprintf(fpLog,"\n"); - } - } - gamma_loglk_sites = myfree(gamma_loglk_sites, sizeof(double) * nPos); - return(1.0/s.mult); -} - -double MLPairOptimize(profile_t *pA, profile_t *pB, - int nPos, /*OPTIONAL*/transition_matrix_t *transmat, rates_t *rates, - /*IN/OUT*/double *branch_length) { - quartet_opt_t qopt = { nPos, transmat, rates, - /*nEval*/0, /*pair1*/pA, /*pair2*/pB }; - double f2x,negloglk; - *branch_length = onedimenmin(/*xmin*/MLMinBranchLength, - /*xguess*/*branch_length, - /*xmax*/6.0, - PairNegLogLk, - /*data*/&qopt, - /*ftol*/MLFTolBranchLength, - /*atol*/MLMinBranchLengthTolerance, - /*OUT*/&negloglk, - /*OUT*/&f2x); - return(-negloglk); /* the log likelihood */ -} - -void OptimizeAllBranchLengths(/*IN/OUT*/NJ_t *NJ) { - if (NJ->nSeq < 2) - return; - if (NJ->nSeq == 2) { - int parent = NJ->root; - assert(NJ->child[parent].nChild==2); - int nodes[2] = { NJ->child[parent].child[0], NJ->child[parent].child[1] }; - double length = 1.0; - 
(void)MLPairOptimize(NJ->profiles[nodes[0]], NJ->profiles[nodes[1]], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/&length); - NJ->branchlength[nodes[0]] = length/2.0; - NJ->branchlength[nodes[1]] = length/2.0; - return; - }; - - traversal_t traversal = InitTraversal(NJ); - profile_t **upProfiles = UpProfiles(NJ); - int node = NJ->root; - int iDone = 0; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - int nChild = NJ->child[node].nChild; - if (nChild > 0) { - if ((iDone % 100) == 0) - ProgressReport("ML Lengths %d of %d splits", iDone+1, NJ->maxnode - NJ->nSeq, 0, 0); - iDone++; - - /* optimize the branch lengths between self, parent, and children, - with two iterations - */ - assert(nChild == 2 || nChild == 3); - int nodes[3] = { NJ->child[node].child[0], - NJ->child[node].child[1], - nChild == 3 ? NJ->child[node].child[2] : node }; - profile_t *profiles[3] = { NJ->profiles[nodes[0]], - NJ->profiles[nodes[1]], - nChild == 3 ? NJ->profiles[nodes[2]] - : GetUpProfile(/*IN/OUT*/upProfiles, NJ, node, /*useML*/true) }; - int iter; - for (iter = 0; iter < 2; iter++) { - int i; - for (i = 0; i < 3; i++) { - profile_t *pA = profiles[i]; - int b1 = (i+1) % 3; - int b2 = (i+2) % 3; - profile_t *pB = PosteriorProfile(profiles[b1], profiles[b2], - NJ->branchlength[nodes[b1]], - NJ->branchlength[nodes[b2]], - NJ->transmat, &NJ->rates, NJ->nPos, /*nConstraints*/0); - double len = NJ->branchlength[nodes[i]]; - if (len < MLMinBranchLength) - len = MLMinBranchLength; - (void)MLPairOptimize(pA, pB, NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/&len); - NJ->branchlength[nodes[i]] = len; - pB = FreeProfile(pB, NJ->nPos, /*nConstraints*/0); - if (verbose>3) - fprintf(stderr, "Optimize length for %d to %.3f\n", - nodes[i], NJ->branchlength[nodes[i]]); - } - } - if (node != NJ->root) { - RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, /*useML*/true); - DeleteUpProfile(upProfiles, NJ, node); - } - } - } - traversal = 
FreeTraversal(traversal,NJ); - upProfiles = FreeUpProfiles(upProfiles,NJ); -} - -void RecomputeMLProfiles(/*IN/OUT*/NJ_t *NJ) { - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (NJ->child[node].nChild == 2) { - NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); - int *children = NJ->child[node].child; - NJ->profiles[node] = PosteriorProfile(NJ->profiles[children[0]], NJ->profiles[children[1]], - NJ->branchlength[children[0]], NJ->branchlength[children[1]], - NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints); - } - } - traversal = FreeTraversal(traversal, NJ); -} - -void RecomputeProfiles(/*IN/OUT*/NJ_t *NJ, /*OPTIONAL*/distance_matrix_t *dmat) { - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (NJ->child[node].nChild == 2) { - int *child = NJ->child[node].child; - NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); - NJ->profiles[node] = AverageProfile(NJ->profiles[child[0]], NJ->profiles[child[1]], - NJ->nPos, NJ->nConstraints, - dmat, /*unweighted*/-1.0); - } - } - traversal = FreeTraversal(traversal,NJ); -} - -int NNI(/*IN/OUT*/NJ_t *NJ, int iRound, int nRounds, bool useML, - /*IN/OUT*/nni_stats_t *stats, - /*OUT*/double *dMaxDelta) { - /* For each non-root node N, with children A,B, sibling C, and uncle D, - we compare the current topology AB|CD to the alternate topologies - AC|BD and AD|BC, by using the 4 relevant profiles. - - If useML is true, it uses quartet maximum likelihood, and it - updates branch lengths as it goes. - - If useML is false, it uses the minimum-evolution criterion with - log-corrected distances on profiles. (If logdist is false, then - the log correction is not done.) If useML is false, then NNI() - does NOT modify the branch lengths. 
- - Regardless of whether it changes the topology, it recomputes the - profile for the node, using the pairwise distances and BIONJ-like - weightings (if bionj is set). The parent's profile has changed, - but recomputing it is not necessary because we will visit it - before we need it (we use postorder, so we may visit the sibling - and its children before we visit the parent, but we never - consider an ancestor's profile, so that is OK). When we change - the parent's profile, this alters the uncle's up-profile, so we - remove that. Finally, if the topology has changed, we remove the - up-profiles of the nodes. - - If we do an NNI during post-order traversal, the result is a bit - tricky. E.g. if we are at node N, and have visited its children A - and B but not its uncle C, and we do an NNI that swaps B & C, - then the post-order traversal will visit C, and its children, but - then on the way back up, it will skip N, as it has already - visited it. So, the profile of N will not be recomputed: any - changes beneath C will not be reflected in the profile of N, and - the profile of N will be slightly stale. This will be corrected - on the next round of NNIs. - */ - double supportThreshold = useML ? treeLogLkDelta : MEMinDelta; - int i; - *dMaxDelta = 0.0; - int nNNIThisRound = 0; - - if (NJ->nSeq <= 3) - return(0); /* nothing to do */ - if (verbose > 2) { - fprintf(stderr, "Beginning round %d of NNIs with ml? %d\n", iRound, useML?1:0); - PrintNJInternal(/*WRITE*/stderr, NJ, /*useLen*/useML && iRound > 0 ? 
1 : 0); - } - /* For each node the upProfile or NULL */ - profile_t **upProfiles = UpProfiles(NJ); - - traversal_t traversal = InitTraversal(NJ); - - /* Identify nodes we can skip traversing into */ - int node; - if (fastNNI) { - for (node = 0; node < NJ->maxnode; node++) { - if (node != NJ->root - && node >= NJ->nSeq - && stats[node].age >= 2 - && stats[node].subtreeAge >= 2 - && stats[node].support > supportThreshold) { - int nodeABCD[4]; - SetupABCD(NJ, node, NULL, NULL, /*OUT*/nodeABCD, useML); - for (i = 0; i < 4; i++) - if (stats[nodeABCD[i]].age == 0 && stats[nodeABCD[i]].support > supportThreshold) - break; - if (i == 4) { - SkipTraversalInto(node, /*IN/OUT*/traversal); - if (verbose > 2) - fprintf(stderr, "Skipping subtree at %d: child %d %d parent %d age %d subtreeAge %d support %.3f\n", - node, nodeABCD[0], nodeABCD[1], NJ->parent[node], - stats[node].age, stats[node].subtreeAge, stats[node].support); - } - } - } - } - - int iDone = 0; - bool bUp; - node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, &bUp)) >= 0) { - if (node < NJ->nSeq || node == NJ->root) - continue; /* nothing to do for leaves or root */ - if (bUp) { - if(verbose > 2) - fprintf(stderr, "Going up back to node %d\n", node); - /* No longer needed */ - for (i = 0; i < NJ->child[node].nChild; i++) - DeleteUpProfile(upProfiles, NJ, NJ->child[node].child[i]); - DeleteUpProfile(upProfiles, NJ, node); - RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, useML); - continue; - } - if ((iDone % 100) == 0) { - char buf[100]; - sprintf(buf, "%s NNI round %%d of %%d, %%d of %%d splits", useML ? 
"ML" : "ME"); - if (iDone > 0) - sprintf(buf+strlen(buf), ", %d changes", nNNIThisRound); - if (nNNIThisRound > 0) - sprintf(buf+strlen(buf), " (max delta %.3f)", *dMaxDelta); - ProgressReport(buf, iRound+1, nRounds, iDone+1, NJ->maxnode - NJ->nSeq); - } - iDone++; - - profile_t *profiles[4]; - int nodeABCD[4]; - /* Note -- during the first round of ML NNIs, we use the min-evo-based branch lengths, - which may be suboptimal */ - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML); - - /* Given our 4 profiles, consider doing a swap */ - int nodeA = nodeABCD[0]; - int nodeB = nodeABCD[1]; - int nodeC = nodeABCD[2]; - int nodeD = nodeABCD[3]; - - nni_t choice = ABvsCD; - - if (verbose > 2) - fprintf(stderr,"Considering NNI around %d: Swap A=%d B=%d C=%d D=up(%d) or parent %d\n", - node, nodeA, nodeB, nodeC, nodeD, NJ->parent[node]); - if (verbose > 3 && useML) { - double len[5] = { NJ->branchlength[nodeA], NJ->branchlength[nodeB], NJ->branchlength[nodeC], NJ->branchlength[nodeD], - NJ->branchlength[node] }; - for (i=0; i < 5; i++) - if (len[i] < MLMinBranchLength) - len[i] = MLMinBranchLength; - fprintf(stderr, "Starting quartet likelihood %.3f len %.3f %.3f %.3f %.3f %.3f\n", - MLQuartetLogLk(profiles[0],profiles[1],profiles[2],profiles[3],NJ->nPos,NJ->transmat,&NJ->rates,len, /*site_lk*/NULL), - len[0], len[1], len[2], len[3], len[4]); - } - - numeric_t newlength[5]; - double criteria[3]; - if (useML) { - for (i = 0; i < 4; i++) - newlength[i] = NJ->branchlength[nodeABCD[i]]; - newlength[4] = NJ->branchlength[node]; - bool bFast = mlAccuracy < 2 && stats[node].age > 0; - choice = MLQuartetNNI(profiles, NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints, - /*OUT*/criteria, /*IN/OUT*/newlength, bFast); - } else { - choice = ChooseNNI(profiles, NJ->distance_matrix, NJ->nPos, NJ->nConstraints, - /*OUT*/criteria); - /* invert criteria so that higher is better, as in ML case, to simplify code below */ - for (i = 0; i < 3; i++) - 
criteria[i] = -criteria[i]; - } - - if (choice == ACvsBD) { - /* swap B and C */ - ReplaceChild(/*IN/OUT*/NJ, node, nodeB, nodeC); - ReplaceChild(/*IN/OUT*/NJ, NJ->parent[node], nodeC, nodeB); - } else if (choice == ADvsBC) { - /* swap A and C */ - ReplaceChild(/*IN/OUT*/NJ, node, nodeA, nodeC); - ReplaceChild(/*IN/OUT*/NJ, NJ->parent[node], nodeC, nodeA); - } - - if (useML) { - /* update branch length for the internal branch, and of any - branches that lead to leaves, b/c those will not are not - the internal branch for NNI and would not otherwise be set. - */ - if (choice == ADvsBC) { - /* For ADvsBC, MLQuartetNNI swaps B with D, but we swap A with C */ - double length2[5] = { newlength[LEN_C], newlength[LEN_D], - newlength[LEN_A], newlength[LEN_B], - newlength[LEN_I] }; - int i; - for (i = 0; i < 5; i++) newlength[i] = length2[i]; - /* and swap A and C */ - double tmp = newlength[LEN_A]; - newlength[LEN_A] = newlength[LEN_C]; - newlength[LEN_C] = tmp; - } else if (choice == ACvsBD) { - /* swap B and C */ - double tmp = newlength[LEN_B]; - newlength[LEN_B] = newlength[LEN_C]; - newlength[LEN_C] = tmp; - } - - NJ->branchlength[node] = newlength[LEN_I]; - NJ->branchlength[nodeA] = newlength[LEN_A]; - NJ->branchlength[nodeB] = newlength[LEN_B]; - NJ->branchlength[nodeC] = newlength[LEN_C]; - NJ->branchlength[nodeD] = newlength[LEN_D]; - } - - if (verbose>2 && (choice != ABvsCD || verbose > 2)) - fprintf(stderr,"NNI around %d: Swap A=%d B=%d C=%d D=out(C) -- choose %s %s %.4f\n", - node, nodeA, nodeB, nodeC, - choice == ACvsBD ? "AC|BD" : (choice == ABvsCD ? "AB|CD" : "AD|BC"), - useML ? "delta-loglk" : "-deltaLen", - criteria[choice] - criteria[ABvsCD]); - if(verbose >= 3 && slow && useML) - fprintf(stderr, "Old tree lk -- %.4f\n", TreeLogLk(NJ, /*site_likelihoods*/NULL)); - - /* update stats, *dMaxDelta, etc. 
*/ - if (choice == ABvsCD) { - stats[node].age++; - } else { - if (useML) - nML_NNI++; - else - nNNI++; - nNNIThisRound++; - stats[node].age = 0; - stats[nodeA].age = 0; - stats[nodeB].age = 0; - stats[nodeC].age = 0; - stats[nodeD].age = 0; - } - stats[node].delta = criteria[choice] - criteria[ABvsCD]; /* 0 if ABvsCD */ - if (stats[node].delta > *dMaxDelta) - *dMaxDelta = stats[node].delta; - - /* support is improvement of score for self over better of alternatives */ - stats[node].support = 1e20; - for (i = 0; i < 3; i++) - if (choice != i && criteria[choice]-criteria[i] < stats[node].support) - stats[node].support = criteria[choice]-criteria[i]; - - /* subtreeAge is the number of rounds since self or descendent had a significant improvement */ - if (stats[node].delta > supportThreshold) - stats[node].subtreeAge = 0; - else { - stats[node].subtreeAge++; - for (i = 0; i < 2; i++) { - int child = NJ->child[node].child[i]; - if (stats[node].subtreeAge > stats[child].subtreeAge) - stats[node].subtreeAge = stats[child].subtreeAge; - } - } - - /* update profiles and free up unneeded up-profiles */ - if (choice == ABvsCD) { - /* No longer needed */ - DeleteUpProfile(upProfiles, NJ, nodeA); - DeleteUpProfile(upProfiles, NJ, nodeB); - DeleteUpProfile(upProfiles, NJ, nodeC); - RecomputeProfile(/*IN/OUT*/NJ, /*IN/OUT*/upProfiles, node, useML); - if(slow && useML) - UpdateForNNI(NJ, node, upProfiles, useML); - } else { - UpdateForNNI(NJ, node, upProfiles, useML); - } - if(verbose > 2 && slow && useML) { - /* Note we recomputed profiles back up to root already if slow */ - PrintNJInternal(/*WRITE*/stderr, NJ, /*useLen*/true); - fprintf(stderr, "New tree lk -- %.4f\n", TreeLogLk(NJ, /*site_likelihoods*/NULL)); - } - } /* end postorder traversal */ - traversal = FreeTraversal(traversal,NJ); - if (verbose>=2) { - int nUp = 0; - for (i = 0; i < NJ->maxnodes; i++) - if (upProfiles[i] != NULL) - nUp++; - fprintf(stderr, "N up profiles at end of NNI: %d\n", nUp); - } - upProfiles = 
FreeUpProfiles(upProfiles,NJ); - return(nNNIThisRound); -} - -nni_stats_t *InitNNIStats(NJ_t *NJ) { - nni_stats_t *stats = mymalloc(sizeof(nni_stats_t)*NJ->maxnode); - const int LargeAge = 1000000; - int i; - for (i = 0; i < NJ->maxnode; i++) { - stats[i].delta = 0; - stats[i].support = 0; - if (i == NJ->root || i < NJ->nSeq) { - stats[i].age = LargeAge; - stats[i].subtreeAge = LargeAge; - } else { - stats[i].age = 0; - stats[i].subtreeAge = 0; - } - } - return(stats); -} - -nni_stats_t *FreeNNIStats(nni_stats_t *stats, NJ_t *NJ) { - return(myfree(stats, sizeof(nni_stats_t)*NJ->maxnode)); -} - -int FindSPRSteps(/*IN/OUT*/NJ_t *NJ, - int nodeMove, /* the node to move multiple times */ - int nodeAround, /* sibling or parent of node to NNI to start the chain */ - /*IN/OUT*/profile_t **upProfiles, - /*OUT*/spr_step_t *steps, - int maxSteps, - bool bFirstAC) { - int iStep; - for (iStep = 0; iStep < maxSteps; iStep++) { - if (NJ->child[nodeAround].nChild != 2) - break; /* no further to go */ - - /* Consider the NNIs around nodeAround */ - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, nodeAround, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); - double criteria[3]; - (void) ChooseNNI(profiles, NJ->distance_matrix, NJ->nPos, NJ->nConstraints, - /*OUT*/criteria); - - /* Do & save the swap */ - spr_step_t *step = &steps[iStep]; - if (iStep == 0 ? 
bFirstAC : criteria[ACvsBD] < criteria[ADvsBC]) { - /* swap B & C to put AC together */ - step->deltaLength = criteria[ACvsBD] - criteria[ABvsCD]; - step->nodes[0] = nodeABCD[1]; - step->nodes[1] = nodeABCD[2]; - } else { - /* swap AC to put AD together */ - step->deltaLength = criteria[ADvsBC] - criteria[ABvsCD]; - step->nodes[0] = nodeABCD[0]; - step->nodes[1] = nodeABCD[2]; - } - - if (verbose>3) { - fprintf(stderr, "SPR chain step %d for %d around %d swap %d %d deltaLen %.5f\n", - iStep+1, nodeAround, nodeMove, step->nodes[0], step->nodes[1], step->deltaLength); - if (verbose>4) - PrintNJInternal(stderr, NJ, /*useLen*/false); - } - ReplaceChild(/*IN/OUT*/NJ, nodeAround, step->nodes[0], step->nodes[1]); - ReplaceChild(/*IN/OUT*/NJ, NJ->parent[nodeAround], step->nodes[1], step->nodes[0]); - UpdateForNNI(/*IN/OUT*/NJ, nodeAround, /*IN/OUT*/upProfiles, /*useML*/false); - - /* set the new nodeAround -- either parent(nodeMove) or sibling(nodeMove) -- - so that it different from current nodeAround - */ - int newAround[2] = { NJ->parent[nodeMove], Sibling(NJ, nodeMove) }; - if (NJ->parent[nodeMove] == NJ->root) - RootSiblings(NJ, nodeMove, /*OUT*/newAround); - assert(newAround[0] == nodeAround || newAround[1] == nodeAround); - assert(newAround[0] != newAround[1]); - nodeAround = newAround[newAround[0] == nodeAround ? 
1 : 0]; - } - return(iStep); -} - -void UnwindSPRStep(/*IN/OUT*/NJ_t *NJ, - /*IN*/spr_step_t *step, - /*IN/OUT*/profile_t **upProfiles) { - int parents[2]; - int i; - for (i = 0; i < 2; i++) { - assert(step->nodes[i] >= 0 && step->nodes[i] < NJ->maxnodes); - parents[i] = NJ->parent[step->nodes[i]]; - assert(parents[i] >= 0); - } - assert(parents[0] != parents[1]); - ReplaceChild(/*IN/OUT*/NJ, parents[0], step->nodes[0], step->nodes[1]); - ReplaceChild(/*IN/OUT*/NJ, parents[1], step->nodes[1], step->nodes[0]); - int iYounger = 0; - if (NJ->parent[parents[0]] == parents[1]) { - iYounger = 0; - } else { - assert(NJ->parent[parents[1]] == parents[0]); - iYounger = 1; - } - UpdateForNNI(/*IN/OUT*/NJ, parents[iYounger], /*IN/OUT*/upProfiles, /*useML*/false); -} - -/* Update the profile of node and its ancestor, and delete nearby out-profiles */ -void UpdateForNNI(/*IN/OUT*/NJ_t *NJ, int node, /*IN/OUT*/profile_t **upProfiles, - bool useML) { - int i; - if (slow) { - /* exhaustive update */ - for (i = 0; i < NJ->maxnodes; i++) - DeleteUpProfile(upProfiles, NJ, i); - - /* update profiles back to root */ - int ancestor; - for (ancestor = node; ancestor >= 0; ancestor = NJ->parent[ancestor]) - RecomputeProfile(/*IN/OUT*/NJ, upProfiles, ancestor, useML); - - /* remove any up-profiles made while doing that*/ - for (i = 0; i < NJ->maxnodes; i++) - DeleteUpProfile(upProfiles, NJ, i); - } else { - /* if fast, only update around self - note that upProfile(parent) is still OK after an NNI, but - up-profiles of uncles may not be - */ - DeleteUpProfile(upProfiles, NJ, node); - for (i = 0; i < NJ->child[node].nChild; i++) - DeleteUpProfile(upProfiles, NJ, NJ->child[node].child[i]); - assert(node != NJ->root); - int parent = NJ->parent[node]; - int neighbors[2] = { parent, Sibling(NJ, node) }; - if (parent == NJ->root) - RootSiblings(NJ, node, /*OUT*/neighbors); - DeleteUpProfile(upProfiles, NJ, neighbors[0]); - DeleteUpProfile(upProfiles, NJ, neighbors[1]); - int uncle = Sibling(NJ, 
parent); - if (uncle >= 0) - DeleteUpProfile(upProfiles, NJ, uncle); - RecomputeProfile(/*IN/OUT*/NJ, upProfiles, node, useML); - RecomputeProfile(/*IN/OUT*/NJ, upProfiles, parent, useML); - } -} - -void SPR(/*IN/OUT*/NJ_t *NJ, int maxSPRLength, int iRound, int nRounds) { - /* Given a non-root node N with children A,B, sibling C, and uncle D, - we can try to move A by doing three types of moves (4 choices): - "down" -- swap A with a child of B (if B is not a leaf) [2 choices] - "over" -- swap B with C - "up" -- swap A with D - We follow down moves with down moves, over moves with down moves, and - up moves with either up or over moves. (Other choices are just backing - up and hence useless.) - - As with NNIs, we keep track of up-profiles as we go. However, some of the regular - profiles may also become "stale" so it is a bit trickier. - - We store the traversal before we do SPRs to avoid any possible infinite loop - */ - double last_tot_len = 0.0; - if (NJ->nSeq <= 3 || maxSPRLength < 1) - return; - if (slow) - last_tot_len = TreeLength(NJ, /*recomputeLengths*/true); - int *nodeList = mymalloc(sizeof(int) * NJ->maxnodes); - int nodeListLen = 0; - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - nodeList[nodeListLen++] = node; - } - assert(nodeListLen == NJ->maxnode); - traversal = FreeTraversal(traversal,NJ); - - profile_t **upProfiles = UpProfiles(NJ); - spr_step_t *steps = mymalloc(sizeof(spr_step_t) * maxSPRLength); /* current chain of SPRs */ - - int i; - for (i = 0; i < nodeListLen; i++) { - node = nodeList[i]; - if ((i % 100) == 0) - ProgressReport("SPR round %3d of %3d, %d of %d nodes", - iRound+1, nRounds, i+1, nodeListLen); - if (node == NJ->root) - continue; /* nothing to do for root */ - /* The nodes to NNI around */ - int nodeAround[2] = { NJ->parent[node], Sibling(NJ, node) }; - if (NJ->parent[node] == NJ->root) { - /* NNI around both siblings 
instead */ - RootSiblings(NJ, node, /*OUT*/nodeAround); - } - bool bChanged = false; - int iAround; - for (iAround = 0; iAround < 2 && bChanged == false; iAround++) { - int ACFirst; - for (ACFirst = 0; ACFirst < 2 && bChanged == false; ACFirst++) { - if(verbose > 3) - PrintNJInternal(stderr, NJ, /*useLen*/false); - int chainLength = FindSPRSteps(/*IN/OUT*/NJ, node, nodeAround[iAround], - upProfiles, /*OUT*/steps, maxSPRLength, (bool)ACFirst); - double dMinDelta = 0.0; - int iCBest = -1; - double dTotDelta = 0.0; - int iC; - for (iC = 0; iC < chainLength; iC++) { - dTotDelta += steps[iC].deltaLength; - if (dTotDelta < dMinDelta) { - dMinDelta = dTotDelta; - iCBest = iC; - } - } - - if (verbose>3) { - fprintf(stderr, "SPR %s %d around %d chainLength %d of %d deltaLength %.5f swaps:", - iCBest >= 0 ? "move" : "abandoned", - node,nodeAround[iAround],iCBest+1,chainLength,dMinDelta); - for (iC = 0; iC < chainLength; iC++) - fprintf(stderr, " (%d,%d)%.4f", steps[iC].nodes[0], steps[iC].nodes[1], steps[iC].deltaLength); - fprintf(stderr,"\n"); - } - for (iC = chainLength - 1; iC > iCBest; iC--) - UnwindSPRStep(/*IN/OUT*/NJ, /*IN*/&steps[iC], /*IN/OUT*/upProfiles); - if(verbose > 3) - PrintNJInternal(stderr, NJ, /*useLen*/false); - while (slow && iCBest >= 0) { - double expected_tot_len = last_tot_len + dMinDelta; - double new_tot_len = TreeLength(NJ, /*recompute*/true); - if (verbose > 2) - fprintf(stderr, "Total branch-length is now %.4f was %.4f expected %.4f\n", - new_tot_len, last_tot_len, expected_tot_len); - if (new_tot_len < last_tot_len) { - last_tot_len = new_tot_len; - break; /* no rewinding necessary */ - } - if (verbose > 2) - fprintf(stderr, "Rewinding SPR to %d\n",iCBest); - UnwindSPRStep(/*IN/OUT*/NJ, /*IN*/&steps[iCBest], /*IN/OUT*/upProfiles); - dMinDelta -= steps[iCBest].deltaLength; - iCBest--; - } - if (iCBest >= 0) - bChanged = true; - } /* loop over which step to take at 1st NNI */ - } /* loop over which node to pivot around */ - - if (bChanged) { - 
nSPR++; /* the SPR move is OK */ - /* make sure all the profiles are OK */ - int j; - for (j = 0; j < NJ->maxnodes; j++) - DeleteUpProfile(upProfiles, NJ, j); - int ancestor; - for (ancestor = NJ->parent[node]; ancestor >= 0; ancestor = NJ->parent[ancestor]) - RecomputeProfile(/*IN/OUT*/NJ, upProfiles, ancestor, /*useML*/false); - } - } /* end loop over subtrees to prune & regraft */ - steps = myfree(steps, sizeof(spr_step_t) * maxSPRLength); - upProfiles = FreeUpProfiles(upProfiles,NJ); - nodeList = myfree(nodeList, sizeof(int) * NJ->maxnodes); -} - -void RecomputeProfile(/*IN/OUT*/NJ_t *NJ, /*IN/OUT*/profile_t **upProfiles, int node, - bool useML) { - if (node < NJ->nSeq || node == NJ->root) - return; /* no profile to compute */ - assert(NJ->child[node].nChild==2); - - profile_t *profiles[4]; - double weight = 0.5; - if (useML || !bionj) { - profiles[0] = NJ->profiles[NJ->child[node].child[0]]; - profiles[1] = NJ->profiles[NJ->child[node].child[1]]; - } else { - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML); - weight = QuartetWeight(profiles, NJ->distance_matrix, NJ->nPos); - } - if (verbose>3) { - if (useML) { - fprintf(stderr, "Recompute %d from %d %d lengths %.4f %.4f\n", - node, - NJ->child[node].child[0], - NJ->child[node].child[1], - NJ->branchlength[NJ->child[node].child[0]], - NJ->branchlength[NJ->child[node].child[1]]); - } else { - fprintf(stderr, "Recompute %d from %d %d weight %.3f\n", - node, NJ->child[node].child[0], NJ->child[node].child[1], weight); - } - } - NJ->profiles[node] = FreeProfile(NJ->profiles[node], NJ->nPos, NJ->nConstraints); - if (useML) { - NJ->profiles[node] = PosteriorProfile(profiles[0], profiles[1], - NJ->branchlength[NJ->child[node].child[0]], - NJ->branchlength[NJ->child[node].child[1]], - NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints); - } else { - NJ->profiles[node] = AverageProfile(profiles[0], profiles[1], - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix, 
weight); - } -} - -/* The BIONJ-like formula for the weight of A when building a profile for AB is - 1/2 + (avgD(B,CD) - avgD(A,CD))/(2*d(A,B)) -*/ -double QuartetWeight(profile_t *profiles[4], distance_matrix_t *dmat, int nPos) { - if (!bionj) - return(-1.0); /* even weighting */ - double d[6]; - CorrectedPairDistances(profiles, 4, dmat, nPos, /*OUT*/d); - if (d[qAB] < 0.01) - return -1.0; - double weight = 0.5 + ((d[qBC]+d[qBD])-(d[qAC]+d[qAD]))/(4*d[qAB]); - if (weight < 0) - weight = 0; - if (weight > 1) - weight = 1; - return (weight); -} - -/* Resets the children entry of parent and also the parent entry of newchild */ -void ReplaceChild(/*IN/OUT*/NJ_t *NJ, int parent, int oldchild, int newchild) { - NJ->parent[newchild] = parent; - - int iChild; - for (iChild = 0; iChild < NJ->child[parent].nChild; iChild++) { - if (NJ->child[parent].child[iChild] == oldchild) { - NJ->child[parent].child[iChild] = newchild; - return; - } - } - assert(0); -} - -/* Recomputes all branch lengths - - For internal branches such as (A,B) vs. (C,D), uses the formula - - length(AB|CD) = (d(A,C)+d(A,D)+d(B,C)+d(B,D))/4 - d(A,B)/2 - d(C,D)/2 - - (where all distances are profile distances - diameters). - - For external branches (e.g. to leaves) A vs. 
(B,C), use the formula - - length(A|BC) = (d(A,B)+d(A,C)-d(B,C))/2 -*/ -void UpdateBranchLengths(/*IN/OUT*/NJ_t *NJ) { - if (NJ->nSeq < 2) - return; - else if (NJ->nSeq == 2) { - int root = NJ->root; - int nodeA = NJ->child[root].child[0]; - int nodeB = NJ->child[root].child[1]; - besthit_t h; - ProfileDist(NJ->profiles[nodeA],NJ->profiles[nodeB], - NJ->nPos, NJ->distance_matrix, /*OUT*/&h); - if (logdist) - h.dist = LogCorrect(h.dist); - NJ->branchlength[nodeA] = h.dist/2.0; - NJ->branchlength[nodeB] = h.dist/2.0; - return; - } - - profile_t **upProfiles = UpProfiles(NJ); - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - /* reset branch length of node (distance to its parent) */ - if (node == NJ->root) - continue; /* no branch length to set */ - if (node < NJ->nSeq) { /* a leaf */ - profile_t *profileA = NJ->profiles[node]; - profile_t *profileB = NULL; - profile_t *profileC = NULL; - - int sib = Sibling(NJ,node); - if (sib == -1) { /* at root, have 2 siblings */ - int sibs[2]; - RootSiblings(NJ, node, /*OUT*/sibs); - profileB = NJ->profiles[sibs[0]]; - profileC = NJ->profiles[sibs[1]]; - } else { - profileB = NJ->profiles[sib]; - profileC = GetUpProfile(/*IN/OUT*/upProfiles, NJ, NJ->parent[node], /*useML*/false); - } - profile_t *profiles[3] = {profileA,profileB,profileC}; - double d[3]; /*AB,AC,BC*/ - CorrectedPairDistances(profiles, 3, NJ->distance_matrix, NJ->nPos, /*OUT*/d); - /* d(A,BC) = (dAB+dAC-dBC)/2 */ - NJ->branchlength[node] = (d[0]+d[1]-d[2])/2.0; - } else { - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); - double d[6]; - CorrectedPairDistances(profiles, 4, NJ->distance_matrix, NJ->nPos, /*OUT*/d); - NJ->branchlength[node] = (d[qAC]+d[qAD]+d[qBC]+d[qBD])/4.0 - (d[qAB]+d[qCD])/2.0; - - /* no longer needed */ - DeleteUpProfile(upProfiles, NJ, 
nodeABCD[0]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); - } - } - traversal = FreeTraversal(traversal,NJ); - upProfiles = FreeUpProfiles(upProfiles,NJ); -} - -/* Pick columns for resampling, stored as returned_vector[iBoot*nPos + j] */ -int *ResampleColumns(int nPos, int nBootstrap) { - long lPos = nPos; /* to prevent overflow on very long alignments when multiplying nPos * nBootstrap */ - int *col = (int*)mymalloc(sizeof(int)*lPos*(size_t)nBootstrap); - int i; - for (i = 0; i < nBootstrap; i++) { - int j; - for (j = 0; j < nPos; j++) { - int pos = (int)(knuth_rand() * nPos); - if (pos<0) - pos = 0; - else if (pos == nPos) - pos = nPos-1; - col[i*lPos + j] = pos; - } - } - if (verbose > 5) { - for (i=0; i < 3 && i < nBootstrap; i++) { - fprintf(stderr,"Boot%d",i); - int j; - for (j = 0; j < nPos; j++) { - fprintf(stderr,"\t%d",col[i*lPos+j]); - } - fprintf(stderr,"\n"); - } - } - return(col); -} - -void ReliabilityNJ(/*IN/OUT*/NJ_t *NJ, int nBootstrap) { - /* For each non-root node N, with children A,B, parent P, sibling C, and grandparent G, - we test the reliability of the split (A,B) versus rest by comparing the profiles - of A, B, C, and the "up-profile" of P. - - Each node's upProfile is the average of its sibling's (down)-profile + its parent's up-profile - (If node's parent is the root, then there are two siblings and we don't need an up-profile) - - To save memory, we do depth-first-search down from the root, and we only keep - up-profiles for nodes in the active path. 
- */ - if (NJ->nSeq <= 3 || nBootstrap <= 0) - return; /* nothing to do */ - int *col = ResampleColumns(NJ->nPos, nBootstrap); - - profile_t **upProfiles = UpProfiles(NJ); - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - int iNodesDone = 0; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (node < NJ->nSeq || node == NJ->root) - continue; /* nothing to do for leaves or root */ - - if(iNodesDone > 0 && (iNodesDone % 100) == 0) - ProgressReport("Local bootstrap for %6d of %6d internal splits", iNodesDone, NJ->nSeq-3, 0, 0); - iNodesDone++; - - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); - - NJ->support[node] = SplitSupport(profiles[0], profiles[1], profiles[2], profiles[3], - NJ->distance_matrix, - NJ->nPos, - nBootstrap, - col); - - /* no longer needed */ - DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[2]); - } - traversal = FreeTraversal(traversal,NJ); - upProfiles = FreeUpProfiles(upProfiles,NJ); - col = myfree(col, sizeof(int)*((size_t)NJ->nPos)*nBootstrap); -} - -profile_t *NewProfile(int nPos, int nConstraints) { - profile_t *profile = (profile_t *)mymalloc(sizeof(profile_t)); - profile->weights = mymalloc(sizeof(numeric_t)*nPos); - profile->codes = mymalloc(sizeof(unsigned char)*nPos); - profile->vectors = NULL; - profile->nVectors = 0; - profile->codeDist = NULL; - if (nConstraints == 0) { - profile->nOn = NULL; - profile->nOff = NULL; - } else { - profile->nOn = mymalloc(sizeof(int)*nConstraints); - profile->nOff = mymalloc(sizeof(int)*nConstraints); - } - return(profile); -} - -profile_t *FreeProfile(profile_t *profile, int nPos, int nConstraints) { - if(profile==NULL) return(NULL); - myfree(profile->codes, nPos); - myfree(profile->weights, nPos); - myfree(profile->vectors, 
sizeof(numeric_t)*nCodes*profile->nVectors); - myfree(profile->codeDist, sizeof(numeric_t)*nCodes*nPos); - if (nConstraints > 0) { - myfree(profile->nOn, sizeof(int)*nConstraints); - myfree(profile->nOff, sizeof(int)*nConstraints); - } - return(myfree(profile, sizeof(profile_t))); -} - -void SetupABCD(NJ_t *NJ, int node, - /* the 4 profiles; the last one is an outprofile */ - /*OPTIONAL OUT*/profile_t *profiles[4], - /*OPTIONAL IN/OUT*/profile_t **upProfiles, - /*OUT*/int nodeABCD[4], - bool useML) { - int parent = NJ->parent[node]; - assert(parent >= 0); - assert(NJ->child[node].nChild == 2); - nodeABCD[0] = NJ->child[node].child[0]; /*A*/ - nodeABCD[1] = NJ->child[node].child[1]; /*B*/ - - profile_t *profile4 = NULL; - if (parent == NJ->root) { - int sibs[2]; - RootSiblings(NJ, node, /*OUT*/sibs); - nodeABCD[2] = sibs[0]; - nodeABCD[3] = sibs[1]; - if (profiles == NULL) - return; - profile4 = NJ->profiles[sibs[1]]; - } else { - nodeABCD[2] = Sibling(NJ,node); - assert(nodeABCD[2] >= 0); - nodeABCD[3] = parent; - if (profiles == NULL) - return; - profile4 = GetUpProfile(upProfiles,NJ,parent,useML); - } - assert(upProfiles != NULL); - int i; - for (i = 0; i < 3; i++) - profiles[i] = NJ->profiles[nodeABCD[i]]; - profiles[3] = profile4; -} - - -int Sibling(NJ_t *NJ, int node) { - int parent = NJ->parent[node]; - if (parent < 0 || parent == NJ->root) - return(-1); - int iChild; - for(iChild=0;iChild<NJ->child[parent].nChild;iChild++) { - if(NJ->child[parent].child[iChild] != node) - return (NJ->child[parent].child[iChild]); - } - assert(0); - return(-1); -} - -void RootSiblings(NJ_t *NJ, int node, /*OUT*/int sibs[2]) { - assert(NJ->parent[node] == NJ->root); - assert(NJ->child[NJ->root].nChild == 3); - - int nSibs = 0; - int iChild; - for(iChild=0; iChild < NJ->child[NJ->root].nChild; iChild++) { - int child = NJ->child[NJ->root].child[iChild]; - if (child != node) sibs[nSibs++] = child; - } - assert(nSibs==2); -} - -void TestSplitsML(/*IN/OUT*/NJ_t *NJ, 
/*OUT*/SplitCount_t *splitcount, int nBootstrap) { - const double tolerance = 1e-6; - splitcount->nBadSplits = 0; - splitcount->nConstraintViolations = 0; - splitcount->nBadBoth = 0; - splitcount->nSplits = 0; - splitcount->dWorstDeltaUnconstrained = 0; - splitcount->dWorstDeltaConstrained = 0; - - profile_t **upProfiles = UpProfiles(NJ); - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - - int *col = nBootstrap > 0 ? ResampleColumns(NJ->nPos, nBootstrap) : NULL; - double *site_likelihoods[3]; - int choice; - for (choice = 0; choice < 3; choice++) - site_likelihoods[choice] = mymalloc(sizeof(double)*NJ->nPos); - - int iNodesDone = 0; - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (node < NJ->nSeq || node == NJ->root) - continue; /* nothing to do for leaves or root */ - - if(iNodesDone > 0 && (iNodesDone % 100) == 0) - ProgressReport("ML split tests for %6d of %6d internal splits", iNodesDone, NJ->nSeq-3, 0, 0); - iNodesDone++; - - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/true); - double loglk[3]; - double len[5]; - int i; - for (i = 0; i < 4; i++) - len[i] = NJ->branchlength[nodeABCD[i]]; - len[4] = NJ->branchlength[node]; - double lenABvsCD[5] = {len[LEN_A], len[LEN_B], len[LEN_C], len[LEN_D], len[LEN_I]}; - double lenACvsBD[5] = {len[LEN_A], len[LEN_C], len[LEN_B], len[LEN_D], len[LEN_I]}; /* Swap B & C */ - double lenADvsBC[5] = {len[LEN_A], len[LEN_D], len[LEN_C], len[LEN_B], len[LEN_I]}; /* Swap B & D */ - - { -#ifdef OPENMP - #pragma omp parallel - #pragma omp sections -#endif - { -#ifdef OPENMP - #pragma omp section -#endif - { - /* Lengths are already optimized for ABvsCD */ - loglk[ABvsCD] = MLQuartetLogLk(profiles[0], profiles[1], profiles[2], profiles[3], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenABvsCD, - /*OUT*/site_likelihoods[ABvsCD]); - } - -#ifdef OPENMP - #pragma omp section -#endif - { - 
loglk[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenACvsBD, /*pStarTest*/NULL, - /*OUT*/site_likelihoods[ACvsBD]); - } - -#ifdef OPENMP - #pragma omp section -#endif - { - loglk[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenADvsBC, /*pStarTest*/NULL, - /*OUT*/site_likelihoods[ADvsBC]); - } - } - } - - /* do a second pass on the better alternative if it is close */ - if (loglk[ACvsBD] > loglk[ADvsBC]) { - if (mlAccuracy > 1 || loglk[ACvsBD] > loglk[ABvsCD] - closeLogLkLimit) { - loglk[ACvsBD] = MLQuartetOptimize(profiles[0], profiles[2], profiles[1], profiles[3], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenACvsBD, /*pStarTest*/NULL, - /*OUT*/site_likelihoods[ACvsBD]); - } - } else { - if (mlAccuracy > 1 || loglk[ADvsBC] > loglk[ABvsCD] - closeLogLkLimit) { - loglk[ADvsBC] = MLQuartetOptimize(profiles[0], profiles[3], profiles[2], profiles[1], - NJ->nPos, NJ->transmat, &NJ->rates, /*IN/OUT*/lenADvsBC, /*pStarTest*/NULL, - /*OUT*/site_likelihoods[ADvsBC]); - } - } - - if (loglk[ABvsCD] >= loglk[ACvsBD] && loglk[ABvsCD] >= loglk[ADvsBC]) - choice = ABvsCD; - else if (loglk[ACvsBD] >= loglk[ABvsCD] && loglk[ACvsBD] >= loglk[ADvsBC]) - choice = ACvsBD; - else - choice = ADvsBC; - bool badSplit = loglk[choice] > loglk[ABvsCD] + treeLogLkDelta; /* ignore small changes in likelihood */ - - /* constraint penalties, indexed by nni_t (lower is better) */ - double p[3]; - QuartetConstraintPenalties(profiles, NJ->nConstraints, /*OUT*/p); - bool bBadConstr = p[ABvsCD] > p[ACvsBD] + tolerance || p[ABvsCD] > p[ADvsBC] + tolerance; - bool violateConstraint = false; - int iC; - for (iC=0; iC < NJ->nConstraints; iC++) { - if (SplitViolatesConstraint(profiles, iC)) { - violateConstraint = true; - break; - } - } - splitcount->nSplits++; - if (violateConstraint) - splitcount->nConstraintViolations++; - if 
(badSplit) - splitcount->nBadSplits++; - if (badSplit && bBadConstr) - splitcount->nBadBoth++; - if (badSplit) { - double delta = loglk[choice] - loglk[ABvsCD]; - /* If ABvsCD is favored over the more likely NNI by constraints, - then this is probably a bad split because of the constraint */ - if (p[choice] > p[ABvsCD] + tolerance) - splitcount->dWorstDeltaConstrained = MAX(delta, splitcount->dWorstDeltaConstrained); - else - splitcount->dWorstDeltaUnconstrained = MAX(delta, splitcount->dWorstDeltaUnconstrained); - } - if (nBootstrap>0) - NJ->support[node] = badSplit ? 0.0 : SHSupport(NJ->nPos, nBootstrap, col, loglk, site_likelihoods); - - /* No longer needed */ - DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[2]); - } - traversal = FreeTraversal(traversal,NJ); - upProfiles = FreeUpProfiles(upProfiles,NJ); - if (nBootstrap>0) - col = myfree(col, sizeof(int)*((size_t)NJ->nPos)*nBootstrap); - for (choice = 0; choice < 3; choice++) - site_likelihoods[choice] = myfree(site_likelihoods[choice], sizeof(double)*NJ->nPos); -} - - -void TestSplitsMinEvo(NJ_t *NJ, /*OUT*/SplitCount_t *splitcount) { - const double tolerance = 1e-6; - splitcount->nBadSplits = 0; - splitcount->nConstraintViolations = 0; - splitcount->nBadBoth = 0; - splitcount->nSplits = 0; - splitcount->dWorstDeltaUnconstrained = 0.0; - splitcount->dWorstDeltaConstrained = 0.0; - - profile_t **upProfiles = UpProfiles(NJ); - traversal_t traversal = InitTraversal(NJ); - int node = NJ->root; - - while((node = TraversePostorder(node, NJ, /*IN/OUT*/traversal, /*pUp*/NULL)) >= 0) { - if (node < NJ->nSeq || node == NJ->root) - continue; /* nothing to do for leaves or root */ - - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, /*useML*/false); - - if (verbose>2) - fprintf(stderr,"Testing Split around %d: A=%d B=%d C=%d D=up(%d) or node parent %d\n", - node, 
nodeABCD[0], nodeABCD[1], nodeABCD[2], nodeABCD[3], NJ->parent[node]); - - double d[6]; /* distances, perhaps log-corrected distances, no constraint penalties */ - CorrectedPairDistances(profiles, 4, NJ->distance_matrix, NJ->nPos, /*OUT*/d); - - /* alignment-based scores for each split (lower is better) */ - double sABvsCD = d[qAB] + d[qCD]; - double sACvsBD = d[qAC] + d[qBD]; - double sADvsBC = d[qAD] + d[qBC]; - - /* constraint penalties, indexed by nni_t (lower is better) */ - double p[3]; - QuartetConstraintPenalties(profiles, NJ->nConstraints, /*OUT*/p); - - int nConstraintsViolated = 0; - int iC; - for (iC=0; iC < NJ->nConstraints; iC++) { - if (SplitViolatesConstraint(profiles, iC)) { - nConstraintsViolated++; - if (verbose > 2) { - double penalty[3] = {0.0,0.0,0.0}; - (void)QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/penalty); - fprintf(stderr, "Violate constraint %d at %d (children %d %d) penalties %.3f %.3f %.3f %d/%d %d/%d %d/%d %d/%d\n", - iC, node, NJ->child[node].child[0], NJ->child[node].child[1], - penalty[ABvsCD], penalty[ACvsBD], penalty[ADvsBC], - profiles[0]->nOn[iC], profiles[0]->nOff[iC], - profiles[1]->nOn[iC], profiles[1]->nOff[iC], - profiles[2]->nOn[iC], profiles[2]->nOff[iC], - profiles[3]->nOn[iC], profiles[3]->nOff[iC]); - } - } - } - - double delta = sABvsCD - MIN(sACvsBD,sADvsBC); - bool bBadDist = delta > tolerance; - bool bBadConstr = p[ABvsCD] > p[ACvsBD] + tolerance || p[ABvsCD] > p[ADvsBC] + tolerance; - - splitcount->nSplits++; - if (bBadDist) { - nni_t choice = sACvsBD < sADvsBC ? 
ACvsBD : ADvsBC; - /* If ABvsCD is favored over the shorter NNI by constraints, - then this is probably a bad split because of the constraint */ - if (p[choice] > p[ABvsCD] + tolerance) - splitcount->dWorstDeltaConstrained = MAX(delta, splitcount->dWorstDeltaConstrained); - else - splitcount->dWorstDeltaUnconstrained = MAX(delta, splitcount->dWorstDeltaUnconstrained); - } - - if (nConstraintsViolated > 0) - splitcount->nConstraintViolations++; /* count splits with any violations, not #constraints in a splits */ - if (bBadDist) - splitcount->nBadSplits++; - if (bBadDist && bBadConstr) - splitcount->nBadBoth++; - if (bBadConstr && verbose > 2) { - /* Which NNI would be better */ - double dist_advantage = 0; - double constraint_penalty = 0; - if (p[ACvsBD] < p[ADvsBC]) { - dist_advantage = sACvsBD - sABvsCD; - constraint_penalty = p[ABvsCD] - p[ACvsBD]; - } else { - dist_advantage = sADvsBC - sABvsCD; - constraint_penalty = p[ABvsCD] - p[ADvsBC]; - } - fprintf(stderr, "Violate constraints %d distance_advantage %.3f constraint_penalty %.3f (children %d %d):", - node, dist_advantage, constraint_penalty, - NJ->child[node].child[0], NJ->child[node].child[1]); - /* list the constraints with a penalty, meaning that ABCD all have non-zero - values and that AB|CD worse than others */ - for (iC = 0; iC < NJ->nConstraints; iC++) { - double ppart[6]; - if (QuartetConstraintPenaltiesPiece(profiles, iC, /*OUT*/ppart)) { - if (ppart[qAB] + ppart[qCD] > ppart[qAD] + ppart[qBC] + tolerance - || ppart[qAB] + ppart[qCD] > ppart[qAC] + ppart[qBD] + tolerance) - fprintf(stderr, " %d (%d/%d %d/%d %d/%d %d/%d)", iC, - profiles[0]->nOn[iC], profiles[0]->nOff[iC], - profiles[1]->nOn[iC], profiles[1]->nOff[iC], - profiles[2]->nOn[iC], profiles[2]->nOff[iC], - profiles[3]->nOn[iC], profiles[3]->nOff[iC]); - } - } - fprintf(stderr, "\n"); - } - - /* no longer needed */ - DeleteUpProfile(upProfiles, NJ, nodeABCD[0]); - DeleteUpProfile(upProfiles, NJ, nodeABCD[1]); - } - traversal = 
FreeTraversal(traversal,NJ); - upProfiles = FreeUpProfiles(upProfiles,NJ); -} - -/* Computes support for (A,B),(C,D) compared to that for (A,C),(B,D) and (A,D),(B,C) */ -double SplitSupport(profile_t *pA, profile_t *pB, profile_t *pC, profile_t *pD, - /*OPTIONAL*/distance_matrix_t *dmat, - int nPos, - int nBootstrap, - int *col) { - int i,j; - long lPos = nPos; /* to avoid overflow when multiplying */ - - /* Note distpieces are weighted */ - double *distpieces[6]; - double *weights[6]; - for (j = 0; j < 6; j++) { - distpieces[j] = (double*)mymalloc(sizeof(double)*nPos); - weights[j] = (double*)mymalloc(sizeof(double)*nPos); - } - - int iFreqA = 0; - int iFreqB = 0; - int iFreqC = 0; - int iFreqD = 0; - for (i = 0; i < nPos; i++) { - numeric_t *fA = GET_FREQ(pA, i, /*IN/OUT*/iFreqA); - numeric_t *fB = GET_FREQ(pB, i, /*IN/OUT*/iFreqB); - numeric_t *fC = GET_FREQ(pC, i, /*IN/OUT*/iFreqC); - numeric_t *fD = GET_FREQ(pD, i, /*IN/OUT*/iFreqD); - - weights[qAB][i] = pA->weights[i] * pB->weights[i]; - weights[qAC][i] = pA->weights[i] * pC->weights[i]; - weights[qAD][i] = pA->weights[i] * pD->weights[i]; - weights[qBC][i] = pB->weights[i] * pC->weights[i]; - weights[qBD][i] = pB->weights[i] * pD->weights[i]; - weights[qCD][i] = pC->weights[i] * pD->weights[i]; - - distpieces[qAB][i] = weights[qAB][i] * ProfileDistPiece(pA->codes[i], pB->codes[i], fA, fB, dmat, NULL); - distpieces[qAC][i] = weights[qAC][i] * ProfileDistPiece(pA->codes[i], pC->codes[i], fA, fC, dmat, NULL); - distpieces[qAD][i] = weights[qAD][i] * ProfileDistPiece(pA->codes[i], pD->codes[i], fA, fD, dmat, NULL); - distpieces[qBC][i] = weights[qBC][i] * ProfileDistPiece(pB->codes[i], pC->codes[i], fB, fC, dmat, NULL); - distpieces[qBD][i] = weights[qBD][i] * ProfileDistPiece(pB->codes[i], pD->codes[i], fB, fD, dmat, NULL); - distpieces[qCD][i] = weights[qCD][i] * ProfileDistPiece(pC->codes[i], pD->codes[i], fC, fD, dmat, NULL); - } - assert(iFreqA == pA->nVectors); - assert(iFreqB == pB->nVectors); - 
assert(iFreqC == pC->nVectors); - assert(iFreqD == pD->nVectors); - - double totpieces[6]; - double totweights[6]; - double dists[6]; - for (j = 0; j < 6; j++) { - totpieces[j] = 0.0; - totweights[j] = 0.0; - for (i = 0; i < nPos; i++) { - totpieces[j] += distpieces[j][i]; - totweights[j] += weights[j][i]; - } - dists[j] = totweights[j] > 0.01 ? totpieces[j]/totweights[j] : 3.0; - if (logdist) - dists[j] = LogCorrect(dists[j]); - } - - /* Support1 = Support(AB|CD over AC|BD) = d(A,C)+d(B,D)-d(A,B)-d(C,D) - Support2 = Support(AB|CD over AD|BC) = d(A,D)+d(B,C)-d(A,B)-d(C,D) - */ - double support1 = dists[qAC] + dists[qBD] - dists[qAB] - dists[qCD]; - double support2 = dists[qAD] + dists[qBC] - dists[qAB] - dists[qCD]; - - if (support1 < 0 || support2 < 0) { - nSuboptimalSplits++; /* Another split seems superior */ - } - - assert(nBootstrap > 0); - int nSupport = 0; - - int iBoot; - for (iBoot=0;iBoot<nBootstrap;iBoot++) { - int *colw = &col[lPos*iBoot]; - - for (j = 0; j < 6; j++) { - double totp = 0; - double totw = 0; - double *d = distpieces[j]; - double *w = weights[j]; - for (i=0; i<nPos; i++) { - int c = colw[i]; - totp += d[c]; - totw += w[c]; - } - dists[j] = totw > 0.01 ? totp/totw : 3.0; - if (logdist) - dists[j] = LogCorrect(dists[j]); - } - support1 = dists[qAC] + dists[qBD] - dists[qAB] - dists[qCD]; - support2 = dists[qAD] + dists[qBC] - dists[qAB] - dists[qCD]; - if (support1 > 0 && support2 > 0) - nSupport++; - } /* end loop over bootstrap replicates */ - - for (j = 0; j < 6; j++) { - distpieces[j] = myfree(distpieces[j], sizeof(double)*nPos); - weights[j] = myfree(weights[j], sizeof(double)*nPos); - } - return( nSupport/(double)nBootstrap ); -} - -double SHSupport(int nPos, int nBootstrap, int *col, double loglk[3], double *site_likelihoods[3]) { - long lPos = nPos; /* to avoid overflow when multiplying */ - assert(nBootstrap>0); - double delta1 = loglk[0]-loglk[1]; - double delta2 = loglk[0]-loglk[2]; - double delta = delta1 < delta2 ? 
delta1 : delta2; - - double *siteloglk[3]; - int i,j; - for (i = 0; i < 3; i++) { - siteloglk[i] = mymalloc(sizeof(double)*nPos); - for (j = 0; j < nPos; j++) - siteloglk[i][j] = log(site_likelihoods[i][j]); - } - - int nSupport = 0; - int iBoot; - for (iBoot = 0; iBoot < nBootstrap; iBoot++) { - double resampled[3]; - for (i = 0; i < 3; i++) - resampled[i] = -loglk[i]; - for (j = 0; j < nPos; j++) { - int pos = col[iBoot*lPos+j]; - for (i = 0; i < 3; i++) - resampled[i] += siteloglk[i][pos]; - } - int iBest = 0; - for (i = 1; i < 3; i++) - if (resampled[i] > resampled[iBest]) - iBest = i; - double resample1 = resampled[iBest] - resampled[(iBest+1)%3]; - double resample2 = resampled[iBest] - resampled[(iBest+2)%3]; - double resampleDelta = resample1 < resample2 ? resample1 : resample2; - if (resampleDelta < delta) - nSupport++; - } - for (i=0;i<3;i++) - siteloglk[i] = myfree(siteloglk[i], sizeof(double)*nPos); - return(nSupport/(double)nBootstrap); -} - - -void SetDistCriterion(/*IN/OUT*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *hit) { - if (hit->i < NJ->nSeq && hit->j < NJ->nSeq) { - SeqDist(NJ->profiles[hit->i]->codes, - NJ->profiles[hit->j]->codes, - NJ->nPos, NJ->distance_matrix, /*OUT*/hit); - } else { - ProfileDist(NJ->profiles[hit->i], - NJ->profiles[hit->j], - NJ->nPos, NJ->distance_matrix, /*OUT*/hit); - hit->dist -= (NJ->diameter[hit->i] + NJ->diameter[hit->j]); - } - hit->dist += constraintWeight - * (double)JoinConstraintPenalty(NJ, hit->i, hit->j); - SetCriterion(NJ,nActive,/*IN/OUT*/hit); -} - -void SetCriterion(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *join) { - if(join->i < 0 - || join->j < 0 - || NJ->parent[join->i] >= 0 - || NJ->parent[join->j] >= 0) - return; - assert(NJ->nOutDistActive[join->i] >= nActive); - assert(NJ->nOutDistActive[join->j] >= nActive); - - int nDiffAllow = tophitsMult > 0 ? 
(int)(nActive*staleOutLimit) : 0; - if (NJ->nOutDistActive[join->i] - nActive > nDiffAllow) - SetOutDistance(NJ, join->i, nActive); - if (NJ->nOutDistActive[join->j] - nActive > nDiffAllow) - SetOutDistance(NJ, join->j, nActive); - double outI = NJ->outDistances[join->i]; - if (NJ->nOutDistActive[join->i] != nActive) - outI *= (nActive-1)/(double)(NJ->nOutDistActive[join->i]-1); - double outJ = NJ->outDistances[join->j]; - if (NJ->nOutDistActive[join->j] != nActive) - outJ *= (nActive-1)/(double)(NJ->nOutDistActive[join->j]-1); - join->criterion = join->dist - (outI+outJ)/(double)(nActive-2); - if (verbose > 2 && nActive <= 5) { - fprintf(stderr, "Set Criterion to join %d %d with nActive=%d dist+penalty %.3f criterion %.3f\n", - join->i, join->j, nActive, join->dist, join->criterion); - } -} - -void SetOutDistance(NJ_t *NJ, int iNode, int nActive) { - if (NJ->nOutDistActive[iNode] == nActive) - return; - - /* May be called by InitNJ before we have parents */ - assert(iNode>=0 && (NJ->parent == NULL || NJ->parent[iNode]<0)); - besthit_t dist; - ProfileDist(NJ->profiles[iNode], NJ->outprofile, NJ->nPos, NJ->distance_matrix, &dist); - outprofileOps++; - - /* out(A) = sum(X!=A) d(A,X) - = sum(X!=A) (profiledist(A,X) - diam(A) - diam(X)) - = sum(X!=A) profiledist(A,X) - (N-1)*diam(A) - (totdiam - diam(A)) - - in the absence of gaps: - profiledist(A,out) = mean profiledist(A, all active nodes) - sum(X!=A) profiledist(A,X) = N * profiledist(A,out) - profiledist(A,A) - - With gaps, we need to take the weights of the comparisons into account, where - w(Ai) is the weight of position i in profile A: - w(A,B) = sum_i w(Ai) * w(Bi) - d(A,B) = sum_i w(Ai) * w(Bi) * d(Ai,Bi) / w(A,B) - - sum(X!=A) profiledist(A,X) ~= (N-1) * profiledist(A, Out w/o A) - profiledist(A, Out w/o A) = sum_X!=A sum_i d(Ai,Xi) * w(Ai) * w(Bi) / ( sum_X!=A sum_i w(Ai) * w(Bi) ) - d(A, Out) = sum_A sum_i d(Ai,Xi) * w(Ai) * w(Bi) / ( sum_X sum_i w(Ai) * w(Bi) ) - - and so we get - profiledist(A,out w/o A) 
= (top of d(A,Out) - top of d(A,A)) / (weight of d(A,Out) - weight of d(A,A)) - top = dist * weight - with another correction of nActive because the weight of the out-profile is the average - weight not the total weight. - */ - double top = (nActive-1) - * (dist.dist * dist.weight * nActive - NJ->selfweight[iNode] * NJ->selfdist[iNode]); - double bottom = (dist.weight * nActive - NJ->selfweight[iNode]); - double pdistOutWithoutA = top/bottom; - NJ->outDistances[iNode] = bottom > 0.01 ? - pdistOutWithoutA - NJ->diameter[iNode] * (nActive-1) - (NJ->totdiam - NJ->diameter[iNode]) - : 3.0; - NJ->nOutDistActive[iNode] = nActive; - - if(verbose>3 && iNode < 5) - fprintf(stderr,"NewOutDist for %d %f from dist %f selfd %f diam %f totdiam %f newActive %d\n", - iNode, NJ->outDistances[iNode], dist.dist, NJ->selfdist[iNode], NJ->diameter[iNode], - NJ->totdiam, nActive); - if (verbose>6 && (iNode % 10) == 0) { - /* Compute the actual out-distance and compare */ - double total = 0.0; - double total_pd = 0.0; - int j; - for (j=0;j<NJ->maxnode;j++) { - if (j!=iNode && (NJ->parent==NULL || NJ->parent[j]<0)) { - besthit_t bh; - ProfileDist(NJ->profiles[iNode], NJ->profiles[j], NJ->nPos, NJ->distance_matrix, /*OUT*/&bh); - total_pd += bh.dist; - total += bh.dist - (NJ->diameter[iNode] + NJ->diameter[j]); - } - } - fprintf(stderr,"OutDist for Node %d %f truth %f profiled %f truth %f pd_err %f\n", - iNode, NJ->outDistances[iNode], total, pdistOutWithoutA, total_pd,fabs(pdistOutWithoutA-total_pd)); - } -} - -top_hits_t *FreeTopHits(top_hits_t *tophits) { - if (tophits == NULL) - return(NULL); - int iNode; - for (iNode = 0; iNode < tophits->maxnodes; iNode++) { - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - if (l->hits != NULL) - l->hits = myfree(l->hits, sizeof(hit_t) * l->nHits); - } - tophits->top_hits_lists = myfree(tophits->top_hits_lists, sizeof(top_hits_list_t) * tophits->maxnodes); - tophits->visible = myfree(tophits->visible, sizeof(hit_t*) * tophits->maxnodes); - 
tophits->topvisible = myfree(tophits->topvisible, sizeof(int) * tophits->nTopVisible); -#ifdef OPENMP - for (iNode = 0; iNode < tophits->maxnodes; iNode++) - omp_destroy_lock(&tophits->locks[iNode]); - tophits->locks = myfree(tophits->locks, sizeof(omp_lock_t) * tophits->maxnodes); -#endif - return(myfree(tophits, sizeof(top_hits_t))); -} - -top_hits_t *InitTopHits(NJ_t *NJ, int m) { - int iNode; - assert(m > 0); - top_hits_t *tophits = mymalloc(sizeof(top_hits_t)); - tophits->m = m; - tophits->q = (int)(0.5 + tophits2Mult * sqrt(tophits->m)); - if (!useTopHits2nd || tophits->q >= tophits->m) - tophits->q = 0; - tophits->maxnodes = NJ->maxnodes; - tophits->top_hits_lists = mymalloc(sizeof(top_hits_list_t) * tophits->maxnodes); - tophits->visible = mymalloc(sizeof(hit_t) * tophits->maxnodes); - tophits->nTopVisible = (int)(0.5 + topvisibleMult*m); - tophits->topvisible = mymalloc(sizeof(int) * tophits->nTopVisible); -#ifdef OPENMP - tophits->locks = mymalloc(sizeof(omp_lock_t) * tophits->maxnodes); - for (iNode = 0; iNode < tophits->maxnodes; iNode++) - omp_init_lock(&tophits->locks[iNode]); -#endif - int i; - for (i = 0; i < tophits->nTopVisible; i++) - tophits->topvisible[i] = -1; /* empty */ - tophits->topvisibleAge = 0; - - for (iNode = 0; iNode < tophits->maxnodes; iNode++) { - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - l->nHits = 0; - l->hits = NULL; - l->hitSource = -1; - l->age = 0; - hit_t *v = &tophits->visible[iNode]; - v->j = -1; - v->dist = 1e20; - } - return(tophits); -} - -/* Helper function for sorting in SetAllLeafTopHits, - and the global variables it needs -*/ -NJ_t *CompareSeedNJ = NULL; -int *CompareSeedGaps = NULL; -int CompareSeeds(const void *c1, const void *c2) { - int seed1 = *(int *)c1; - int seed2 = *(int *)c2; - int gapdiff = CompareSeedGaps[seed1] - CompareSeedGaps[seed2]; - if (gapdiff != 0) return(gapdiff); /* fewer gaps is better */ - double outdiff = CompareSeedNJ->outDistances[seed1] - 
CompareSeedNJ->outDistances[seed2]; - if(outdiff < 0) return(-1); /* closer to more nodes is better */ - if(outdiff > 0) return(1); - return(0); -} - -/* Using the seed heuristic and the close global variable */ -void SetAllLeafTopHits(/*IN/UPDATE*/NJ_t *NJ, /*IN/OUT*/top_hits_t *tophits) { - double close = tophitsClose; - if (close < 0) { - if (fastest && NJ->nSeq >= 50000) { - close = 0.99; - } else { - double logN = log((double)NJ->nSeq)/log(2.0); - close = logN/(logN+2.0); - } - } - /* Sort the potential seeds, by a combination of nGaps and NJ->outDistances - We don't store nGaps so we need to compute that - */ - int *nGaps = (int*)mymalloc(sizeof(int)*NJ->nSeq); - int iNode; - for(iNode=0; iNode<NJ->nSeq; iNode++) { - nGaps[iNode] = (int)(0.5 + NJ->nPos - NJ->selfweight[iNode]); - } - int *seeds = (int*)mymalloc(sizeof(int)*NJ->nSeq); - for (iNode=0; iNode<NJ->nSeq; iNode++) seeds[iNode] = iNode; - CompareSeedNJ = NJ; - CompareSeedGaps = nGaps; - qsort(/*IN/OUT*/seeds, NJ->nSeq, sizeof(int), CompareSeeds); - CompareSeedNJ = NULL; - CompareSeedGaps = NULL; - - /* For each seed, save its top 2*m hits and then look for close neighbors */ - assert(2 * tophits->m <= NJ->nSeq); - int iSeed; - int nHasTopHits = 0; -#ifdef OPENMP - #pragma omp parallel for schedule(dynamic, 50) -#endif - for(iSeed=0; iSeed < NJ->nSeq; iSeed++) { - int seed = seeds[iSeed]; - if (iSeed > 0 && (iSeed % 100) == 0) { -#ifdef OPENMP - #pragma omp critical -#endif - ProgressReport("Top hits for %6d of %6d seqs (at seed %6d)", - nHasTopHits, NJ->nSeq, - iSeed, 0); - } - if (tophits->top_hits_lists[seed].nHits > 0) { - if(verbose>2) fprintf(stderr, "Skipping seed %d\n", seed); - continue; - } - - besthit_t *besthitsSeed = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->nSeq); - besthit_t *besthitsNeighbor = (besthit_t*)mymalloc(sizeof(besthit_t) * 2 * tophits->m); - besthit_t bestjoin; - - if(verbose>2) fprintf(stderr,"Trying seed %d\n", seed); - SetBestHit(seed, NJ, /*nActive*/NJ->nSeq, 
/*OUT*/&bestjoin, /*OUT*/besthitsSeed); - - /* sort & save top hits of self. besthitsSeed is now sorted. */ - SortSaveBestHits(seed, /*IN/SORT*/besthitsSeed, /*IN-SIZE*/NJ->nSeq, - /*OUT-SIZE*/tophits->m, /*IN/OUT*/tophits); - nHasTopHits++; - - /* find "close" neighbors and compute their top hits */ - double neardist = besthitsSeed[2 * tophits->m - 1].dist * close; - /* must have at least average weight, rem higher is better - and allow a bit more than average, e.g. if we are looking for within 30% away, - 20% more gaps than usual seems OK - Alternatively, have a coverage requirement in case neighbor is short - If fastest, consider the top q/2 hits to be close neighbors, regardless - */ - double nearweight = 0; - int iClose; - for (iClose = 0; iClose < 2 * tophits->m; iClose++) - nearweight += besthitsSeed[iClose].weight; - nearweight = nearweight/(2.0 * tophits->m); /* average */ - nearweight *= (1.0-2.0*neardist/3.0); - double nearcover = 1.0 - neardist/2.0; - - if(verbose>2) fprintf(stderr,"Distance limit for close neighbors %f weight %f ungapped %d\n", - neardist, nearweight, NJ->nPos-nGaps[seed]); - for (iClose = 0; iClose < tophits->m; iClose++) { - besthit_t *closehit = &besthitsSeed[iClose]; - int closeNode = closehit->j; - if (tophits->top_hits_lists[closeNode].nHits > 0) - continue; - - /* If within close-distance, or identical, use as close neighbor */ - bool close = closehit->dist <= neardist - && (closehit->weight >= nearweight - || closehit->weight >= (NJ->nPos-nGaps[closeNode])*nearcover); - bool identical = closehit->dist < 1e-6 - && fabs(closehit->weight - (NJ->nPos - nGaps[seed])) < 1e-5 - && fabs(closehit->weight - (NJ->nPos - nGaps[closeNode])) < 1e-5; - if (useTopHits2nd && iClose < tophits->q && (close || identical)) { - nHasTopHits++; - nClose2Used++; - int nUse = MIN(tophits->q * tophits2Safety, 2 * tophits->m); - besthit_t *besthitsClose = mymalloc(sizeof(besthit_t) * nUse); - TransferBestHits(NJ, /*nActive*/NJ->nSeq, - closeNode, - 
/*IN*/besthitsSeed, /*SIZE*/nUse, - /*OUT*/besthitsClose, - /*updateDistance*/true); - SortSaveBestHits(closeNode, /*IN/SORT*/besthitsClose, - /*IN-SIZE*/nUse, /*OUT-SIZE*/tophits->q, - /*IN/OUT*/tophits); - tophits->top_hits_lists[closeNode].hitSource = seed; - besthitsClose = myfree(besthitsClose, sizeof(besthit_t) * nUse); - } else if (close || identical || (fastest && iClose < (tophits->q+1)/2)) { - nHasTopHits++; - nCloseUsed++; - if(verbose>2) fprintf(stderr, "Near neighbor %d (rank %d weight %f ungapped %d %d)\n", - closeNode, iClose, besthitsSeed[iClose].weight, - NJ->nPos-nGaps[seed], - NJ->nPos-nGaps[closeNode]); - - /* compute top 2*m hits */ - TransferBestHits(NJ, /*nActive*/NJ->nSeq, - closeNode, - /*IN*/besthitsSeed, /*SIZE*/2 * tophits->m, - /*OUT*/besthitsNeighbor, - /*updateDistance*/true); - SortSaveBestHits(closeNode, /*IN/SORT*/besthitsNeighbor, - /*IN-SIZE*/2 * tophits->m, /*OUT-SIZE*/tophits->m, - /*IN/OUT*/tophits); - - /* And then try for a second level of transfer. We assume we - are in a good area, because of the 1st - level of transfer, and in a small neighborhood, because q is - small (32 for 1 million sequences), so we do not make any close checks. 
- */ - int iClose2; - for (iClose2 = 0; iClose2 < tophits->q && iClose2 < 2 * tophits->m; iClose2++) { - int closeNode2 = besthitsNeighbor[iClose2].j; - assert(closeNode2 >= 0); - if (tophits->top_hits_lists[closeNode2].hits == NULL) { - nClose2Used++; - nHasTopHits++; - int nUse = MIN(tophits->q * tophits2Safety, 2 * tophits->m); - besthit_t *besthitsClose2 = mymalloc(sizeof(besthit_t) * nUse); - TransferBestHits(NJ, /*nActive*/NJ->nSeq, - closeNode2, - /*IN*/besthitsNeighbor, /*SIZE*/nUse, - /*OUT*/besthitsClose2, - /*updateDistance*/true); - SortSaveBestHits(closeNode2, /*IN/SORT*/besthitsClose2, - /*IN-SIZE*/nUse, /*OUT-SIZE*/tophits->q, - /*IN/OUT*/tophits); - tophits->top_hits_lists[closeNode2].hitSource = closeNode; - besthitsClose2 = myfree(besthitsClose2, sizeof(besthit_t) * nUse); - } /* end if should do 2nd-level transfer */ - } - } - } /* end loop over close candidates */ - besthitsSeed = myfree(besthitsSeed, sizeof(besthit_t)*NJ->nSeq); - besthitsNeighbor = myfree(besthitsNeighbor, sizeof(besthit_t) * 2 * tophits->m); - } /* end loop over seeds */ - - for (iNode=0; iNode<NJ->nSeq; iNode++) { - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - assert(l->hits != NULL); - assert(l->hits[0].j >= 0); - assert(l->hits[0].j < NJ->nSeq); - assert(l->hits[0].j != iNode); - tophits->visible[iNode] = l->hits[0]; - } - - if (verbose >= 2) fprintf(stderr, "#Close neighbors among leaves: 1st-level %ld 2nd-level %ld seeds %ld\n", - nCloseUsed, nClose2Used, NJ->nSeq-nCloseUsed-nClose2Used); - nGaps = myfree(nGaps, sizeof(int)*NJ->nSeq); - seeds = myfree(seeds, sizeof(int)*NJ->nSeq); - - /* Now add a "checking phase" where we ensure that the q or 2*sqrt(m) hits - of i are represented in j (if they should be) - */ - long lReplace = 0; - int nCheck = tophits->q > 0 ? 
tophits->q : (int)(0.5 + 2.0*sqrt(tophits->m)); - for (iNode = 0; iNode < NJ->nSeq; iNode++) { - if ((iNode % 100) == 0) - ProgressReport("Checking top hits for %6d of %6d seqs", - iNode+1, NJ->nSeq, 0, 0); - top_hits_list_t *lNode = &tophits->top_hits_lists[iNode]; - int iHit; - for (iHit = 0; iHit < nCheck && iHit < lNode->nHits; iHit++) { - besthit_t bh = HitToBestHit(iNode, lNode->hits[iHit]); - SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bh); - top_hits_list_t *lTarget = &tophits->top_hits_lists[bh.j]; - - /* If this criterion is worse than the nCheck-1 entry of the target, - then skip the check. - This logic is based on assuming that the list is sorted, - which is true initially but may not be true later. - Still, is a good heuristic. - */ - assert(nCheck > 0); - assert(nCheck <= lTarget->nHits); - besthit_t bhCheck = HitToBestHit(bh.j, lTarget->hits[nCheck-1]); - SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bhCheck); - if (bhCheck.criterion < bh.criterion) - continue; /* no check needed */ - - /* Check if this is present in the top-hit list */ - int iHit2; - bool bFound = false; - for (iHit2 = 0; iHit2 < lTarget->nHits && !bFound; iHit2++) - if (lTarget->hits[iHit2].j == iNode) - bFound = true; - if (!bFound) { - /* Find the hit with the worst criterion and replace it with this one */ - int iWorst = -1; - double dWorstCriterion = -1e20; - for (iHit2 = 0; iHit2 < lTarget->nHits; iHit2++) { - besthit_t bh2 = HitToBestHit(bh.j, lTarget->hits[iHit2]); - SetCriterion(NJ, /*nActive*/NJ->nSeq, /*IN/OUT*/&bh2); - if (bh2.criterion > dWorstCriterion) { - iWorst = iHit2; - dWorstCriterion = bh2.criterion; - } - } - if (dWorstCriterion > bh.criterion) { - assert(iWorst >= 0); - lTarget->hits[iWorst].j = iNode; - lTarget->hits[iWorst].dist = bh.dist; - lReplace++; - /* and perhaps update visible */ - besthit_t v; - bool bSuccess = GetVisible(NJ, /*nActive*/NJ->nSeq, tophits, bh.j, /*OUT*/&v); - assert(bSuccess); - if (bh.criterion < v.criterion) - 
tophits->visible[bh.j] = lTarget->hits[iWorst]; - } - } - } - } - - if (verbose >= 2) - fprintf(stderr, "Replaced %ld top hit entries\n", lReplace); -} - -/* Updates out-distances but does not reset or update visible set */ -void GetBestFromTopHits(int iNode, - /*IN/UPDATE*/NJ_t *NJ, - int nActive, - /*IN*/top_hits_t *tophits, - /*OUT*/besthit_t *bestjoin) { - assert(iNode >= 0); - assert(NJ->parent[iNode] < 0); - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - assert(l->nHits > 0); - assert(l->hits != NULL); - - if(!fastest) - SetOutDistance(NJ, iNode, nActive); /* ensure out-distances are not stale */ - - bestjoin->i = -1; - bestjoin->j = -1; - bestjoin->dist = 1e20; - bestjoin->criterion = 1e20; - - int iBest; - for(iBest=0; iBest < l->nHits; iBest++) { - besthit_t bh = HitToBestHit(iNode, l->hits[iBest]); - if (UpdateBestHit(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&bh, /*update dist*/true)) { - SetCriterion(/*IN/OUT*/NJ, nActive, /*IN/OUT*/&bh); /* make sure criterion is correct */ - if (bh.criterion < bestjoin->criterion) - *bestjoin = bh; - } - } - assert(bestjoin->j >= 0); /* a hit was found */ - assert(bestjoin->i == iNode); -} - -int ActiveAncestor(/*IN*/NJ_t *NJ, int iNode) { - if (iNode < 0) - return(iNode); - while(NJ->parent[iNode] >= 0) - iNode = NJ->parent[iNode]; - return(iNode); -} - -bool UpdateBestHit(/*IN/UPDATE*/NJ_t *NJ, int nActive, /*IN/OUT*/besthit_t *hit, - bool bUpdateDist) { - int i = ActiveAncestor(/*IN*/NJ, hit->i); - int j = ActiveAncestor(/*IN*/NJ, hit->j); - if (i < 0 || j < 0 || i == j) { - hit->i = -1; - hit->j = -1; - hit->weight = 0; - hit->dist = 1e20; - hit->criterion = 1e20; - return(false); - } - if (i != hit->i || j != hit->j) { - hit->i = i; - hit->j = j; - if (bUpdateDist) { - SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); - } else { - hit->dist = -1e20; - hit->criterion = 1e20; - } - } - return(true); -} - -bool GetVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN/OUT*/top_hits_t *tophits, - int iNode, 
/*OUT*/besthit_t *visible) { - if (iNode < 0 || NJ->parent[iNode] >= 0) - return(false); - hit_t *v = &tophits->visible[iNode]; - if (v->j < 0 || NJ->parent[v->j] >= 0) - return(false); - *visible = HitToBestHit(iNode, *v); - SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/visible); - return(true); -} - -besthit_t *UniqueBestHits(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN/SORT*/besthit_t *combined, int nCombined, - /*OUT*/int *nUniqueOut) { - int iHit; - for (iHit = 0; iHit < nCombined; iHit++) { - besthit_t *hit = &combined[iHit]; - UpdateBestHit(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit, /*update*/false); - } - qsort(/*IN/OUT*/combined, nCombined, sizeof(besthit_t), CompareHitsByIJ); - - besthit_t *uniqueList = (besthit_t*)mymalloc(sizeof(besthit_t)*nCombined); - int nUnique = 0; - int iSavedLast = -1; - - /* First build the new list */ - for (iHit = 0; iHit < nCombined; iHit++) { - besthit_t *hit = &combined[iHit]; - if (hit->i < 0 || hit->j < 0) - continue; - if (iSavedLast >= 0) { - /* toss out duplicates */ - besthit_t *saved = &combined[iSavedLast]; - if (saved->i == hit->i && saved->j == hit->j) - continue; - } - assert(nUnique < nCombined); - assert(hit->j >= 0 && NJ->parent[hit->j] < 0); - uniqueList[nUnique++] = *hit; - iSavedLast = iHit; - } - *nUniqueOut = nUnique; - - /* Then do any updates to the criterion or the distances in parallel */ -#ifdef OPENMP - #pragma omp parallel for schedule(dynamic, 50) -#endif - for (iHit = 0; iHit < nUnique; iHit++) { - besthit_t *hit = &uniqueList[iHit]; - if (hit->dist < 0.0) - SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); - else - SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/hit); - } - return(uniqueList); -} - -/* - Create a top hit list for the new node, either - from children (if there are enough best hits left) or by a "refresh" - Also set visible set for newnode - Also update visible set for other nodes if we stumble across a "better" hit -*/ - -void TopHitJoin(int newnode, - /*IN/UPDATE*/NJ_t 
*NJ, - int nActive, - /*IN/OUT*/top_hits_t *tophits) { - long startProfileOps = profileOps; - long startOutProfileOps = outprofileOps; - assert(NJ->child[newnode].nChild == 2); - top_hits_list_t *lNew = &tophits->top_hits_lists[newnode]; - assert(lNew->hits == NULL); - - /* Copy the hits */ - int i; - top_hits_list_t *lChild[2]; - for (i = 0; i< 2; i++) { - lChild[i] = &tophits->top_hits_lists[NJ->child[newnode].child[i]]; - assert(lChild[i]->hits != NULL && lChild[i]->nHits > 0); - } - int nCombined = lChild[0]->nHits + lChild[1]->nHits; - besthit_t *combinedList = (besthit_t*)mymalloc(sizeof(besthit_t)*nCombined); - HitsToBestHits(lChild[0]->hits, lChild[0]->nHits, NJ->child[newnode].child[0], - /*OUT*/combinedList); - HitsToBestHits(lChild[1]->hits, lChild[1]->nHits, NJ->child[newnode].child[1], - /*OUT*/combinedList + lChild[0]->nHits); - int nUnique; - /* UniqueBestHits() replaces children (used in the calls to HitsToBestHits) - with active ancestors, so all distances & criteria will be recomputed */ - besthit_t *uniqueList = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, - /*IN/SORT*/combinedList, - nCombined, - /*OUT*/&nUnique); - int nUniqueAlloc = nCombined; - combinedList = myfree(combinedList, sizeof(besthit_t)*nCombined); - - /* Forget the top-hit lists of the joined nodes */ - for (i = 0; i < 2; i++) { - lChild[i]->hits = myfree(lChild[i]->hits, sizeof(hit_t) * lChild[i]->nHits); - lChild[i]->nHits = 0; - } - - /* Use the average age, rounded up, by 1 Versions 2.0 and earlier - used the maximum age, which leads to more refreshes without - improving the accuracy of the NJ phase. Intuitively, if one of - them was just refreshed then another refresh is unlikely to help. - */ - lNew->age = (lChild[0]->age+lChild[1]->age+1)/2 + 1; - - /* If top hit ages always match (perfectly balanced), then a - limit of log2(m) would mean a refresh after - m joins, which is about what we want. 
- */ - int tophitAgeLimit = MAX(1, (int)(0.5 + log((double)tophits->m)/log(2.0))); - - /* Either use the merged list as candidate top hits, or - move from 2nd level to 1st level, or do a refresh - UniqueBestHits eliminates hits to self, so if nUnique==nActive-1, - we've already done the exhaustive search. - - Either way, we set tophits, visible(newnode), update visible of its top hits, - and modify topvisible: if we do a refresh, then we reset it, otherwise we update - */ - bool bSecondLevel = lChild[0]->hitSource >= 0 && lChild[1]->hitSource >= 0; - bool bUseUnique = nUnique==nActive-1 - || (lNew->age <= tophitAgeLimit - && nUnique >= (bSecondLevel ? (int)(0.5 + tophits2Refresh * tophits->q) - : (int)(0.5 + tophits->m * tophitsRefresh) )); - if (bUseUnique && verbose > 2) - fprintf(stderr,"Top hits for %d from combined %d nActive=%d tophitsage %d %s\n", - newnode,nUnique,nActive,lNew->age, - bSecondLevel ? "2ndlevel" : "1stlevel"); - - if (!bUseUnique - && bSecondLevel - && lNew->age <= tophitAgeLimit) { - int source = ActiveAncestor(NJ, lChild[0]->hitSource); - if (source == newnode) - source = ActiveAncestor(NJ, lChild[1]->hitSource); - /* In parallel mode, it is possible that we would select a node as the - hit-source and then over-write that top hit with a short list. - So we need this sanity check. 
- */ - if (source != newnode - && source >= 0 - && tophits->top_hits_lists[source].hitSource < 0) { - - /* switch from 2nd-level to 1st-level top hits -- compute top hits list - of node from what we have so far plus the active source plus its top hits */ - top_hits_list_t *lSource = &tophits->top_hits_lists[source]; - assert(lSource->hitSource < 0); - assert(lSource->nHits > 0); - int nMerge = 1 + lSource->nHits + nUnique; - besthit_t *mergeList = mymalloc(sizeof(besthit_t) * nMerge); - memcpy(/*to*/mergeList, /*from*/uniqueList, nUnique * sizeof(besthit_t)); - - int iMerge = nUnique; - mergeList[iMerge].i = newnode; - mergeList[iMerge].j = source; - SetDistCriterion(NJ, nActive, /*IN/OUT*/&mergeList[iMerge]); - iMerge++; - HitsToBestHits(lSource->hits, lSource->nHits, newnode, /*OUT*/mergeList+iMerge); - for (i = 0; i < lSource->nHits; i++) { - SetDistCriterion(NJ, nActive, /*IN/OUT*/&mergeList[iMerge]); - iMerge++; - } - assert(iMerge == nMerge); - - uniqueList = myfree(uniqueList, nUniqueAlloc * sizeof(besthit_t)); - uniqueList = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, - /*IN/SORT*/mergeList, - nMerge, - /*OUT*/&nUnique); - nUniqueAlloc = nMerge; - mergeList = myfree(mergeList, sizeof(besthit_t)*nMerge); - - assert(nUnique > 0); - bUseUnique = nUnique >= (int)(0.5 + tophits->m * tophitsRefresh); - bSecondLevel = false; - - if (bUseUnique && verbose > 2) - fprintf(stderr, "Top hits for %d from children and source %d's %d hits, nUnique %d\n", - newnode, source, lSource->nHits, nUnique); - } - } - - if (bUseUnique) { - if (bSecondLevel) { - /* pick arbitrarily */ - lNew->hitSource = lChild[0]->hitSource; - } - int nSave = MIN(nUnique, bSecondLevel ? 
tophits->q : tophits->m); - assert(nSave>0); - if (verbose > 2) - fprintf(stderr, "Combined %d ops so far %ld\n", nUnique, profileOps - startProfileOps); - SortSaveBestHits(newnode, /*IN/SORT*/uniqueList, /*nIn*/nUnique, - /*nOut*/nSave, /*IN/OUT*/tophits); - assert(lNew->hits != NULL); /* set by sort/save */ - tophits->visible[newnode] = lNew->hits[0]; - UpdateTopVisible(/*IN*/NJ, nActive, newnode, &tophits->visible[newnode], - /*IN/OUT*/tophits); - UpdateVisible(/*IN/UPDATE*/NJ, nActive, /*IN*/uniqueList, nSave, /*IN/OUT*/tophits); - } else { - /* need to refresh: set top hits for node and for its top hits */ - if(verbose > 2) fprintf(stderr,"Top hits for %d by refresh (%d unique age %d) nActive=%d\n", - newnode,nUnique,lNew->age,nActive); - nRefreshTopHits++; - lNew->age = 0; - - int iNode; - /* ensure all out-distances are up to date ahead of time - to avoid any data overwriting issues. - */ -#ifdef OPENMP - #pragma omp parallel for schedule(dynamic, 50) -#endif - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - if (NJ->parent[iNode] < 0) { - if (fastest) { - besthit_t bh; - bh.i = iNode; - bh.j = iNode; - bh.dist = 0; - SetCriterion(/*IN/UPDATE*/NJ, nActive, &bh); - } else { - SetOutDistance(/*IN/UDPATE*/NJ, iNode, nActive); - } - } - } - - /* exhaustively get the best 2*m hits for newnode, set visible, and save the top m */ - besthit_t *allhits = (besthit_t*)mymalloc(sizeof(besthit_t)*NJ->maxnode); - assert(2 * tophits->m <= NJ->maxnode); - besthit_t bh; - SetBestHit(newnode, NJ, nActive, /*OUT*/&bh, /*OUT*/allhits); - qsort(/*IN/OUT*/allhits, NJ->maxnode, sizeof(besthit_t), CompareHitsByCriterion); - SortSaveBestHits(newnode, /*IN/SORT*/allhits, /*nIn*/NJ->maxnode, - /*nOut*/tophits->m, /*IN/OUT*/tophits); - - /* Do not need to call UpdateVisible because we set visible below */ - - /* And use the top 2*m entries to expand other best-hit lists, but only for top m */ - int iHit; -#ifdef OPENMP - #pragma omp parallel for schedule(dynamic, 50) -#endif - for 
(iHit=0; iHit < tophits->m; iHit++) { - if (allhits[iHit].i < 0) continue; - int iNode = allhits[iHit].j; - assert(iNode>=0); - if (NJ->parent[iNode] >= 0) continue; - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - int nHitsOld = l->nHits; - assert(nHitsOld <= tophits->m); - l->age = 0; - - /* Merge: old hits into 0->nHitsOld and hits from iNode above that */ - besthit_t *bothList = (besthit_t*)mymalloc(sizeof(besthit_t) * 3 * tophits->m); - HitsToBestHits(/*IN*/l->hits, nHitsOld, iNode, /*OUT*/bothList); /* does not compute criterion */ - for (i = 0; i < nHitsOld; i++) - SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&bothList[i]); - if (nActive <= 2 * tophits->m) - l->hitSource = -1; /* abandon the 2nd-level top-hits heuristic */ - int nNewHits = l->hitSource >= 0 ? tophits->q : tophits->m; - assert(nNewHits > 0); - - TransferBestHits(/*IN/UPDATE*/NJ, nActive, iNode, - /*IN*/allhits, /*nOldHits*/2 * nNewHits, - /*OUT*/&bothList[nHitsOld], - /*updateDist*/false); /* rely on UniqueBestHits to update dist and/or criterion */ - int nUnique2; - besthit_t *uniqueList2 = UniqueBestHits(/*IN/UPDATE*/NJ, nActive, - /*IN/SORT*/bothList, nHitsOld + 2 * nNewHits, - /*OUT*/&nUnique2); - assert(nUnique2 > 0); - bothList = myfree(bothList,3 * tophits->m * sizeof(besthit_t)); - - /* Note this will overwrite l, but we saved nHitsOld */ - SortSaveBestHits(iNode, /*IN/SORT*/uniqueList2, /*nIn*/nUnique2, - /*nOut*/nNewHits, /*IN/OUT*/tophits); - /* will update topvisible below */ - tophits->visible[iNode] = tophits->top_hits_lists[iNode].hits[0]; - uniqueList2 = myfree(uniqueList2, (nHitsOld + 2 * tophits->m) * sizeof(besthit_t)); - } - - ResetTopVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits); /* outside of the parallel phase */ - allhits = myfree(allhits,sizeof(besthit_t)*NJ->maxnode); - } - uniqueList = myfree(uniqueList, nUniqueAlloc * sizeof(besthit_t)); - if (verbose > 2) { - fprintf(stderr, "New top-hit list for %d profile-ops %ld (out-ops %ld): source %d age 
%d members ", - newnode, - profileOps - startProfileOps, - outprofileOps - startOutProfileOps, - lNew->hitSource, lNew->age); - - int i; - for (i = 0; i < lNew->nHits; i++) - fprintf(stderr, " %d", lNew->hits[i].j); - fprintf(stderr,"\n"); - } -} - -void UpdateVisible(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN*/besthit_t *tophitsNode, - int nTopHits, - /*IN/OUT*/top_hits_t *tophits) { - int iHit; - - for(iHit = 0; iHit < nTopHits; iHit++) { - besthit_t *hit = &tophitsNode[iHit]; - if (hit->i < 0) continue; /* possible empty entries */ - assert(NJ->parent[hit->i] < 0); - assert(hit->j >= 0 && NJ->parent[hit->j] < 0); - besthit_t visible; - bool bSuccess = GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, hit->j, /*OUT*/&visible); - if (!bSuccess || hit->criterion < visible.criterion) { - if (bSuccess) - nVisibleUpdate++; - hit_t *v = &tophits->visible[hit->j]; - v->j = hit->i; - v->dist = hit->dist; - UpdateTopVisible(NJ, nActive, hit->j, v, /*IN/OUT*/tophits); - if(verbose>5) fprintf(stderr,"NewVisible %d %d %f\n", - hit->j,v->j,v->dist); - } - } /* end loop over hits */ -} - -/* Update the top-visible list to perhaps include visible[iNode] */ -void UpdateTopVisible(/*IN*/NJ_t * NJ, int nActive, - int iIn, /*IN*/hit_t *hit, - /*IN/OUT*/top_hits_t *tophits) { - assert(tophits != NULL); - bool bIn = false; /* placed in the list */ - int i; - - /* First, if the list is not full, put it in somewhere */ - for (i = 0; i < tophits->nTopVisible && !bIn; i++) { - int iNode = tophits->topvisible[i]; - if (iNode == iIn) { - /* this node is already in the top hit list */ - bIn = true; - } else if (iNode < 0 || NJ->parent[iNode] >= 0) { - /* found an empty spot */ - bIn = true; - tophits->topvisible[i] = iIn; - } - } - - int iPosWorst = -1; - double dCriterionWorst = -1e20; - if (!bIn) { - /* Search for the worst hit */ - for (i = 0; i < tophits->nTopVisible && !bIn; i++) { - int iNode = tophits->topvisible[i]; - assert(iNode >= 0 && NJ->parent[iNode] < 0 && iNode != 
iIn); - besthit_t visible; - if (!GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&visible)) { - /* found an empty spot */ - tophits->topvisible[i] = iIn; - bIn = true; - } else if (visible.i == hit->j && visible.j == iIn) { - /* the reverse hit is already in the top hit list */ - bIn = true; - } else if (visible.criterion >= dCriterionWorst) { - iPosWorst = i; - dCriterionWorst = visible.criterion; - } - } - } - - if (!bIn && iPosWorst >= 0) { - besthit_t visible = HitToBestHit(iIn, *hit); - SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/&visible); - if (visible.criterion < dCriterionWorst) { - if (verbose > 2) { - int iOld = tophits->topvisible[iPosWorst]; - fprintf(stderr, "TopVisible replace %d=>%d with %d=>%d\n", - iOld, tophits->visible[iOld].j, visible.i, visible.j); - } - tophits->topvisible[iPosWorst] = iIn; - } - } - - if (verbose > 2) { - fprintf(stderr, "Updated TopVisible: "); - for (i = 0; i < tophits->nTopVisible; i++) { - int iNode = tophits->topvisible[i]; - if (iNode >= 0 && NJ->parent[iNode] < 0) { - besthit_t bh = HitToBestHit(iNode, tophits->visible[iNode]); - SetDistCriterion(NJ, nActive, &bh); - fprintf(stderr, " %d=>%d:%.4f", bh.i, bh.j, bh.criterion); - } - } - fprintf(stderr,"\n"); - } -} - -/* Recompute the topvisible list */ -void ResetTopVisible(/*IN/UPDATE*/NJ_t *NJ, - int nActive, - /*IN/OUT*/top_hits_t *tophits) { - besthit_t *visibleSorted = mymalloc(sizeof(besthit_t)*nActive); - int nVisible = 0; /* #entries in visibleSorted */ - int iNode; - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - /* skip joins involving stale nodes */ - if (NJ->parent[iNode] >= 0) - continue; - besthit_t v; - if (GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&v)) { - assert(nVisible < nActive); - visibleSorted[nVisible++] = v; - } - } - assert(nVisible > 0); - - qsort(/*IN/OUT*/visibleSorted,nVisible,sizeof(besthit_t),CompareHitsByCriterion); - - /* Only keep the top m items, and try to avoid duplicating 
i->j with j->i - Note that visible(i) -> j does not necessarily imply visible(j) -> i, - so we store what the pairing was (or -1 for not used yet) - */ - int *inTopVisible = malloc(sizeof(int) * NJ->maxnodes); - int i; - for (i = 0; i < NJ->maxnodes; i++) - inTopVisible[i] = -1; - - if (verbose > 2) - fprintf(stderr, "top-hit search: nActive %d nVisible %d considering up to %d items\n", - nActive, nVisible, tophits->m); - - /* save the sorted indices in topvisible */ - int iSave = 0; - for (i = 0; i < nVisible && iSave < tophits->nTopVisible; i++) { - besthit_t *v = &visibleSorted[i]; - if (inTopVisible[v->i] != v->j) { /* not seen already */ - tophits->topvisible[iSave++] = v->i; - inTopVisible[v->i] = v->j; - inTopVisible[v->j] = v->i; - } - } - while(iSave < tophits->nTopVisible) - tophits->topvisible[iSave++] = -1; - myfree(visibleSorted, sizeof(besthit_t)*nActive); - myfree(inTopVisible, sizeof(int) * NJ->maxnodes); - tophits->topvisibleAge = 0; - if (verbose > 2) { - fprintf(stderr, "Reset TopVisible: "); - for (i = 0; i < tophits->nTopVisible; i++) { - int iNode = tophits->topvisible[i]; - if (iNode < 0) - break; - fprintf(stderr, " %d=>%d", iNode, tophits->visible[iNode].j); - } - fprintf(stderr,"\n"); - } -} - -/* - Find best hit to do in O(N*log(N) + m*L*log(N)) time, by - copying and sorting the visible list - updating out-distances for the top (up to m) candidates - selecting the best hit - if !fastest then - local hill-climbing for a better join, - using best-hit lists only, and updating - all out-distances in every best-hit list -*/ -void TopHitNJSearch(/*IN/UPDATE*/NJ_t *NJ, int nActive, - /*IN/OUT*/top_hits_t *tophits, - /*OUT*/besthit_t *join) { - /* first, do we have at least m/2 candidates in topvisible? 
- And remember the best one */ - int nCandidate = 0; - int iNodeBestCandidate = -1; - double dBestCriterion = 1e20; - - int i; - for (i = 0; i < tophits->nTopVisible; i++) { - int iNode = tophits->topvisible[i]; - besthit_t visible; - if (GetVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits, iNode, /*OUT*/&visible)) { - nCandidate++; - if (iNodeBestCandidate < 0 || visible.criterion < dBestCriterion) { - iNodeBestCandidate = iNode; - dBestCriterion = visible.criterion; - } - } - } - - tophits->topvisibleAge++; - /* Note we may have only nActive/2 joins b/c we try to store them once */ - if (2 * tophits->topvisibleAge > tophits->m - || (3*nCandidate < tophits->nTopVisible && 3*nCandidate < nActive)) { - /* recompute top visible */ - if (verbose > 2) - fprintf(stderr, "Resetting the top-visible list at nActive=%d\n",nActive); - - /* If age is low, then our visible set is becoming too sparse, because we have - recently recomputed the top visible subset. This is very rare but can happen - with -fastest. A quick-and-dirty solution is to walk up - the parents to get additional entries in top hit lists. To ensure that the - visible set becomes full, pick an arbitrary node if walking up terminates at self. 
- */ - if (tophits->topvisibleAge <= 2) { - if (verbose > 2) - fprintf(stderr, "Expanding visible set by walking up to active nodes at nActive=%d\n", nActive); - int iNode; - for (iNode = 0; iNode < NJ->maxnode; iNode++) { - if (NJ->parent[iNode] >= 0) - continue; - hit_t *v = &tophits->visible[iNode]; - int newj = ActiveAncestor(NJ, v->j); - if (newj >= 0 && newj != v->j) { - if (newj == iNode) { - /* pick arbitrarily */ - newj = 0; - while (NJ->parent[newj] >= 0 || newj == iNode) - newj++; - } - assert(newj >= 0 && newj < NJ->maxnodes - && newj != iNode - && NJ->parent[newj] < 0); - - /* Set v to point to newj */ - besthit_t bh = { iNode, newj, -1e20, -1e20, -1e20 }; - SetDistCriterion(NJ, nActive, /*IN/OUT*/&bh); - v->j = newj; - v->dist = bh.dist; - } - } - } - ResetTopVisible(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/tophits); - /* and recurse to try again */ - TopHitNJSearch(NJ, nActive, tophits, join); - return; - } - if (verbose > 2) - fprintf(stderr, "Top-visible list size %d (nActive %d m %d)\n", - nCandidate, nActive, tophits->m); - assert(iNodeBestCandidate >= 0 && NJ->parent[iNodeBestCandidate] < 0); - bool bSuccess = GetVisible(NJ, nActive, tophits, iNodeBestCandidate, /*OUT*/join); - assert(bSuccess); - assert(join->i >= 0 && NJ->parent[join->i] < 0); - assert(join->j >= 0 && NJ->parent[join->j] < 0); - - if(fastest) - return; - - int changed; - do { - changed = 0; - - besthit_t bestI; - GetBestFromTopHits(join->i, NJ, nActive, tophits, /*OUT*/&bestI); - assert(bestI.i == join->i); - if (bestI.j != join->j && bestI.criterion < join->criterion) { - changed = 1; - if (verbose>2) - fprintf(stderr,"BetterI\t%d\t%d\t%d\t%d\t%f\t%f\n", - join->i,join->j,bestI.i,bestI.j, - join->criterion,bestI.criterion); - *join = bestI; - } - - besthit_t bestJ; - GetBestFromTopHits(join->j, NJ, nActive, tophits, /*OUT*/&bestJ); - assert(bestJ.i == join->j); - if (bestJ.j != join->i && bestJ.criterion < join->criterion) { - changed = 1; - if (verbose>2) - 
fprintf(stderr,"BetterJ\t%d\t%d\t%d\t%d\t%f\t%f\n", - join->i,join->j,bestJ.i,bestJ.j, - join->criterion,bestJ.criterion); - *join = bestJ; - } - if(changed) nHillBetter++; - } while(changed); -} - -int NGaps(/*IN*/NJ_t *NJ, int iNode) { - assert(iNode < NJ->nSeq); - int nGaps = 0; - int p; - for(p=0; p<NJ->nPos; p++) { - if (NJ->profiles[iNode]->codes[p] == NOCODE) - nGaps++; - } - return(nGaps); -} - -int CompareHitsByCriterion(const void *c1, const void *c2) { - const besthit_t *hit1 = (besthit_t*)c1; - const besthit_t *hit2 = (besthit_t*)c2; - if (hit1->criterion < hit2->criterion) return(-1); - if (hit1->criterion > hit2->criterion) return(1); - return(0); -} - -int CompareHitsByIJ(const void *c1, const void *c2) { - const besthit_t *hit1 = (besthit_t*)c1; - const besthit_t *hit2 = (besthit_t*)c2; - return hit1->i != hit2->i ? hit1->i - hit2->i : hit1->j - hit2->j; -} - -void SortSaveBestHits(int iNode, /*IN/SORT*/besthit_t *besthits, - int nIn, int nOut, - /*IN/OUT*/top_hits_t *tophits) { - assert(nIn > 0); - assert(nOut > 0); - top_hits_list_t *l = &tophits->top_hits_lists[iNode]; - /* */ - qsort(/*IN/OUT*/besthits,nIn,sizeof(besthit_t),CompareHitsByCriterion); - - /* First count how many we will save - Not sure if removing duplicates is actually necessary. 
- */ - int nSave = 0; - int jLast = -1; - int iBest; - for (iBest = 0; iBest < nIn && nSave < nOut; iBest++) { - if (besthits[iBest].i < 0) - continue; - assert(besthits[iBest].i == iNode); - int j = besthits[iBest].j; - if (j != iNode && j != jLast && j >= 0) { - nSave++; - jLast = j; - } - } - - assert(nSave > 0); - -#ifdef OPENMP - omp_set_lock(&tophits->locks[iNode]); -#endif - if (l->hits != NULL) { - l->hits = myfree(l->hits, l->nHits * sizeof(hit_t)); - l->nHits = 0; - } - l->hits = mymalloc(sizeof(hit_t) * nSave); - l->nHits = nSave; - int iSave = 0; - jLast = -1; - for (iBest = 0; iBest < nIn && iSave < nSave; iBest++) { - int j = besthits[iBest].j; - if (j != iNode && j != jLast && j >= 0) { - l->hits[iSave].j = j; - l->hits[iSave].dist = besthits[iBest].dist; - iSave++; - jLast = j; - } - } -#ifdef OPENMP - omp_unset_lock(&tophits->locks[iNode]); -#endif - assert(iSave == nSave); -} - -void TransferBestHits(/*IN/UPDATE*/NJ_t *NJ, - int nActive, - int iNode, - /*IN*/besthit_t *oldhits, - int nOldHits, - /*OUT*/besthit_t *newhits, - bool updateDistances) { - assert(iNode >= 0); - assert(NJ->parent[iNode] < 0); - - int iBest; - for(iBest = 0; iBest < nOldHits; iBest++) { - besthit_t *old = &oldhits[iBest]; - besthit_t *new = &newhits[iBest]; - new->i = iNode; - new->j = ActiveAncestor(/*IN*/NJ, old->j); - new->dist = old->dist; /* may get reset below */ - new->weight = old->weight; - new->criterion = old->criterion; - - if(new->j < 0 || new->j == iNode) { - new->weight = 0; - new->dist = -1e20; - new->criterion = 1e20; - } else if (new->i != old->i || new->j != old->j) { - if (updateDistances) - SetDistCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/new); - else { - new->dist = -1e20; - new->criterion = 1e20; - } - } else { - if (updateDistances) - SetCriterion(/*IN/UPDATE*/NJ, nActive, /*IN/OUT*/new); - else - new->criterion = 1e20; /* leave dist alone */ - } - } -} - -void HitsToBestHits(/*IN*/hit_t *hits, int nHits, int iNode, /*OUT*/besthit_t *newhits) { 
- int i; - for (i = 0; i < nHits; i++) { - hit_t *hit = &hits[i]; - besthit_t *bh = &newhits[i]; - bh->i = iNode; - bh->j = hit->j; - bh->dist = hit->dist; - bh->criterion = 1e20; - bh->weight = -1; /* not the true value -- we compute these directly when needed */ - } -} - -besthit_t HitToBestHit(int i, hit_t hit) { - besthit_t bh; - bh.i = i; - bh.j = hit.j; - bh.dist = hit.dist; - bh.criterion = 1e20; - bh.weight = -1; - return(bh); -} - -char *OpenMPString(void) { -#ifdef OPENMP - static char buf[100]; - sprintf(buf, ", OpenMP (%d threads)", omp_get_max_threads()); - return(buf); -#else - return(""); -#endif -} - -/* Algorithm 26.2.17 from Abromowitz and Stegun, Handbook of Mathematical Functions - Absolute accuracy of only about 1e-7, which is enough for us -*/ -double pnorm(double x) -{ - double b1 = 0.319381530; - double b2 = -0.356563782; - double b3 = 1.781477937; - double b4 = -1.821255978; - double b5 = 1.330274429; - double p = 0.2316419; - double c = 0.39894228; - - if(x >= 0.0) { - double t = 1.0 / ( 1.0 + p * x ); - return (1.0 - c * exp( -x * x / 2.0 ) * t * - ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); - } - /*else*/ - double t = 1.0 / ( 1.0 - p * x ); - return ( c * exp( -x * x / 2.0 ) * t * - ( t *( t * ( t * ( t * b5 + b4 ) + b3 ) + b2 ) + b1 )); -} - -void *mymalloc(size_t sz) { - if (sz == 0) return(NULL); - void *new = malloc(sz); - if (new == NULL) { - fprintf(stderr, "Out of memory\n"); - exit(1); - } - szAllAlloc += sz; - mymallocUsed += sz; -#ifdef TRACK_MEMORY - struct mallinfo mi = mallinfo(); - if (mi.arena+mi.hblkhd > maxmallocHeap) - maxmallocHeap = mi.arena+mi.hblkhd; -#endif - /* gcc malloc should always return 16-byte-aligned values... 
*/ - assert(IS_ALIGNED(new)); - return (new); -} - -void *mymemdup(void *data, size_t sz) { - if(data==NULL) return(NULL); - void *new = mymalloc(sz); - memcpy(/*to*/new, /*from*/data, sz); - return(new); -} - -void *myrealloc(void *data, size_t szOld, size_t szNew, bool bCopy) { - if (data == NULL && szOld == 0) - return(mymalloc(szNew)); - if (data == NULL || szOld == 0 || szNew == 0) { - fprintf(stderr,"Empty myrealloc\n"); - exit(1); - } - if (szOld == szNew) - return(data); - void *new = NULL; - if (bCopy) { - /* Try to reduce memory fragmentation by allocating anew and copying - Seems to help in practice */ - new = mymemdup(data, szNew); - myfree(data, szOld); - } else { - new = realloc(data,szNew); - if (new == NULL) { - fprintf(stderr, "Out of memory\n"); - exit(1); - } - assert(IS_ALIGNED(new)); - szAllAlloc += (szNew-szOld); - mymallocUsed += (szNew-szOld); -#ifdef TRACK_MEMORY - struct mallinfo mi = mallinfo(); - if (mi.arena+mi.hblkhd > maxmallocHeap) - maxmallocHeap = mi.arena+mi.hblkhd; -#endif - } - return(new); -} - -void *myfree(void *p, size_t sz) { - if(p==NULL) return(NULL); - free(p); - mymallocUsed -= sz; - return(NULL); -} - -/******************************************************************************/ -/* Minimization of a 1-dimensional function by Brent's method (Numerical Recipes) - * Borrowed from Tree-Puzzle 5.1 util.c under GPL - * Modified by M.N.P to pass in the accessory data for the optimization function, - * to use 2x bounds around the starting guess and expand them if necessary, - * and to use both a fractional and an absolute tolerance - */ - -#define ITMAX 100 -#define CGOLD 0.3819660 -#define TINY 1.0e-20 -#define ZEPS 1.0e-10 -#define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d); -#define SIGN(a,b) ((b) >= 0.0 ? 
fabs(a) : -fabs(a)) - -/* Brents method in one dimension */ -double brent(double ax, double bx, double cx, double (*f)(double, void *), void *data, - double ftol, double atol, - double *foptx, double *f2optx, double fax, double fbx, double fcx) -{ - int iter; - double a,b,d=0,etemp,fu,fv,fw,fx,p,q,r,tol1,tol2,u,v,w,x,xm; - double xw,wv,vx; - double e=0.0; - - a=(ax < cx ? ax : cx); - b=(ax > cx ? ax : cx); - x=bx; - fx=fbx; - if (fax < fcx) { - w=ax; - fw=fax; - v=cx; - fv=fcx; - } else { - w=cx; - fw=fcx; - v=ax; - fv=fax; - } - for (iter=1;iter<=ITMAX;iter++) { - xm=0.5*(a+b); - tol1=ftol*fabs(x); - tol2=2.0*(tol1+ZEPS); - if (fabs(x-xm) <= (tol2-0.5*(b-a)) - || fabs(a-b) < atol) { - *foptx = fx; - xw = x-w; - wv = w-v; - vx = v-x; - *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ - (v*v*xw + x*x*wv + w*w*vx); - return x; - } - if (fabs(e) > tol1) { - r=(x-w)*(fx-fv); - q=(x-v)*(fx-fw); - p=(x-v)*q-(x-w)*r; - q=2.0*(q-r); - if (q > 0.0) p = -p; - q=fabs(q); - etemp=e; - e=d; - if (fabs(p) >= fabs(0.5*q*etemp) || p <= q*(a-x) || p >= q*(b-x)) - d=CGOLD*(e=(x >= xm ? a-x : b-x)); - else { - d=p/q; - u=x+d; - if (u-a < tol2 || b-u < tol2) - d=SIGN(tol1,xm-x); - } - } else { - d=CGOLD*(e=(x >= xm ? a-x : b-x)); - } - u=(fabs(d) >= tol1 ? 
x+d : x+SIGN(tol1,d)); - fu=(*f)(u,data); - if (fu <= fx) { - if (u >= x) a=x; else b=x; - SHFT(v,w,x,u) - SHFT(fv,fw,fx,fu) - } else { - if (u < x) a=u; else b=u; - if (fu <= fw || w == x) { - v=w; - w=u; - fv=fw; - fw=fu; - } else if (fu <= fv || v == x || v == w) { - v=u; - fv=fu; - } - } - } - *foptx = fx; - xw = x-w; - wv = w-v; - vx = v-x; - *f2optx = 2.0*(fv*xw + fx*wv + fw*vx)/ - (v*v*xw + x*x*wv + w*w*vx); - return x; -} /* brent */ -#undef ITMAX -#undef CGOLD -#undef ZEPS -#undef SHFT -#undef SIGN - -/* one-dimensional minimization - as input a lower and an upper limit and a trial - value for the minimum is needed: xmin < xguess < xmax - the function and a fractional tolerance has to be specified - onedimenmin returns the optimal x value and the value of the function - and its second derivative at this point - */ -double onedimenmin(double xmin, double xguess, double xmax, double (*f)(double,void*), void *data, - double ftol, double atol, - /*OUT*/double *fx, /*OUT*/double *f2x) -{ - double optx, ax, bx, cx, fa, fb, fc; - - /* first attempt to bracketize minimum */ - if (xguess == xmin) { - ax = xmin; - bx = 2.0*xguess; - cx = 10.0*xguess; - } else if (xguess <= 2.0 * xmin) { - ax = xmin; - bx = xguess; - cx = 5.0*xguess; - } else { - ax = 0.5*xguess; - bx = xguess; - cx = 2.0*xguess; - } - if (cx > xmax) - cx = xmax; - if (bx >= cx) - bx = 0.5*(ax+cx); - if (verbose > 4) - fprintf(stderr, "onedimenmin lo %.4f guess %.4f hi %.4f range %.4f %.4f\n", - ax, bx, cx, xmin, xmax); - /* ideally this range includes the true minimum, i.e., - fb < fa and fb < fc - if not, we gradually expand the boundaries until it does, - or we near the boundary of the allowed range and use that - */ - fa = (*f)(ax,data); - fb = (*f)(bx,data); - fc = (*f)(cx,data); - while(fa < fb && ax > xmin) { - ax = (ax+xmin)/2.0; - if (ax < 2.0*xmin) /* give up on shrinking the region */ - ax = xmin; - fa = (*f)(ax,data); - } - while(fc < fb && cx < xmax) { - cx = (cx+xmax)/2.0; - if (cx > 
xmax * 0.95) - cx = xmax; - fc = (*f)(cx,data); - } - optx = brent(ax, bx, cx, f, data, ftol, atol, fx, f2x, fa, fb, fc); - - if (verbose > 4) - fprintf(stderr, "onedimenmin reaches optimum f(%.4f) = %.4f f2x %.4f\n", optx, *fx, *f2x); - return optx; /* return optimal x */ -} /* onedimenmin */ - -/* Numerical code for the gamma distribution is modified from the PhyML 3 code - (GNU public license) of Stephane Guindon -*/ - -double LnGamma (double alpha) -{ -/* returns ln(gamma(alpha)) for alpha>0, accurate to 10 decimal places. - Stirling's formula is used for the central polynomial part of the procedure. - Pike MC & Hill ID (1966) Algorithm 291: Logarithm of the gamma function. - Communications of the Association for Computing Machinery, 9:684 -*/ - double x=alpha, f=0, z; - if (x<7) { - f=1; z=x-1; - while (++z<7) f*=z; - x=z; f=-(double)log(f); - } - z = 1/(x*x); - return f + (x-0.5)*(double)log(x) - x + .918938533204673 - + (((-.000595238095238*z+.000793650793651)*z-.002777777777778)*z - +.083333333333333)/x; -} - -double IncompleteGamma(double x, double alpha, double ln_gamma_alpha) -{ -/* returns the incomplete gamma ratio I(x,alpha) where x is the upper - limit of the integration and alpha is the shape parameter. - returns (-1) if in error - ln_gamma_alpha = ln(Gamma(alpha)), is almost redundant. - (1) series expansion if (alpha>x || x<=1) - (2) continued fraction otherwise - RATNEST FORTRAN by - Bhattacharjee GP (1970) The incomplete gamma integral. 
Applied Statistics, - 19: 285-287 (AS32) -*/ - int i; - double p=alpha, g=ln_gamma_alpha; - double accurate=1e-8, overflow=1e30; - double factor, gin=0, rn=0, a=0,b=0,an=0,dif=0, term=0, pn[6]; - - if (x==0) return (0); - if (x<0 || p<=0) return (-1); - - factor=(double)exp(p*(double)log(x)-x-g); - if (x>1 && x>=p) goto l30; - /* (1) series expansion */ - gin=1; term=1; rn=p; - l20: - rn++; - term*=x/rn; gin+=term; - - if (term > accurate) goto l20; - gin*=factor/p; - goto l50; - l30: - /* (2) continued fraction */ - a=1-p; b=a+x+1; term=0; - pn[0]=1; pn[1]=x; pn[2]=x+1; pn[3]=x*b; - gin=pn[2]/pn[3]; - l32: - a++; b+=2; term++; an=a*term; - for (i=0; i<2; i++) pn[i+4]=b*pn[i+2]-an*pn[i]; - if (pn[5] == 0) goto l35; - rn=pn[4]/pn[5]; dif=fabs(gin-rn); - if (dif>accurate) goto l34; - if (dif<=accurate*rn) goto l42; - l34: - gin=rn; - l35: - for (i=0; i<4; i++) pn[i]=pn[i+2]; - if (fabs(pn[4]) < overflow) goto l32; - for (i=0; i<4; i++) pn[i]/=overflow; - goto l32; - l42: - gin=1-factor*gin; - - l50: - return (gin); -} - -double PGamma(double x, double alpha) -{ - /* scale = 1/alpha */ - return IncompleteGamma(x*alpha,alpha,LnGamma(alpha)); -} - -/* helper function to subtract timval structures */ -/* Subtract the `struct timeval' values X and Y, - storing the result in RESULT. - Return 1 if the difference is negative, otherwise 0. */ -int timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y) -{ - /* Perform the carry for the later subtraction by updating y. */ - if (x->tv_usec < y->tv_usec) { - int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; - y->tv_usec -= 1000000 * nsec; - y->tv_sec += nsec; - } - if (x->tv_usec - y->tv_usec > 1000000) { - int nsec = (x->tv_usec - y->tv_usec) / 1000000; - y->tv_usec += 1000000 * nsec; - y->tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - tv_usec is certainly positive. 
*/ - result->tv_sec = x->tv_sec - y->tv_sec; - result->tv_usec = x->tv_usec - y->tv_usec; - - /* Return 1 if result is negative. */ - return x->tv_sec < y->tv_sec; -} - -double clockDiff(/*IN*/struct timeval *clock_start) { - struct timeval time_now, elapsed; - gettimeofday(/*OUT*/&time_now,NULL); - timeval_subtract(/*OUT*/&elapsed,/*IN*/&time_now,/*IN*/clock_start); - return(elapsed.tv_sec + elapsed.tv_usec*1e-6); -} - - -/* The random number generator is taken from D E Knuth - http://www-cs-faculty.stanford.edu/~knuth/taocp.html -*/ - -/* This program by D E Knuth is in the public domain and freely copyable. - * It is explained in Seminumerical Algorithms, 3rd edition, Section 3.6 - * (or in the errata to the 2nd edition --- see - * http://www-cs-faculty.stanford.edu/~knuth/taocp.html - * in the changes to Volume 2 on pages 171 and following). */ - -/* N.B. The MODIFICATIONS introduced in the 9th printing (2002) are - included here; there's no backwards compatibility with the original. */ - -/* This version also adopts Brendan McKay's suggestion to - accommodate naive users who forget to call ran_start(seed). */ - -/* If you find any bugs, please report them immediately to - * taocp@cs.stanford.edu - * (and you will be rewarded if the bug is genuine). Thanks! */ - -/************ see the book for explanations and caveats! 
*******************/ -/************ in particular, you need two's complement arithmetic **********/ - -#define KK 100 /* the long lag */ -#define LL 37 /* the short lag */ -#define MM (1L<<30) /* the modulus */ -#define mod_diff(x,y) (((x)-(y))&(MM-1)) /* subtraction mod MM */ - -long ran_x[KK]; /* the generator state */ - -#ifdef __STDC__ -void ran_array(long aa[],int n) -#else - void ran_array(aa,n) /* put n new random numbers in aa */ - long *aa; /* destination */ - int n; /* array length (must be at least KK) */ -#endif -{ - register int i,j; - for (j=0;j<KK;j++) aa[j]=ran_x[j]; - for (;j<n;j++) aa[j]=mod_diff(aa[j-KK],aa[j-LL]); - for (i=0;i<LL;i++,j++) ran_x[i]=mod_diff(aa[j-KK],aa[j-LL]); - for (;i<KK;i++,j++) ran_x[i]=mod_diff(aa[j-KK],ran_x[i-LL]); -} - -/* the following routines are from exercise 3.6--15 */ -/* after calling ran_start, get new randoms by, e.g., "x=ran_arr_next()" */ - -#define QUALITY 1009 /* recommended quality level for high-res use */ -long ran_arr_buf[QUALITY]; -long ran_arr_dummy=-1, ran_arr_started=-1; -long *ran_arr_ptr=&ran_arr_dummy; /* the next random number, or -1 */ - -#define TT 70 /* guaranteed separation between streams */ -#define is_odd(x) ((x)&1) /* units bit of x */ - -#ifdef __STDC__ -void ran_start(long seed) -#else - void ran_start(seed) /* do this before using ran_array */ - long seed; /* selector for different streams */ -#endif -{ - register int t,j; - long x[KK+KK-1]; /* the preparation buffer */ - register long ss=(seed+2)&(MM-2); - for (j=0;j<KK;j++) { - x[j]=ss; /* bootstrap the buffer */ - ss<<=1; if (ss>=MM) ss-=MM-2; /* cyclic shift 29 bits */ - } - x[1]++; /* make x[1] (and only x[1]) odd */ - for (ss=seed&(MM-1),t=TT-1; t; ) { - for (j=KK-1;j>0;j--) x[j+j]=x[j], x[j+j-1]=0; /* "square" */ - for (j=KK+KK-2;j>=KK;j--) - x[j-(KK-LL)]=mod_diff(x[j-(KK-LL)],x[j]), - x[j-KK]=mod_diff(x[j-KK],x[j]); - if (is_odd(ss)) { /* "multiply by z" */ - for (j=KK;j>0;j--) x[j]=x[j-1]; - x[0]=x[KK]; /* shift the buffer 
cyclically */ - x[LL]=mod_diff(x[LL],x[KK]); - } - if (ss) ss>>=1; else t--; - } - for (j=0;j<LL;j++) ran_x[j+KK-LL]=x[j]; - for (;j<KK;j++) ran_x[j-LL]=x[j]; - for (j=0;j<10;j++) ran_array(x,KK+KK-1); /* warm things up */ - ran_arr_ptr=&ran_arr_started; -} - -#define ran_arr_next() (*ran_arr_ptr>=0? *ran_arr_ptr++: ran_arr_cycle()) -long ran_arr_cycle() -{ - if (ran_arr_ptr==&ran_arr_dummy) - ran_start(314159L); /* the user forgot to initialize */ - ran_array(ran_arr_buf,QUALITY); - ran_arr_buf[KK]=-1; - ran_arr_ptr=ran_arr_buf+1; - return ran_arr_buf[0]; -} - -/* end of code from Knuth */ - -double knuth_rand() { - return(9.31322574615479e-10 * ran_arr_next()); /* multiply by 2**-30 */ -} - -hashstrings_t *MakeHashtable(char **strings, int nStrings) { - hashstrings_t *hash = (hashstrings_t*)mymalloc(sizeof(hashstrings_t)); - hash->nBuckets = 8*nStrings; - hash->buckets = (hashbucket_t*)mymalloc(sizeof(hashbucket_t) * hash->nBuckets); - int i; - for (i=0; i < hash->nBuckets; i++) { - hash->buckets[i].string = NULL; - hash->buckets[i].nCount = 0; - hash->buckets[i].first = -1; - } - for (i=0; i < nStrings; i++) { - hashiterator_t hi = FindMatch(hash, strings[i]); - if (hash->buckets[hi].string == NULL) { - /* save a unique entry */ - assert(hash->buckets[hi].nCount == 0); - hash->buckets[hi].string = strings[i]; - hash->buckets[hi].nCount = 1; - hash->buckets[hi].first = i; - } else { - /* record a duplicate entry */ - assert(hash->buckets[hi].string != NULL); - assert(strcmp(hash->buckets[hi].string, strings[i]) == 0); - assert(hash->buckets[hi].first >= 0); - hash->buckets[hi].nCount++; - } - } - return(hash); -} - -hashstrings_t *FreeHashtable(hashstrings_t* hash) { - if (hash != NULL) { - myfree(hash->buckets, sizeof(hashbucket_t) * hash->nBuckets); - myfree(hash, sizeof(hashstrings_t)); - } - return(NULL); -} - -#define MAXADLER 65521 -hashiterator_t FindMatch(hashstrings_t *hash, char *string) { - /* Adler-32 checksum */ - unsigned int hashA = 1; - unsigned 
int hashB = 0; - char *p; - for (p = string; *p != '\0'; p++) { - hashA = ((unsigned int)*p + hashA); - hashB = hashA+hashB; - } - hashA %= MAXADLER; - hashB %= MAXADLER; - hashiterator_t hi = (hashB*65536+hashA) % hash->nBuckets; - while(hash->buckets[hi].string != NULL - && strcmp(hash->buckets[hi].string, string) != 0) { - hi++; - if (hi >= hash->nBuckets) - hi = 0; - } - return(hi); -} - -char *GetHashString(hashstrings_t *hash, hashiterator_t hi) { - return(hash->buckets[hi].string); -} - -int HashCount(hashstrings_t *hash, hashiterator_t hi) { - return(hash->buckets[hi].nCount); -} - -int HashFirst(hashstrings_t *hash, hashiterator_t hi) { - return(hash->buckets[hi].first); -} - -uniquify_t *UniquifyAln(alignment_t *aln) { - int nUniqueSeq = 0; - char **uniqueSeq = (char**)mymalloc(aln->nSeq * sizeof(char*)); /* iUnique -> seq */ - int *uniqueFirst = (int*)mymalloc(aln->nSeq * sizeof(int)); /* iUnique -> iFirst in aln */ - int *alnNext = (int*)mymalloc(aln->nSeq * sizeof(int)); /* i in aln -> next, or -1 */ - int *alnToUniq = (int*)mymalloc(aln->nSeq * sizeof(int)); /* i in aln -> iUnique; many -> -1 */ - - int i; - for (i = 0; i < aln->nSeq; i++) { - uniqueSeq[i] = NULL; - uniqueFirst[i] = -1; - alnNext[i] = -1; - alnToUniq[i] = -1; - } - hashstrings_t *hashseqs = MakeHashtable(aln->seqs, aln->nSeq); - for (i=0; i<aln->nSeq; i++) { - hashiterator_t hi = FindMatch(hashseqs,aln->seqs[i]); - int first = HashFirst(hashseqs,hi); - if (first == i) { - uniqueSeq[nUniqueSeq] = aln->seqs[i]; - uniqueFirst[nUniqueSeq] = i; - alnToUniq[i] = nUniqueSeq; - nUniqueSeq++; - } else { - int last = first; - while (alnNext[last] != -1) - last = alnNext[last]; - assert(last>=0); - alnNext[last] = i; - assert(alnToUniq[last] >= 0 && alnToUniq[last] < nUniqueSeq); - alnToUniq[i] = alnToUniq[last]; - } - } - assert(nUniqueSeq>0); - hashseqs = FreeHashtable(hashseqs); - - uniquify_t *uniquify = (uniquify_t*)mymalloc(sizeof(uniquify_t)); - uniquify->nSeq = aln->nSeq; - 
uniquify->nUnique = nUniqueSeq; - uniquify->uniqueFirst = uniqueFirst; - uniquify->alnNext = alnNext; - uniquify->alnToUniq = alnToUniq; - uniquify->uniqueSeq = uniqueSeq; - return(uniquify); -} - -uniquify_t *FreeUniquify(uniquify_t *unique) { - if (unique != NULL) { - myfree(unique->uniqueFirst, sizeof(int)*unique->nSeq); - myfree(unique->alnNext, sizeof(int)*unique->nSeq); - myfree(unique->alnToUniq, sizeof(int)*unique->nSeq); - myfree(unique->uniqueSeq, sizeof(char*)*unique->nSeq); - myfree(unique,sizeof(uniquify_t)); - unique = NULL; - } - return(unique); -} - -traversal_t InitTraversal(NJ_t *NJ) { - traversal_t worked = (bool*)mymalloc(sizeof(bool)*NJ->maxnodes); - int i; - for (i=0; i<NJ->maxnodes; i++) - worked[i] = false; - return(worked); -} - -void SkipTraversalInto(int node, /*IN/OUT*/traversal_t traversal) { - traversal[node] = true; -} - -int TraversePostorder(int node, NJ_t *NJ, /*IN/OUT*/traversal_t traversal, - /*OPTIONAL OUT*/bool *pUp) { - if (pUp) - *pUp = false; - while(1) { - assert(node >= 0); - - /* move to a child if possible */ - bool found = false; - int iChild; - for (iChild=0; iChild < NJ->child[node].nChild; iChild++) { - int child = NJ->child[node].child[iChild]; - if (!traversal[child]) { - node = child; - found = true; - break; - } - } - if (found) - continue; /* keep moving down */ - if (!traversal[node]) { - traversal[node] = true; - return(node); - } - /* If we've already done this node, need to move up */ - if (node == NJ->root) - return(-1); /* nowhere to go -- done traversing */ - node = NJ->parent[node]; - /* If we go up to someplace that was already marked as visited, this is due - to a change in topology, so return it marked as "up" */ - if (pUp && traversal[node]) { - *pUp = true; - return(node); - } - } -} - -traversal_t FreeTraversal(traversal_t traversal, NJ_t *NJ) { - myfree(traversal, sizeof(bool)*NJ->maxnodes); - return(NULL); -} - -profile_t **UpProfiles(NJ_t *NJ) { - profile_t **upProfiles = 
(profile_t**)mymalloc(sizeof(profile_t*)*NJ->maxnodes); - int i; - for (i=0; i<NJ->maxnodes; i++) upProfiles[i] = NULL; - return(upProfiles); -} - -profile_t *GetUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int outnode, bool useML) { - assert(outnode != NJ->root && outnode >= NJ->nSeq); /* not for root or leaves */ - if (upProfiles[outnode] != NULL) - return(upProfiles[outnode]); - - int depth; - int *pathToRoot = PathToRoot(NJ, outnode, /*OUT*/&depth); - int i; - /* depth-1 is root */ - for (i = depth-2; i>=0; i--) { - int node = pathToRoot[i]; - - if (upProfiles[node] == NULL) { - /* Note -- SetupABCD may call GetUpProfile, but it should do it farther - up in the path to the root - */ - profile_t *profiles[4]; - int nodeABCD[4]; - SetupABCD(NJ, node, /*OUT*/profiles, /*IN/OUT*/upProfiles, /*OUT*/nodeABCD, useML); - if (useML) { - /* If node is a child of root, then the 4th profile is of the 2nd root-sibling of node - Otherwise, the 4th profile is the up-profile of the parent of node, and that - is the branch-length we need - */ - double lenC = NJ->branchlength[nodeABCD[2]]; - double lenD = NJ->branchlength[nodeABCD[3]]; - if (verbose > 3) { - fprintf(stderr, "Computing UpProfile for node %d with lenC %.4f lenD %.4f pair-loglk %.3f\n", - node, lenC, lenD, - PairLogLk(profiles[2],profiles[3],lenC+lenD,NJ->nPos,NJ->transmat,&NJ->rates, /*site_lk*/NULL)); - PrintNJInternal(stderr, NJ, /*useLen*/true); - } - upProfiles[node] = PosteriorProfile(/*C*/profiles[2], /*D*/profiles[3], - lenC, lenD, - NJ->transmat, &NJ->rates, NJ->nPos, NJ->nConstraints); - } else { - profile_t *profilesCDAB[4] = { profiles[2], profiles[3], profiles[0], profiles[1] }; - double weight = QuartetWeight(profilesCDAB, NJ->distance_matrix, NJ->nPos); - if (verbose>3) - fprintf(stderr, "Compute upprofile of %d from %d and parents (vs. 
children %d %d) with weight %.3f\n", - node, nodeABCD[2], nodeABCD[0], nodeABCD[1], weight); - upProfiles[node] = AverageProfile(profiles[2], profiles[3], - NJ->nPos, NJ->nConstraints, - NJ->distance_matrix, - weight); - } - } - } - FreePath(pathToRoot,NJ); - assert(upProfiles[outnode] != NULL); - return(upProfiles[outnode]); -} - -profile_t *DeleteUpProfile(/*IN/OUT*/profile_t **upProfiles, NJ_t *NJ, int node) { - assert(node>=0 && node < NJ->maxnodes); - if (upProfiles[node] != NULL) - upProfiles[node] = FreeProfile(upProfiles[node], NJ->nPos, NJ->nConstraints); /* returns NULL */ - return(NULL); -} - -profile_t **FreeUpProfiles(profile_t **upProfiles, NJ_t *NJ) { - int i; - int nUsed = 0; - for (i=0; i < NJ->maxnodes; i++) { - if (upProfiles[i] != NULL) - nUsed++; - DeleteUpProfile(upProfiles, NJ, i); - } - myfree(upProfiles, sizeof(profile_t*)*NJ->maxnodes); - if (verbose >= 3) - fprintf(stderr,"FreeUpProfiles -- freed %d\n", nUsed); - return(NULL); -} - -int *PathToRoot(NJ_t *NJ, int node, /*OUT*/int *outDepth) { - int *pathToRoot = (int*)mymalloc(sizeof(int)*NJ->maxnodes); - int depth = 0; - int ancestor = node; - while(ancestor >= 0) { - pathToRoot[depth] = ancestor; - ancestor = NJ->parent[ancestor]; - depth++; - } - *outDepth = depth; - return(pathToRoot); -} - -int *FreePath(int *path, NJ_t *NJ) { - myfree(path, sizeof(int)*NJ->maxnodes); - return(NULL); -} - -transition_matrix_t *CreateGTR(double *r/*ac ag at cg ct gt*/, double *f/*acgt*/) { - double matrix[4][MAXCODES]; - assert(nCodes==4); - int i, j; - /* Place rates onto a symmetric matrix, but correct by f(target), so that - stationary distribution f[] is maintained - Leave diagonals as 0 (CreateTransitionMatrix will fix them) - */ - int imat = 0; - for (i = 0; i < nCodes; i++) { - matrix[i][i] = 0; - for (j = i+1; j < nCodes; j++) { - double rate = r[imat++]; - assert(rate > 0); - /* Want t(matrix) * f to be 0 */ - matrix[i][j] = rate * f[i]; - matrix[j][i] = rate * f[j]; - } - } - /* Compute 
average mutation rate */ - double total_rate = 0; - for (i = 0; i < nCodes; i++) - for (j = 0; j < nCodes; j++) - total_rate += f[i] * matrix[i][j]; - assert(total_rate > 1e-6); - double inv = 1.0/total_rate; - for (i = 0; i < nCodes; i++) - for (j = 0; j < nCodes; j++) - matrix[i][j] *= inv; - return(CreateTransitionMatrix(matrix,f)); -} - -transition_matrix_t *CreateTransitionMatrix(/*IN*/double matrix[MAXCODES][MAXCODES], - /*IN*/double stat[MAXCODES]) { - int i,j,k; - transition_matrix_t *transmat = mymalloc(sizeof(transition_matrix_t)); - double sqrtstat[20]; - for (i = 0; i < nCodes; i++) { - transmat->stat[i] = stat[i]; - transmat->statinv[i] = 1.0/stat[i]; - sqrtstat[i] = sqrt(stat[i]); - } - - double sym[20*20]; /* symmetrized matrix M' */ - /* set diagonals so columns sums are 0 before symmetrization */ - for (i = 0; i < nCodes; i++) - for (j = 0; j < nCodes; j++) - sym[nCodes*i+j] = matrix[i][j]; - for (j = 0; j < nCodes; j++) { - double sum = 0; - sym[nCodes*j+j] = 0; - for (i = 0; i < nCodes; i++) - sum += sym[nCodes*i+j]; - sym[nCodes*j+j] = -sum; - } - /* M' = S**-1 M S */ - for (i = 0; i < nCodes; i++) - for (j = 0; j < nCodes; j++) - sym[nCodes*i+j] *= sqrtstat[j]/sqrtstat[i]; - - /* eigen decomposition of M' -- note that eigenW is the transpose of what we want, - which is eigenvectors in columns */ - double eigenW[20*20], eval[20], e[20]; - for (i = 0; i < nCodes*nCodes; i++) - eigenW[i] = sym[i]; - tred2(eigenW, nCodes, nCodes, eval, e); - tqli(eval, e, nCodes , nCodes, eigenW); - - /* save eigenvalues */ - for (i = 0; i < nCodes; i++) - transmat->eigenval[i] = eval[i]; - - /* compute eigen decomposition of M into t(codeFreq): V = S*W */ - /* compute inverse of V in eigeninv: V**-1 = t(W) S**-1 */ - for (i = 0; i < nCodes; i++) { - for (j = 0; j < nCodes; j++) { - transmat->eigeninv[i][j] = eigenW[nCodes*i+j] / sqrtstat[j]; - transmat->eigeninvT[j][i] = transmat->eigeninv[i][j]; - } - } - for (i = 0; i < nCodes; i++) - for (j = 0; j < nCodes; 
j++) - transmat->codeFreq[i][j] = eigenW[j*nCodes+i] * sqrtstat[i]; - /* codeFreq[NOCODE] is the rotation of (1,1,...) not (1/nCodes,1/nCodes,...), which - gives correct posterior probabilities - */ - for (j = 0; j < nCodes; j++) { - transmat->codeFreq[NOCODE][j] = 0.0; - for (i = 0; i < nCodes; i++) - transmat->codeFreq[NOCODE][j] += transmat->codeFreq[i][j]; - } - /* save some posterior probabilities for approximating later: - first, we compute P(B | A, t) for t = approxMLnearT, by using - V * exp(L*t) * V**-1 */ - double expvalues[MAXCODES]; - for (i = 0; i < nCodes; i++) - expvalues[i] = exp(approxMLnearT * transmat->eigenval[i]); - double LVinv[MAXCODES][MAXCODES]; /* exp(L*t) * V**-1 */ - for (i = 0; i < nCodes; i++) { - for (j = 0; j < nCodes; j++) - LVinv[i][j] = transmat->eigeninv[i][j] * expvalues[i]; - } - /* matrix transform for converting A -> B given t: transt[i][j] = P(j->i | t) */ - double transt[MAXCODES][MAXCODES]; - for (i = 0; i < nCodes; i++) { - for (j = 0; j < nCodes; j++) { - transt[i][j] = 0; - for (k = 0; k < nCodes; k++) - transt[i][j] += transmat->codeFreq[i][k] * LVinv[k][j]; - } - } - /* nearP[i][j] = P(parent = j | both children are i) = P(j | i,i) ~ stat(j) * P(j->i | t)**2 */ - for (i = 0; i < nCodes; i++) { - double nearP[MAXCODES]; - double tot = 0; - for (j = 0; j < nCodes; j++) { - assert(transt[j][i] > 0); - assert(transmat->stat[j] > 0); - nearP[j] = transmat->stat[j] * transt[i][j] * transt[i][j]; - tot += nearP[j]; - } - assert(tot > 0); - for (j = 0; j < nCodes; j++) - nearP[j] *= 1.0/tot; - /* save nearP in transmat->nearP[i][] */ - for (j = 0; j < nCodes; j++) - transmat->nearP[i][j] = nearP[j]; - /* multiply by 1/stat and rotate nearP */ - for (j = 0; j < nCodes; j++) - nearP[j] /= transmat->stat[j]; - for (j = 0; j < nCodes; j++) { - double rot = 0; - for (k = 0; k < nCodes; k++) - rot += nearP[k] * transmat->codeFreq[i][j]; - transmat->nearFreq[i][j] = rot; - } - } - return(transmat); - assert(0); -} - 
-distance_matrix_t *TransMatToDistanceMat(transition_matrix_t *transmat) { - if (transmat == NULL) - return(NULL); - distance_matrix_t *dmat = mymalloc(sizeof(distance_matrix_t)); - int i, j; - for (i=0; i<nCodes; i++) { - for (j=0; j<nCodes; j++) { - dmat->distances[i][j] = 0; /* never actually used */ - dmat->eigeninv[i][j] = transmat->eigeninv[i][j]; - dmat->codeFreq[i][j] = transmat->codeFreq[i][j]; - } - } - /* eigentot . rotated-vector is the total frequency of the unrotated vector - (used to normalize in NormalizeFreq() - For transition matrices, we rotate by transpose of eigenvectors, so - we need to multiply by the inverse matrix by 1....1 to get this vector, - or in other words, sum the columns - */ - for(i = 0; i<nCodes; i++) { - dmat->eigentot[i] = 0.0; - for (j = 0; j<nCodes; j++) - dmat->eigentot[i] += transmat->eigeninv[i][j]; - } - return(dmat); -} - -/* Numerical recipes code for eigen decomposition (actually taken from RAxML rev_functions.c) */ -void tred2 (double *a, const int n, const int np, double *d, double *e) -{ -#define a(i,j) a[(j-1)*np + (i-1)] -#define e(i) e[i-1] -#define d(i) d[i-1] - int i, j, k, l; - double f, g, h, hh, scale; - for (i = n; i > 1; i--) { - l = i-1; - h = 0; - scale = 0; - if ( l > 1 ) { - for ( k = 1; k <= l; k++ ) - scale += fabs(a(i,k)); - if (scale == 0) - e(i) = a(i,l); - else { - for (k = 1; k <= l; k++) { - a(i,k) /= scale; - h += a(i,k) * a(i,k); - } - f = a(i,l); - g = -sqrt(h); - if (f < 0) g = -g; - e(i) = scale *g; - h -= f*g; - a(i,l) = f-g; - f = 0; - for (j = 1; j <=l ; j++) { - a(j,i) = a(i,j) / h; - g = 0; - for (k = 1; k <= j; k++) - g += a(j,k)*a(i,k); - for (k = j+1; k <= l; k++) - g += a(k,j)*a(i,k); - e(j) = g/h; - f += e(j)*a(i,j); - } - hh = f/(h+h); - for (j = 1; j <= l; j++) { - f = a(i,j); - g = e(j) - hh * f; - e(j) = g; - for (k = 1; k <= j; k++) - a(j,k) -= f*e(k) + g*a(i,k); - } - } - } else - e(i) = a(i,l); - d(i) = h; - } - d(1) = 0; - e(1) = 0; - for (i = 1; i <= n; i++) { - l = i-1; 
- if (d(i) != 0) { - for (j = 1; j <=l; j++) { - g = 0; - for (k = 1; k <= l; k++) - g += a(i,k)*a(k,j); - for (k=1; k <=l; k++) - a(k,j) -= g * a(k,i); - } - } - d(i) = a(i,i); - a(i,i) = 1; - for (j=1; j<=l; j++) - a(i,j) = a(j,i) = 0; - } - - return; -#undef a -#undef e -#undef d -} - -double pythag(double a, double b) { - double absa = fabs(a), absb = fabs(b); - return (absa > absb) ? - absa * sqrt(1+ (absb/absa)*(absb/absa)) : - absb == 0 ? - 0 : - absb * sqrt(1+ (absa/absb)*(absa/absb)); -} - -void tqli(double *d, double *e, int n, int np, double *z) -{ -#define z(i,j) z[(j-1)*np + (i-1)] -#define e(i) e[i-1] -#define d(i) d[i-1] - - int i = 0, iter = 0, k = 0, l = 0, m = 0; - double b = 0, c = 0, dd = 0, f = 0, g = 0, p = 0, r = 0, s = 0; - - for(i=2; i<=n; i++) - e(i-1) = e(i); - e(n) = 0; - - for (l = 1; l <= n; l++) - { - iter = 0; - labelExtra: - - for (m = l; (m < n); m++) - { - dd = fabs(d(m))+fabs(d(m+1)); - - if (fabs(e(m))+dd == dd) - break; - } - - if (m != l) - { - assert(iter < 30); - - iter++; - g = (d(l+1)-d(l))/(2*e(l)); - r = pythag(g,1.); - g = d(m)-d(l)+e(l)/(g+(g<0?-r:r)); - s = 1; - c = 1; - p = 0; - - for (i = m-1; i>=l; i--) - { - f = s*e(i); - b = c*e(i); - r = pythag(f,g); - - e(i+1) = r; - if (r == 0) - { - d (i+1) -= p; - e (m) = 0; - - goto labelExtra; - } - s = f/r; - c = g/r; - g = d(i+1)-p; - r = (d(i)-g)*s + 2*c*b; - p = s*r; - d(i+1) = g + p; - g = c*r - b; - for (k=1; k <= n; k++) - { - f = z(k,i+1); - z(k,i+1) = s * z(k,i) + c*f; - z(k,i) = c * z(k,i) - s*f; - } - } - d(l) -= p; - e(l) = g; - e(m) = 0; - - goto labelExtra; - } - } - - return; -#undef z -#undef e -#undef d - -} - -#ifdef USE_SSE3 -inline float mm_sum(register __m128 sum) { -#if 1 - /* stupider but faster */ - float f[4] ALIGNED; - _mm_store_ps(f,sum); - return(f[0]+f[1]+f[2]+f[3]); -#else - /* first we get sum[0]+sum[1], sum[2]+sum[3] by selecting 0/1 and 2/3 */ - sum = _mm_add_ps(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(0,1,2,3))); - /* then get 
sum[0]+sum[1]+sum[2]+sum[3] by selecting 0/1 and 0/1 */ - sum = _mm_add_ps(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(0,1,0,1))); - float f; - _mm_store_ss(&f, sum); /* save the lowest word */ - return(f); -#endif -} -#endif - -void vector_multiply(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n, /*OUT*/numeric_t *fOut) { -#ifdef USE_SSE3 - int i; - for (i = 0; i < n; i += 4) { - __m128 a, b, c; - a = _mm_load_ps(f1+i); - b = _mm_load_ps(f2+i); - c = _mm_mul_ps(a, b); - _mm_store_ps(fOut+i,c); - } -#else - int i; - for (i = 0; i < n; i++) - fOut[i] = f1[i]*f2[i]; -#endif -} - -numeric_t vector_multiply_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, int n) { -#ifdef USE_SSE3 - if (n == 4) - return(f1[0]*f2[0]+f1[1]*f2[1]+f1[2]*f2[2]+f1[3]*f2[3]); - __m128 sum = _mm_setzero_ps(); - int i; - for (i = 0; i < n; i += 4) { - __m128 a, b, c; - a = _mm_load_ps(f1+i); - b = _mm_load_ps(f2+i); - c = _mm_mul_ps(a, b); - sum = _mm_add_ps(c, sum); - } - return(mm_sum(sum)); -#else - int i; - numeric_t out = 0.0; - for (i=0; i < n; i++) - out += f1[i]*f2[i]; - return(out); -#endif -} - -/* sum(f1*f2*f3) */ -numeric_t vector_multiply3_sum(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t* f3, int n) { -#ifdef USE_SSE3 - __m128 sum = _mm_setzero_ps(); - int i; - for (i = 0; i < n; i += 4) { - __m128 a1, a2, a3; - a1 = _mm_load_ps(f1+i); - a2 = _mm_load_ps(f2+i); - a3 = _mm_load_ps(f3+i); - sum = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(a1,a2),a3),sum); - } - return(mm_sum(sum)); -#else - int i; - numeric_t sum = 0.0; - for (i = 0; i < n; i++) - sum += f1[i]*f2[i]*f3[i]; - return(sum); -#endif -} - -numeric_t vector_dot_product_rot(/*IN*/numeric_t *f1, /*IN*/numeric_t *f2, /*IN*/numeric_t *fBy, int n) { -#ifdef USE_SSE3 - __m128 sum1 = _mm_setzero_ps(); - __m128 sum2 = _mm_setzero_ps(); - int i; - for (i = 0; i < n; i += 4) { - __m128 a1, a2, aBy; - a1 = _mm_load_ps(f1+i); - a2 = _mm_load_ps(f2+i); - aBy = _mm_load_ps(fBy+i); - sum1 = _mm_add_ps(_mm_mul_ps(a1, aBy), sum1); - sum2 = 
_mm_add_ps(_mm_mul_ps(a2, aBy), sum2); - } - return(mm_sum(sum1)*mm_sum(sum2)); -#else - int i; - numeric_t out1 = 0.0; - numeric_t out2 = 0.0; - for (i=0; i < n; i++) { - out1 += f1[i]*fBy[i]; - out2 += f2[i]*fBy[i]; - } - return(out1*out2); -#endif -} - -numeric_t vector_sum(/*IN*/numeric_t *f1, int n) { -#ifdef USE_SSE3 - if (n==4) - return(f1[0]+f1[1]+f1[2]+f1[3]); - __m128 sum = _mm_setzero_ps(); - int i; - for (i = 0; i < n; i+=4) { - __m128 a; - a = _mm_load_ps(f1+i); - sum = _mm_add_ps(a, sum); - } - return(mm_sum(sum)); -#else - numeric_t out = 0.0; - int i; - for (i = 0; i < n; i++) - out += f1[i]; - return(out); -#endif -} - -void vector_multiply_by(/*IN/OUT*/numeric_t *f, /*IN*/numeric_t fBy, int n) { - int i; -#ifdef USE_SSE3 - __m128 c = _mm_set1_ps(fBy); - for (i = 0; i < n; i += 4) { - __m128 a, b; - a = _mm_load_ps(f+i); - b = _mm_mul_ps(a,c); - _mm_store_ps(f+i,b); - } -#else - for (i = 0; i < n; i++) - f[i] *= fBy; -#endif -} - -void vector_add_mult(/*IN/OUT*/numeric_t *fTot, /*IN*/numeric_t *fAdd, numeric_t weight, int n) { -#ifdef USE_SSE3 - int i; - __m128 w = _mm_set1_ps(weight); - for (i = 0; i < n; i += 4) { - __m128 tot, add; - tot = _mm_load_ps(fTot+i); - add = _mm_load_ps(fAdd+i); - _mm_store_ps(fTot+i, _mm_add_ps(tot, _mm_mul_ps(add,w))); - } -#else - int i; - for (i = 0; i < n; i++) - fTot[i] += fAdd[i] * weight; -#endif -} - -void matrixt_by_vector4(/*IN*/numeric_t mat[4][MAXCODES], /*IN*/numeric_t vec[4], /*OUT*/numeric_t out[4]) { -#ifdef USE_SSE3 - /*__m128 v = _mm_load_ps(vec);*/ - __m128 o = _mm_setzero_ps(); - int j; - /* result is a sum of vectors: sum(k) v[k] * mat[k][] */ - for (j = 0; j < 4; j++) { - __m128 m = _mm_load_ps(&mat[j][0]); - __m128 vj = _mm_load1_ps(&vec[j]); /* is it faster to shuffle v? 
*/ - o = _mm_add_ps(o, _mm_mul_ps(vj,m)); - } - _mm_store_ps(out, o); -#else - int j,k; - for (j = 0; j < 4; j++) { - double sum = 0; - for (k = 0; k < 4; k++) - sum += vec[k] * mat[k][j]; - out[j] = sum; - } -#endif -} - -transition_matrix_t *ReadAATransitionMatrix(/*IN*/char *filename) { - assert(nCodes==20); - double stat[20]; - static double matrix[MAXCODES][MAXCODES]; - static char buf[BUFFER_SIZE]; - FILE *fp = fopen(filename, "r"); - if (fp == NULL) { - fprintf(stderr, "Cannot read transition matrix file %s\n", filename); - exit(1); - } - char expected[2*MAXCODES+20]; - int posE = 0; - int i, j; - for (i = 0; i < 20; i++) { - expected[posE++] = codesStringAA[i]; - expected[posE++] = '\t'; - } - expected[posE++] = '*'; - expected[posE++] = '\n'; - expected[posE++] = '\0'; - - if (fgets(buf, sizeof(buf), fp) == NULL) { - fprintf(stderr, "Error reading header line from transition matrix file\n"); - exit(1); - } - if (strcmp(buf, expected) != 0) { - fprintf(stderr, "Invalid header line in transition matrix file, it must match:\n%s\n", expected); - exit(1); - } - for (i = 0; i < 20; i++) { - if (fgets(buf, sizeof(buf), fp) == NULL) { - fprintf(stderr, "Error reading matrix line\n"); - exit(1); - } - char *field = strtok(buf,"\t\r\n"); - if (field == NULL || strlen(field) != 1 || field[0] != codesStringAA[i]) { - fprintf(stderr, "Line for amino acid %c does not have the expected beginning\n", codesStringAA[i]); - exit(1); - } - for (j = 0; j < 20; j++) { - field = strtok(NULL, "\t\r\n"); - if (field == NULL) { - fprintf(stderr, "Not enough fields for amino acid %c\n", codesStringAA[i]); - exit(1); - } - matrix[i][j] = atof(field); - } - field = strtok(NULL, "\t\r\n"); - if (field == NULL) { - fprintf(stderr, "Not enough fields for amino acid %c\n", codesStringAA[i]); - exit(1); - } - stat[i] = atof(field); - } - - double tol = 1e-5; - /* Verify that stat is positive and sums to 1 */ - double statTot = 0; - for (i = 0; i < 20; i++) { - if (stat[i] < tol) { - 
fprintf(stderr, "stationary frequency for amino acid %c must be positive\n", codesStringAA[i]); - exit(1); - } - statTot += stat[i]; - } - if (fabs(statTot - 1) > tol) { - fprintf(stderr, "stationary frequencies must sum to 1 -- actual sum is %g\n", statTot); - exit(1); - } - - /* Verify that diagonals are negative and dot product of stat and diagonals is -1 */ - double totRate = 0; - for (i = 0; i < 20; i++) { - double diag = matrix[i][i]; - if (diag > -tol) { - fprintf(stderr, "transition rate(%c,%c) must be negative\n", - codesStringAA[i], codesStringAA[i]); - exit(1); - } - totRate += stat[i] * diag; - } - if (fabs(totRate + 1) > tol) { - fprintf(stderr, "Dot product of matrix diagonal and stationary frequencies must be -1 -- actual dot product is %g\n", - totRate); - exit(1); - } - - /* Verify that each off-diagonal entry is nonnegative and that each column sums to 0 */ - for (j = 0; j < 20; j++) { - double colSum = 0; - for (i = 0; i < 20; i++) { - double value = matrix[i][j]; - colSum += value; - if (i != j && value < 0) { - fprintf(stderr, "Off-diagonal matrix entry for (%c,%c) is negative\n", - codesStringAA[i], codesStringAA[j]); - exit(1); - } - } - if (fabs(colSum) > tol) { - fprintf(stderr, "Sum of column %c must be zero -- actual sum is %g\n", - codesStringAA[j], colSum); - exit(1); - } - } - return CreateTransitionMatrix(matrix, stat); -} - -distance_matrix_t matrixBLOSUM45 = - { - /*distances*/ - { - {0, 1.31097856157468, 1.06573001937323, 1.2682782988532, 0.90471293383305, 1.05855446876905, 1.05232790675508, 0.769574440593014, 1.27579668305679, 0.964604099952603, 0.987178199640556, 1.05007594438157, 1.05464162250736, 1.1985987403937, 0.967404475245526, 0.700490199584332, 0.880060189098976, 1.09748548316685, 1.28141710375267, 0.800038509951648}, - {1.31097856157468, 0, 0.8010890222701, 0.953340718498495, 1.36011107208122, 0.631543775840481, 0.791014908659279, 1.15694899265629, 0.761152570032029, 1.45014917711188, 1.17792001455227, 0.394661075648738, 
0.998807558909651, 1.135143404599, 1.15432562628921, 1.05309036790541, 1.05010474413616, 1.03938321130789, 0.963216908696184, 1.20274751778601}, - {1.06573001937323, 0.8010890222701, 0, 0.488217214273568, 1.10567116937273, 0.814970207038261, 0.810176440932339, 0.746487413974582, 0.61876156253224, 1.17886558630004, 1.52003670190022, 0.808442678243754, 1.2889025816028, 1.16264109995678, 1.18228799147301, 0.679475681649858, 0.853658619686283, 1.68988558988005, 1.24297493464833, 1.55207513886163}, - {1.2682782988532, 0.953340718498495, 0.488217214273568, 0, 1.31581050011876, 0.769778474953791, 0.482077627352988, 0.888361752320536, 0.736360849050364, 1.76756333403346, 1.43574761894039, 0.763612910719347, 1.53386612356483, 1.74323672079854, 0.886347403928663, 0.808614044804528, 1.01590147813779, 1.59617804551619, 1.1740494822217, 1.46600946033173}, - {0.90471293383305, 1.36011107208122, 1.10567116937273, 1.31581050011876, 0, 1.3836789310481, 1.37553994252576, 1.26740695314856, 1.32361065635259, 1.26087264215993, 1.02417540515351, 1.37259631233791, 1.09416720447891, 0.986982088723923, 1.59321190226694, 0.915638787768407, 0.913042853922533, 1.80744143643002, 1.3294417177004, 0.830022143283238}, - {1.05855446876905, 0.631543775840481, 0.814970207038261, 0.769778474953791, 1.3836789310481, 0, 0.506942797642807, 1.17699648087288, 0.614595446514896, 1.17092829494457, 1.19833088638994, 0.637341078675405, 0.806490842729072, 1.83315144709714, 0.932064479113502, 0.850321696813199, 1.06830084665916, 1.05739353225849, 0.979907428113788, 1.5416250309563}, - {1.05232790675508, 0.791014908659279, 0.810176440932339, 0.482077627352988, 1.37553994252576, 0.506942797642807, 0, 1.17007322676118, 0.769786956320484, 1.46659942462342, 1.19128214039009, 0.633592151371708, 1.27269395724349, 1.44641491621774, 0.735428579892476, 0.845319988414402, 1.06201695511881, 1.324395996498, 1.22734387448031, 1.53255698189437}, - {0.769574440593014, 1.15694899265629, 0.746487413974582, 0.888361752320536, 
1.26740695314856, 1.17699648087288, 1.17007322676118, 0, 1.1259007054424, 1.7025415585924, 1.38293205218175, 1.16756929156758, 1.17264582493965, 1.33271035269688, 1.07564768421292, 0.778868281341681, 1.23287107008366, 0.968539655354582, 1.42479529031801, 1.41208067821187}, - {1.27579668305679, 0.761152570032029, 0.61876156253224, 0.736360849050364, 1.32361065635259, 0.614595446514896, 0.769786956320484, 1.1259007054424, 0, 1.4112324673522, 1.14630894167097, 0.967795284542623, 0.771479459384692, 1.10468029976148, 1.12334774065132, 1.02482926701639, 1.28754326478771, 1.27439749294131, 0.468683841672724, 1.47469999960758}, - {0.964604099952603, 1.45014917711188, 1.17886558630004, 1.76756333403346, 1.26087264215993, 1.17092829494457, 1.46659942462342, 1.7025415585924, 1.4112324673522, 0, 0.433350517223017, 1.463460928818, 0.462965544381851, 0.66291968000662, 1.07010201755441, 1.23000200130049, 0.973485453109068, 0.963546200571036, 0.708724769805536, 0.351200119909572}, - {0.987178199640556, 1.17792001455227, 1.52003670190022, 1.43574761894039, 1.02417540515351, 1.19833088638994, 1.19128214039009, 1.38293205218175, 1.14630894167097, 0.433350517223017, 0, 1.49770950074319, 0.473800072611076, 0.538473125003292, 1.37979627224964, 1.5859723170438, 0.996267398224516, 0.986095542821092, 0.725310666139274, 0.570542199221932}, - {1.05007594438157, 0.394661075648738, 0.808442678243754, 0.763612910719347, 1.37259631233791, 0.637341078675405, 0.633592151371708, 1.16756929156758, 0.967795284542623, 1.463460928818, 1.49770950074319, 0, 1.0079761868248, 1.44331961488922, 0.924599080166146, 1.06275728888356, 1.05974425835993, 1.04892430642749, 0.972058829603409, 1.21378822764856}, - {1.05464162250736, 0.998807558909651, 1.2889025816028, 1.53386612356483, 1.09416720447891, 0.806490842729072, 1.27269395724349, 1.17264582493965, 0.771479459384692, 0.462965544381851, 0.473800072611076, 1.0079761868248, 0, 0.72479754849538, 1.1699868662153, 1.34481214251794, 1.06435197383538, 
1.05348497728858, 0.774878150710318, 0.609532859331199}, - {1.1985987403937, 1.135143404599, 1.16264109995678, 1.74323672079854, 0.986982088723923, 1.83315144709714, 1.44641491621774, 1.33271035269688, 1.10468029976148, 0.66291968000662, 0.538473125003292, 1.44331961488922, 0.72479754849538, 0, 1.32968844979665, 1.21307373491949, 0.960087571600877, 0.475142555482979, 0.349485367759138, 0.692733248746636}, - {0.967404475245526, 1.15432562628921, 1.18228799147301, 0.886347403928663, 1.59321190226694, 0.932064479113502, 0.735428579892476, 1.07564768421292, 1.12334774065132, 1.07010201755441, 1.37979627224964, 0.924599080166146, 1.1699868662153, 1.32968844979665, 0, 0.979087429691819, 0.97631161216338, 1.21751652292503, 1.42156458605332, 1.40887880416009}, - {0.700490199584332, 1.05309036790541, 0.679475681649858, 0.808614044804528, 0.915638787768407, 0.850321696813199, 0.845319988414402, 0.778868281341681, 1.02482926701639, 1.23000200130049, 1.5859723170438, 1.06275728888356, 1.34481214251794, 1.21307373491949, 0.979087429691819, 0, 0.56109848274013, 1.76318885009194, 1.29689226231656, 1.02015839286433}, - {0.880060189098976, 1.05010474413616, 0.853658619686283, 1.01590147813779, 0.913042853922533, 1.06830084665916, 1.06201695511881, 1.23287107008366, 1.28754326478771, 0.973485453109068, 0.996267398224516, 1.05974425835993, 1.06435197383538, 0.960087571600877, 0.97631161216338, 0.56109848274013, 0, 1.39547634461879, 1.02642577026706, 0.807404666228614}, - {1.09748548316685, 1.03938321130789, 1.68988558988005, 1.59617804551619, 1.80744143643002, 1.05739353225849, 1.324395996498, 0.968539655354582, 1.27439749294131, 0.963546200571036, 0.986095542821092, 1.04892430642749, 1.05348497728858, 0.475142555482979, 1.21751652292503, 1.76318885009194, 1.39547634461879, 0, 0.320002937404137, 1.268589159299}, - {1.28141710375267, 0.963216908696184, 1.24297493464833, 1.1740494822217, 1.3294417177004, 0.979907428113788, 1.22734387448031, 1.42479529031801, 0.468683841672724, 
0.708724769805536, 0.725310666139274, 0.972058829603409, 0.774878150710318, 0.349485367759138, 1.42156458605332, 1.29689226231656, 1.02642577026706, 0.320002937404137, 0, 0.933095433689795}, - {0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795, 0} - }, - /*eigeninv*/ - { - {-0.216311217101265, -0.215171653035930, -0.217000020881064, -0.232890860601250, -0.25403526530177, -0.211569372858927, -0.218073620637049, -0.240585637190076, -0.214507049619293, -0.228476323330312, -0.223235445346107, -0.216116483840334, -0.206903836810903, -0.223553828183343, -0.236937609127783, -0.217652789023588, -0.211982652566286, -0.245995223308316, -0.206187718714279, -0.227670670439422}, - {-0.0843931919568687, -0.0342164464991033, 0.393702284928246, -0.166018266253027, 0.0500896782860136, -0.262731388032538, 0.030139964190519, -0.253997503551094, -0.0932603349591988, -0.32884667697173, 0.199966846276877, -0.117543453869516, 0.196248237055757, -0.456448703853250, 0.139286961076387, 0.241166801918811, -0.0783508285295053, 0.377438091416498, 0.109499076984234, 0.128581669647144}, - {-0.0690428674271772, 0.0133858672878363, -0.208289917312908, 0.161232925220819, 0.0735806288007248, -0.316269599838174, -0.0640708424745702, -0.117078801507436, 0.360805085405857, 0.336899760384943, 0.0332447078185156, 0.132954055834276, 0.00595209121998118, -0.157755611190327, -0.199839273133436, 0.193688928807663, 0.0970290928040946, 0.374683975138541, -0.478110944870958, -0.243290196936098}, - {0.117284581850481, 0.310399467781876, -0.143513477698805, 0.088808130300351, 0.105747812943691, -0.373871701179853, 0.189069306295134, 0.133258225034741, -0.213043549687694, 0.301303731259140, -0.182085224761849, 
-0.161971915020789, 0.229301173581378, -0.293586313243755, -0.0260480060747498, -0.0217953684540699, 0.0202675755458796, -0.160134624443657, 0.431950096999465, -0.329885160320501}, - {0.256496969244703, 0.0907408349583135, 0.0135731083898029, 0.477557831930769, -0.0727379669280703, 0.101732675207959, -0.147293025369251, -0.348325291603251, -0.255678082078362, -0.187092643740172, -0.177164064346593, -0.225921480146133, 0.422318841046522, 0.319959853469398, -0.0623652546300045, 0.0824203908606883, -0.102057926881110, 0.120728407576411, -0.156845807891241, -0.123528163091204}, - {-0.00906668858975576, -0.0814722888231236, -0.0762715085459023, 0.055819989938286, -0.0540516675257271, -0.0070589302769034, -0.315813159989213, -0.0103527463419808, -0.194634331372293, -0.0185860407566822, 0.50134169352609, 0.384531812730061, -0.0405008616742061, 0.0781033650669525, 0.069334900096687, 0.396455180448549, -0.204065801866462, -0.215272089630713, 0.171046818996465, -0.396393364716348}, - {0.201971098571663, 0.489747667606921, 0.00226258734592836, 0.0969514005747054, 0.0853921636903791, 0.0862068740282345, -0.465412154271164, -0.130516676347786, 0.165513616974634, 0.0712238027886633, 0.140746943067963, -0.325919272273406, -0.421213488261598, -0.163508199065965, 0.269695802810568, -0.110296405171437, -0.106834099902202, 0.00509414588152415, 0.00909215239544615, 0.0500401865589727}, - {0.515854176692456, -0.087468413428258, 0.102796468891449, -0.06046105990993, -0.212014383772414, -0.259853648383794, -0.0997372883043333, -0.109934574535736, 0.284891018406112, -0.250578342940183, 0.142174204994568, 0.210384918947619, 0.118803190788946, -0.0268434355996836, 0.0103721198836548, -0.355555176478458, 0.428042332431476, -0.150610175411631, 0.0464090887952940, -0.140238796382057}, - {-0.239392215229762, -0.315483492656425, 0.100205194952396, 0.197830195325302, 0.40178804665223, 0.195809461460298, -0.407817115321684, 0.0226836686147386, -0.169780276210306, 0.0818161585952184, 
-0.172886230584939, 0.174982644851064, 0.0868786992159535, -0.198450519980824, 0.168581078329968, -0.361514336004068, 0.238668430084722, 0.165494019791904, 0.110437707249228, -0.169592003035203}, - {-0.313151735678025, 0.10757884850664, -0.49249098807229, 0.0993472335619114, -0.148695715250836, 0.0573801136941699, -0.190040373500722, 0.254848437434773, 0.134147888304352, -0.352719341442756, 0.0839609323513986, -0.207904182300122, 0.253940523323376, -0.109832138553288, 0.0980084518687944, 0.209026594443723, 0.406236051871548, -0.0521120230935943, 0.0554108014592302, 0.134681046631955}, - {-0.102905214421384, 0.235803606800009, 0.213414976431981, -0.253606415825635, 0.00945656859370683, 0.259551282655855, 0.159527348902192, 0.083218761193016, -0.286815935191867, 0.0135069477264877, 0.336758103107357, -0.271707359524149, -0.0400009875851839, 0.0871186292716414, -0.171506310409388, -0.0954276577211755, 0.393467571460712, 0.111732846649458, -0.239886066474217, -0.426474828195231}, - {-0.0130795552324104, 0.0758967690968058, -0.165099404017689, -0.46035152559912, 0.409888158016031, -0.0235053940299396, 0.0699393201709723, -0.161320910316996, 0.226111732196825, -0.177811841258496, -0.219073917645916, -0.00703219376737286, 0.162831878334912, 0.271670554900684, 0.451033612762052, 0.0820942662443393, -0.0904983490498446, -0.0587000279313978, -0.0938852980928252, -0.306078621571843}, - {0.345092040577428, -0.257721588971295, -0.301689123771848, -0.0875212184538126, 0.161012613069275, 0.385104899829821, 0.118355290985046, -0.241723794416731, 0.083201920119646, -0.0809095291508749, -0.0820275390511991, -0.115569770103317, -0.250105681098033, -0.164197583037664, -0.299481453795592, 0.255906951902366, 0.129042051416371, 0.203761730442746, 0.347550071284268, -0.109264854744020}, - {0.056345924962239, 0.072536751679082, 0.303127492633681, -0.368877185781648, -0.343024497082421, 0.206879529669083, -0.413012709639426, 0.078538816203612, 0.103382383425097, 0.288319996147499, 
-0.392663258459423, 0.0319588502083897, 0.220316797792669, -0.0563686494606947, -0.0869286063283735, 0.323677017794391, 0.0984875197088935, -0.0303289828821742, 0.0450197853450979, -0.0261771221270139}, - {-0.253701638374729, -0.148922815783583, 0.111794052194159, 0.157313977830326, -0.269846001260543, -0.222989872703583, 0.115441028189268, -0.350456582262355, -0.0409581422905941, 0.174078744248002, -0.130673397086811, -0.123963802708056, -0.351609207081548, 0.281548012920868, 0.340382662112428, 0.180262131025562, 0.3895263830793, 0.0121546812430960, 0.214830943227063, -0.0617782909660214}, - {-0.025854479416026, 0.480654788977767, -0.138024550829229, -0.130191670810919, 0.107816875829919, -0.111243997319276, -0.0679814460571245, -0.183167991080677, -0.363355166018786, -0.183934891092050, -0.216097125080962, 0.520240628803255, -0.179616013606479, 0.0664131536100941, -0.178350708111064, 0.0352047611606709, 0.223857228692892, 0.128363679623513, -0.000403433628490731, 0.224972110977704}, - {0.159207394033448, -0.0371517305736114, -0.294302634912281, -0.0866954375908417, -0.259998567870054, 0.284966673982689, 0.205356416771391, -0.257613708650298, -0.264820519037270, 0.293359248624603, 0.0997476397434102, 0.151390539497369, 0.165571346773648, -0.347569523551258, 0.43792310820533, -0.0723248163210163, 0.0379214984816955, -0.0542758730251438, -0.258020301801603, 0.128680501102363}, - {0.316853842351797, -0.153950010941153, -0.13387065213508, -0.0702971390607613, -0.202558481846057, -0.172941438694837, -0.068882524588574, 0.524738203063889, -0.271670479920716, -0.112864756695310, -0.146831636946145, -0.0352336188578041, -0.211108490884767, 0.097857111349555, 0.276459740956662, 0.0231297536754823, -0.0773173324868396, 0.487208384389438, -0.0734191389266824, -0.113198765573319}, - {-0.274285525741087, 0.227334266052039, -0.0973746625709059, -0.00965256583655389, -0.402438444750043, 0.198586229519026, 0.0958135064575833, -0.108934376958686, 0.253641732094319, 
-0.0551918478254021, 0.0243640218331436, 0.181936272247179, 0.090952738347629, 0.0603352483029044, -0.0043821671755761, -0.347720824658591, -0.267879988539971, 0.403804652116592, 0.337654323971186, -0.241509293972297}, - {-0.0197089518344238, 0.139681034626696, 0.251980475788267, 0.341846624362846, -0.075141195125153, 0.2184951591319, 0.268870823491343, 0.150392399018138, 0.134592404015057, -0.337050200539163, -0.313109373497998, 0.201993318439135, -0.217140733851970, -0.337622749083808, 0.135253284365068, 0.181729249828045, -0.00627813335422765, -0.197218833324039, -0.194060005031698, -0.303055888528004} - }, - /*eigenval*/ - { - 20.29131, 0.5045685, 0.2769945, 0.1551147, 0.03235484, -0.04127639, -0.3516426, -0.469973, -0.5835191, -0.6913107, -0.7207972, -0.7907875, -0.9524307, -1.095310, -1.402153, -1.424179, -1.936704, -2.037965, -3.273561, -5.488734 - }, - /*eigentot and codeFreq left out, these are initialized elsewhere*/ - }; - -/* The JTT92 matrix, D. T. Jones, W. R. Taylor, & J. M. Thorton, CABIOS 8:275 (1992) - Derived from the PhyML source code (models.c) by filling in the other side of the symmetric matrix, - scaling the entries by the stationary rate (to give the rate of a->b not b|a), to set the diagonals - so the rows sum to 0, to rescale the matrix so that the implied rate of evolution is 1. - The resulting matrix is the transpose (I think). 
-*/ -#if 0 -{ - int i,j; - for (i=0; i<20; i++) for (j=0; j<i; j++) daa[j*20+i] = daa[i*20+j]; - for (i = 0; i < 20; i++) for (j = 0; j < 20; j++) daa[i*20+j] *= pi[j] / 100.0; - double mr = 0; /* mean rate */ - for (i = 0; i < 20; i++) { - double sum = 0; - for (j = 0; j < 20; j++) - sum += daa[i*20+j]; - daa[i*20+i] = -sum; - mr += pi[i] * sum; - } - for (i = 0; i < 20*20; i++) - daa[i] /= mr; -} -#endif - -double statJTT92[MAXCODES] = {0.07674789,0.05169087,0.04264509,0.05154407,0.01980301,0.04075195,0.06182989,0.07315199,0.02294399,0.05376110,0.09190390,0.05867583,0.02382594,0.04012589,0.05090097,0.06876503,0.05856501,0.01426057,0.03210196,0.06600504}; -double matrixJTT92[MAXCODES][MAXCODES] = { - { -1.247831,0.044229,0.041179,0.061769,0.042704,0.043467,0.08007,0.136501,0.02059,0.027453,0.022877,0.02669,0.041179,0.011439,0.14794,0.288253,0.362223,0.006863,0.008388,0.227247 }, - { 0.029789,-1.025965,0.023112,0.008218,0.058038,0.159218,0.014895,0.070364,0.168463,0.011299,0.019517,0.33179,0.022599,0.002568,0.038007,0.051874,0.032871,0.064714,0.010272,0.008731 }, - { 0.022881,0.019068,-1.280568,0.223727,0.014407,0.03644,0.024576,0.034322,0.165676,0.019915,0.005085,0.11144,0.012712,0.004237,0.006356,0.213134,0.098304,0.00339,0.029661,0.00678 }, - { 0.041484,0.008194,0.270413,-1.044903,0.005121,0.025095,0.392816,0.066579,0.05736,0.005634,0.003585,0.013316,0.007682,0.002049,0.007682,0.030217,0.019462,0.002049,0.023559,0.015877 }, - { 0.011019,0.022234,0.00669,0.001968,-0.56571,0.001771,0.000984,0.011609,0.013577,0.003345,0.004526,0.001377,0.0061,0.015348,0.002755,0.043878,0.008264,0.022628,0.041124,0.012199 }, - { 0.02308,0.125524,0.034823,0.019841,0.003644,-1.04415,0.130788,0.010528,0.241735,0.003644,0.029154,0.118235,0.017411,0.00162,0.066406,0.021461,0.020651,0.007288,0.009718,0.008098 }, - { 
0.064507,0.017816,0.035632,0.471205,0.003072,0.198435,-0.944343,0.073107,0.015973,0.007372,0.005529,0.111197,0.011058,0.003072,0.011058,0.01843,0.019659,0.006143,0.0043,0.027646 }, - { 0.130105,0.099578,0.058874,0.09449,0.042884,0.018898,0.086495,-0.647831,0.016717,0.004361,0.004361,0.019625,0.010176,0.003634,0.017444,0.146096,0.023986,0.039976,0.005815,0.034162 }, - { 0.006155,0.074775,0.089138,0.025533,0.01573,0.1361,0.005927,0.005243,-1.135695,0.003648,0.012767,0.010259,0.007523,0.009119,0.026217,0.016642,0.010487,0.001824,0.130629,0.002508 }, - { 0.01923,0.011752,0.025106,0.005876,0.009081,0.004808,0.00641,0.003205,0.008547,-1.273602,0.122326,0.011218,0.25587,0.047542,0.005342,0.021367,0.130873,0.004808,0.017094,0.513342 }, - { 0.027395,0.0347,0.010958,0.006392,0.021003,0.065748,0.008219,0.005479,0.051137,0.209115,-0.668139,0.012784,0.354309,0.226465,0.093143,0.053877,0.022829,0.047485,0.021916,0.16437 }, - { 0.020405,0.376625,0.153332,0.015158,0.004081,0.170239,0.105525,0.015741,0.026235,0.012243,0.008162,-0.900734,0.037896,0.002332,0.012243,0.027401,0.06005,0.00583,0.004664,0.008162 }, - { 0.012784,0.010416,0.007102,0.003551,0.007339,0.01018,0.004261,0.003314,0.007812,0.113397,0.091854,0.015388,-1.182051,0.01018,0.003788,0.006865,0.053503,0.005682,0.004261,0.076466 }, - { 0.00598,0.001993,0.003987,0.001595,0.031098,0.001595,0.001993,0.001993,0.015948,0.035484,0.098877,0.001595,0.017144,-0.637182,0.006778,0.03668,0.004784,0.021131,0.213701,0.024719 }, - { 0.098117,0.037426,0.007586,0.007586,0.007081,0.082944,0.009104,0.012138,0.058162,0.005058,0.051587,0.010621,0.008092,0.008598,-0.727675,0.144141,0.059679,0.003035,0.005058,0.011632 }, - { 0.258271,0.069009,0.343678,0.040312,0.152366,0.036213,0.020498,0.137334,0.049878,0.02733,0.040312,0.032113,0.019814,0.06286,0.194728,-1.447863,0.325913,0.023914,0.043045,0.025964 }, - { 
0.276406,0.037242,0.135003,0.022112,0.02444,0.029677,0.018621,0.019203,0.026768,0.142567,0.014548,0.059936,0.131511,0.006983,0.068665,0.27757,-1.335389,0.006983,0.01222,0.065174 }, - { 0.001275,0.017854,0.001134,0.000567,0.016295,0.002551,0.001417,0.007793,0.001134,0.001275,0.007368,0.001417,0.003401,0.00751,0.00085,0.004959,0.0017,-0.312785,0.010061,0.003542 }, - { 0.003509,0.006379,0.022328,0.014673,0.066664,0.007655,0.002233,0.002552,0.182769,0.010207,0.007655,0.002552,0.005741,0.170967,0.00319,0.020095,0.006698,0.022647,-0.605978,0.005103 }, - { 0.195438,0.011149,0.010493,0.020331,0.040662,0.013117,0.029512,0.030824,0.007214,0.630254,0.11805,0.009182,0.211834,0.040662,0.015084,0.024922,0.073453,0.016396,0.010493,-1.241722 } -}; - -double statWAG01[MAXCODES] = {0.0866279,0.043972, 0.0390894,0.0570451,0.0193078,0.0367281,0.0580589,0.0832518,0.0244314,0.048466, 0.086209, 0.0620286,0.0195027,0.0384319,0.0457631,0.0695179,0.0610127,0.0143859,0.0352742,0.0708956}; -double matrixWAG01[MAXCODES][MAXCODES] = { - {-1.117151, 0.050147, 0.046354, 0.067188, 0.093376, 0.082607, 0.143908, 0.128804, 0.028817, 0.017577, 0.036177, 0.082395, 0.081234, 0.019138, 0.130789, 0.306463, 0.192846, 0.010286, 0.021887, 0.182381}, - {0.025455, -0.974318, 0.029321, 0.006798, 0.024376, 0.140086, 0.020267, 0.026982, 0.098628, 0.008629, 0.022967, 0.246964, 0.031527, 0.004740, 0.031358, 0.056495, 0.025586, 0.053714, 0.017607, 0.011623}, - {0.020916, 0.026065, -1.452438, 0.222741, 0.010882, 0.063328, 0.038859, 0.046176, 0.162306, 0.022737, 0.005396, 0.123567, 0.008132, 0.003945, 0.008003, 0.163042, 0.083283, 0.002950, 0.044553, 0.008051}, - {0.044244, 0.008819, 0.325058, -0.989665, 0.001814, 0.036927, 0.369645, 0.051822, 0.055719, 0.002361, 0.005077, 0.028729, 0.006212, 0.002798, 0.025384, 0.064166, 0.022443, 0.007769, 0.019500, 0.009120}, - {0.020812, 0.010703, 0.005375, 0.000614, -0.487357, 0.002002, 0.000433, 0.006214, 0.005045, 0.003448, 0.007787, 0.001500, 0.007913, 0.008065, 0.002217, 
0.028525, 0.010395, 0.014531, 0.011020, 0.020307}, - {0.035023, 0.117008, 0.059502, 0.023775, 0.003809, -1.379785, 0.210830, 0.012722, 0.165524, 0.004391, 0.033516, 0.150135, 0.059565, 0.003852, 0.035978, 0.039660, 0.033070, 0.008316, 0.008777, 0.011613}, - {0.096449, 0.026759, 0.057716, 0.376214, 0.001301, 0.333275, -1.236894, 0.034593, 0.034734, 0.007763, 0.009400, 0.157479, 0.019202, 0.004944, 0.041578, 0.042955, 0.050134, 0.009540, 0.011961, 0.035874}, - {0.123784, 0.051085, 0.098345, 0.075630, 0.026795, 0.028838, 0.049604, -0.497615, 0.021792, 0.002661, 0.005356, 0.032639, 0.015212, 0.004363, 0.021282, 0.117240, 0.019732, 0.029444, 0.009052, 0.016361}, - {0.008127, 0.054799, 0.101443, 0.023863, 0.006384, 0.110105, 0.014616, 0.006395, -0.992342, 0.003543, 0.012807, 0.022832, 0.010363, 0.017420, 0.017851, 0.018979, 0.012136, 0.006733, 0.099319, 0.003035}, - {0.009834, 0.009511, 0.028192, 0.002006, 0.008654, 0.005794, 0.006480, 0.001549, 0.007029, -1.233162, 0.161294, 0.016472, 0.216559, 0.053891, 0.005083, 0.016249, 0.074170, 0.010808, 0.021372, 0.397837}, - {0.036002, 0.045028, 0.011900, 0.007673, 0.034769, 0.078669, 0.013957, 0.005547, 0.045190, 0.286902, -0.726011, 0.023303, 0.439180, 0.191376, 0.037625, 0.031191, 0.029552, 0.060196, 0.036066, 0.162890}, - {0.058998, 0.348377, 0.196082, 0.031239, 0.004820, 0.253558, 0.168246, 0.024319, 0.057967, 0.021081, 0.016767, -1.124580, 0.060821, 0.005783, 0.036254, 0.062960, 0.090292, 0.008952, 0.008675, 0.019884}, - {0.018288, 0.013983, 0.004057, 0.002124, 0.007993, 0.031629, 0.006450, 0.003564, 0.008272, 0.087143, 0.099354, 0.019123, -1.322098, 0.024370, 0.003507, 0.010109, 0.031033, 0.010556, 0.008769, 0.042133}, - {0.008490, 0.004143, 0.003879, 0.001885, 0.016054, 0.004030, 0.003273, 0.002014, 0.027402, 0.042734, 0.085315, 0.003583, 0.048024, -0.713669, 0.006512, 0.022020, 0.006934, 0.061698, 0.260332, 0.026213}, - {0.069092, 0.032635, 0.009370, 0.020364, 0.005255, 0.044829, 0.032773, 0.011698, 0.033438, 0.004799, 
0.019973, 0.026747, 0.008229, 0.007754, -0.605590, 0.077484, 0.038202, 0.006695, 0.010376, 0.015124}, - {0.245933, 0.089317, 0.289960, 0.078196, 0.102703, 0.075066, 0.051432, 0.097899, 0.054003, 0.023306, 0.025152, 0.070562, 0.036035, 0.039831, 0.117705, -1.392239, 0.319421, 0.038212, 0.057419, 0.016981}, - {0.135823, 0.035501, 0.129992, 0.024004, 0.032848, 0.054936, 0.052685, 0.014461, 0.030308, 0.093371, 0.020915, 0.088814, 0.097083, 0.011008, 0.050931, 0.280341, -1.154973, 0.007099, 0.018643, 0.088894}, - {0.001708, 0.017573, 0.001086, 0.001959, 0.010826, 0.003257, 0.002364, 0.005088, 0.003964, 0.003208, 0.010045, 0.002076, 0.007786, 0.023095, 0.002105, 0.007908, 0.001674, -0.466694, 0.037525, 0.005516}, - {0.008912, 0.014125, 0.040205, 0.012058, 0.020133, 0.008430, 0.007267, 0.003836, 0.143398, 0.015555, 0.014757, 0.004934, 0.015861, 0.238943, 0.007998, 0.029135, 0.010779, 0.092011, -0.726275, 0.011652}, - {0.149259, 0.018739, 0.014602, 0.011335, 0.074565, 0.022417, 0.043805, 0.013932, 0.008807, 0.581952, 0.133956, 0.022726, 0.153161, 0.048356, 0.023429, 0.017317, 0.103293, 0.027186, 0.023418, -1.085487}, -}; - -/* Le-Gascuel 2008 model data from Harry Yoo - https://github.com/hyoo/FastTree -*/ -double statLG08[MAXCODES] = {0.079066, 0.055941, 0.041977, 0.053052, 0.012937, 0.040767, 0.071586, 0.057337, 0.022355, 0.062157, 0.099081, 0.0646, 0.022951, 0.042302, 0.04404, 0.061197, 0.053287, 0.012066, 0.034155, 0.069147}; - -double matrixLG08[MAXCODES][MAXCODES] = { - {-1.08959879,0.03361031,0.02188683,0.03124237,0.19680136,0.07668542,0.08211337,0.16335306,0.02837339,0.01184642,0.03125763,0.04242021,0.08887270,0.02005907,0.09311189,0.37375830,0.16916131,0.01428853,0.01731216,0.20144931}, - {0.02378006,-0.88334349,0.04206069,0.00693409,0.02990323,0.15707674,0.02036079,0.02182767,0.13574610,0.00710398,0.01688563,0.35388551,0.02708281,0.00294931,0.01860218,0.04800569,0.03238902,0.03320688,0.01759004,0.00955956}, - 
{0.01161996,0.03156149,-1.18705869,0.21308090,0.02219603,0.07118238,0.02273938,0.06034785,0.18928374,0.00803870,0.00287235,0.09004368,0.01557359,0.00375798,0.00679131,0.16825837,0.08398226,0.00190474,0.02569090,0.00351296}, - {0.02096312,0.00657599,0.26929909,-0.86328733,0.00331871,0.02776660,0.27819699,0.04482489,0.04918511,0.00056712,0.00079981,0.01501150,0.00135537,0.00092395,0.02092662,0.06579888,0.02259266,0.00158572,0.00716768,0.00201422}, - {0.03220119,0.00691547,0.00684065,0.00080928,-0.86781864,0.00109716,0.00004527,0.00736456,0.00828668,0.00414794,0.00768465,0.00017162,0.01156150,0.01429859,0.00097521,0.03602269,0.01479316,0.00866942,0.01507844,0.02534728}, - {0.03953956,0.11446966,0.06913053,0.02133682,0.00345736,-1.24953177,0.16830979,0.01092385,0.19623161,0.00297003,0.02374496,0.13185209,0.06818543,0.00146170,0.02545052,0.04989165,0.04403378,0.00962910,0.01049079,0.00857458}, - {0.07434507,0.02605508,0.03877888,0.37538659,0.00025048,0.29554848,-0.84254259,0.02497249,0.03034386,0.00316875,0.00498760,0.12936820,0.01243696,0.00134660,0.03002373,0.04380857,0.04327684,0.00557310,0.00859294,0.01754095}, - {0.11846020,0.02237238,0.08243001,0.04844538,0.03263985,0.01536392,0.02000178,-0.50414422,0.01785951,0.00049912,0.00253779,0.01700817,0.00800067,0.00513658,0.01129312,0.09976552,0.00744439,0.01539442,0.00313512,0.00439779}, - {0.00802225,0.05424651,0.10080372,0.02072557,0.01431930,0.10760560,0.00947583,0.00696321,-1.09324335,0.00243405,0.00818899,0.01558729,0.00989143,0.01524917,0.01137533,0.02213166,0.01306114,0.01334710,0.11863394,0.00266053}, - {0.00931296,0.00789336,0.01190322,0.00066446,0.01992916,0.00452837,0.00275137,0.00054108,0.00676776,-1.41499789,0.25764421,0.00988722,0.26563382,0.06916358,0.00486570,0.00398456,0.06425393,0.00694043,0.01445289,0.66191466}, - 
{0.03917027,0.02990732,0.00677980,0.00149374,0.05885464,0.05771026,0.00690325,0.00438541,0.03629495,0.41069624,-0.79375308,0.01362360,0.62543296,0.25688578,0.02467704,0.01806113,0.03001512,0.06139358,0.02968934,0.16870919}, - {0.03465896,0.40866276,0.13857164,0.01827910,0.00085698,0.20893479,0.11674330,0.01916263,0.04504313,0.01027583,0.00888247,-0.97644156,0.04241650,0.00154510,0.02521473,0.04836478,0.07344114,0.00322392,0.00852278,0.01196402}, - {0.02579765,0.01111131,0.00851489,0.00058635,0.02051079,0.03838702,0.00398738,0.00320253,0.01015515,0.09808327,0.14487451,0.01506968,-1.54195698,0.04128536,0.00229163,0.00796306,0.04636929,0.01597787,0.01104642,0.04357735}, - {0.01073203,0.00223024,0.00378708,0.00073673,0.04675419,0.00151673,0.00079574,0.00378966,0.02885576,0.04707045,0.10967574,0.00101178,0.07609486,-0.81061579,0.00399600,0.01530562,0.00697985,0.10394083,0.33011973,0.02769432}, - {0.05186360,0.01464471,0.00712508,0.01737179,0.00331981,0.02749383,0.01847072,0.00867414,0.02240973,0.00344749,0.01096857,0.01718973,0.00439734,0.00416018,-0.41664685,0.05893117,0.02516738,0.00418956,0.00394655,0.01305787}, - {0.28928853,0.05251612,0.24529879,0.07590089,0.17040121,0.07489439,0.03745080,0.10648187,0.06058559,0.00392302,0.01115539,0.04581702,0.02123285,0.02214217,0.08188943,-1.42842431,0.39608294,0.01522956,0.02451220,0.00601987}, - {0.11400727,0.03085239,0.10660988,0.02269274,0.06093244,0.05755704,0.03221430,0.00691855,0.03113348,0.05508469,0.01614250,0.06057985,0.10765893,0.00879238,0.03045173,0.34488735,-1.23444419,0.00750412,0.01310009,0.11660005}, - {0.00218053,0.00716244,0.00054751,0.00036065,0.00808574,0.00284997,0.00093936,0.00323960,0.00720403,0.00134729,0.00747646,0.00060216,0.00840002,0.02964754,0.00114785,0.00300276,0.00169919,-0.44275283,0.03802969,0.00228662}, - 
{0.00747852,0.01073967,0.02090366,0.00461457,0.03980863,0.00878929,0.00409985,0.00186756,0.18125441,0.00794180,0.01023445,0.00450612,0.01643896,0.26654152,0.00306072,0.01368064,0.00839668,0.10764993,-0.71435091,0.00851526}, - {0.17617706,0.01181629,0.00578676,0.00262530,0.13547871,0.01454379,0.01694332,0.00530363,0.00822937,0.73635171,0.11773937,0.01280613,0.13129028,0.04526924,0.02050210,0.00680190,0.15130413,0.01310401,0.01723920,-1.33539639} -};