diff --git a/README.md b/README.md index e5d5100f245c7f665f8fa0f9defb9abe54784d42..0d8761712f9bf85b986ed8822c2876246c19f7a0 100644 --- a/README.md +++ b/README.md @@ -48,28 +48,31 @@ Launch _MSTclust_ without option to read the following documentation: OPTIONS: - -i <infile> input file containing either tab-delimited profiles or a + -i <infile> input file containing either tab-delimited profiles or a lower-triangular distance matrix (mandatory) -o <basenmae> basename for output files (mandatory) -r <string> selecting only specified rows (default: "1-") - -l <string> (input tab-delimited profiles) field(s) containing profile + -l <string> (input tab-delimited profiles) field(s) containing profile labels (default: "1") - -p <string> (input tab-delimited profiles) fields containing profiles + -p <string> (input tab-delimited profiles) fields containing profiles (default: "2-") - -e <float> (input tab-delimited profiles) maximum allowed proportion + -e <float> (input tab-delimited profiles) maximum allowed proportion of empty entries per profile (default: 0.25) -c <float> inclusive cutoff to define cluster(s) (default: not set) - -S <integer> seed value for data perturbation and subsampling analyses + -g <integer> generalized mean exponent for computing silhouettes; + negative or positive infinity can be set using -inf or inf, + respectively (default: 1 i.e. arithmetic mean) + -S <integer> seed value for data perturbation and subsampling analyses (default: 0) - -L <integer> profile length to carry out data perturbation analysis + -L <integer> profile length to carry out data perturbation analysis (default: not set) - -B <integer> number of bins to carry out data subsampling analyses + -B <integer> number of bins to carry out data subsampling analyses (default: not set) - -R <integer> number of replicates for data perturbation and subsampling + -R <integer> number of replicates for data perturbation and subsampling analyses (default: 100) - -h writing a single-linkage hierarchical classification tree + -h writing a single-linkage hierarchical classification tree into an output file (default: not set) - -m writing a minimum spanning tree into an output file + -m writing a minimum spanning tree into an output file (default: not set) ``` diff --git a/Technical.Notes.pdf b/Technical.Notes.pdf index 36111b14ac9179bcba3e94527e32dda69d4067eb..4bb66486c506f028b923280d1da45b3f73c34a50 100644 Binary files a/Technical.Notes.pdf and b/Technical.Notes.pdf differ diff --git a/src/MSTclust.java b/src/MSTclust.java index ff72b90b558ea3dc6e9197a33867f5399588aab0..e2c7a3f42926d8f87f1be4b5b923e2656b1dc645 100644 --- a/src/MSTclust.java +++ b/src/MSTclust.java @@ -26,19 +26,29 @@ ######################################################################################################## */ +/* + ######################################################################################################## + ## v0.2.200601ac + + new option -g to compute silhouettes using generalized mean with any integer exponent + ######################################################################################################## +*/ + import java.io.*; import java.util.*; public class MSTclust { //### constants ############################################################################################### - final static String VERSION = "0.1.200523ac"; + final static String VERSION = "0.2.200528ac"; final static String NOTHING = "N.o./.T.h.I.n.G"; - final static short EMPTY = 0; //Short.MIN_VALUE;; + final static short EMPTY = 0; final static String BLANK = " "; - final static double EPSILON = 0.000000001; + final static double EPSILON = 0.000000001; //## NOTE: little constant to increase the cutoff value + final static double DELTA = 0.00000000001; //## NOTE: little constant to deal with zero-values when computing geometric means final static double CI_MIN = 0.025; final static double CI_MAX = 0.975; + final static int NINF = Integer.MIN_VALUE; + final static int PINF = Integer.MAX_VALUE; //### io ###################################################################################################### static BufferedReader in; @@ -56,6 +66,7 @@ public class MSTclust { static boolean hctree; // option -h ; hierarchical tree (default: not set) static boolean mstree; // option -m ; minimum spanning tree (default: not set) static double cutoff; // option -c ; cutoff value for clustering (default: -1) + static int exponent; // option -g ; generalized mean exponent value for estimating silhouettes (default: 1) static boolean outtab; // option -t ; to output stats in tab-delimited format (default: not set) static int nloc; // option -L ; no. loci for noising (default: 0) static int bins; // option -B ; no. bins for subsampling (default: 0) @@ -115,28 +126,31 @@ public class MSTclust { System.out.println(""); System.out.println(" OPTIONS:"); System.out.println(""); - System.out.println(" -i <infile> input file containing either tab-delimited profiles or a"); + System.out.println(" -i <infile> input file containing either tab-delimited profiles or a"); System.out.println(" lower-triangular distance matrix (mandatory)"); System.out.println(" -o <basenmae> basename for output files (mandatory)"); System.out.println(" -r <string> selecting only specified rows (default: \"1-\")"); - System.out.println(" -l <string> (input tab-delimited profiles) field(s) containing profile"); + System.out.println(" -l <string> (input tab-delimited profiles) field(s) containing profile"); System.out.println(" labels (default: \"1\")"); - System.out.println(" -p <string> (input tab-delimited profiles) fields containing profiles"); + System.out.println(" -p <string> (input tab-delimited profiles) fields containing profiles"); System.out.println(" (default: \"2-\")"); - System.out.println(" -e <float> (input tab-delimited profiles) maximum allowed proportion"); + System.out.println(" -e <float> (input tab-delimited profiles) maximum allowed proportion"); System.out.println(" of empty entries per profile (default: 0.25)"); System.out.println(" -c <float> inclusive cutoff to define cluster(s) (default: not set)"); - System.out.println(" -S <integer> seed value for data perturbation and subsampling analyses"); + System.out.println(" -g <integer> generalized mean exponent for computing silhouettes;"); + System.out.println(" negative or positive infinity can be set using -inf or inf,"); + System.out.println(" respectively (default: 1 i.e. arithmetic mean)"); + System.out.println(" -S <integer> seed value for data perturbation and subsampling analyses"); System.out.println(" (default: 0)"); - System.out.println(" -L <integer> profile length to carry out data perturbation analysis"); + System.out.println(" -L <integer> profile length to carry out data perturbation analysis"); System.out.println(" (default: not set)"); - System.out.println(" -B <integer> number of bins to carry out data subsampling analyses"); + System.out.println(" -B <integer> number of bins to carry out data subsampling analyses"); System.out.println(" (default: not set)"); - System.out.println(" -R <integer> number of replicates for data perturbation and subsampling"); + System.out.println(" -R <integer> number of replicates for data perturbation and subsampling"); System.out.println(" analyses (default: 100)"); - System.out.println(" -h writing a single-linkage hierarchical classification tree"); + System.out.println(" -h writing a single-linkage hierarchical classification tree"); System.out.println(" into an output file (default: not set)"); - System.out.println(" -m writing a minimum spanning tree into an output file"); + System.out.println(" -m writing a minimum spanning tree into an output file"); System.out.println(" (default: not set)"); System.out.println(""); System.exit(0); @@ -156,26 +170,32 @@ public class MSTclust { hctree = false; // option -h mstree = false; // option -m cutoff = -1; // option -c + exponent = 1; // option -g nloc = 0; // option -L bins = 0; // option -B reps = 100; // option -R seed = 0; // option -S o = -1; while ( ++o < args.length ) { - if ( args[o].equals("-i") ) { infile = new File(args[++o]); continue; } - if ( args[o].equals("-o") ) { basename = args[++o]; continue; } - if ( args[o].equals("-l") ) { lfld = args[++o]; continue; } - if ( args[o].equals("-p") ) { pfld = args[++o]; continue; } - if ( args[o].equals("-r") ) { rows = args[++o]; continue; } - if ( args[o].equals("-t") ) { outtab = true; continue; } - if ( args[o].equals("-h") ) { hctree = true; continue; } - if ( args[o].equals("-m") ) { mstree = true; continue; } + if ( args[o].equals("-i") ) { infile = new File(args[++o]); continue; } + if ( args[o].equals("-o") ) { basename = args[++o]; continue; } + if ( args[o].equals("-l") ) { lfld = args[++o]; continue; } + if ( args[o].equals("-p") ) { pfld = args[++o]; continue; } + if ( args[o].equals("-r") ) { rows = args[++o]; continue; } + if ( args[o].equals("-t") ) { outtab = true; continue; } + if ( args[o].equals("-h") ) { hctree = true; continue; } + if ( args[o].equals("-m") ) { mstree = true; continue; } if ( args[o].equals("-e") ) { try { missp = Double.parseDouble(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -e)"); System.exit(1); } continue; } if ( args[o].equals("-c") ) { try { cutoff = Double.parseDouble(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -c)"); System.exit(1); } continue; } if ( args[o].equals("-L") ) { try { nloc = Integer.parseInt(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -L)"); System.exit(1); } continue; } if ( args[o].equals("-B") ) { try { bins = Integer.parseInt(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -B)"); System.exit(1); } continue; } if ( args[o].equals("-R") ) { try { reps = Integer.parseInt(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -R)"); System.exit(1); } continue; } if ( args[o].equals("-S") ) { try { seed = Long.parseLong(args[++o]); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -S)"); System.exit(1); } continue; } + if ( args[o].equals("-g") ) { + line=args[++o].toLowerCase(); + if ( line.equals("inf") || line.equals("+inf") ) { exponent = PINF; continue; } + if ( line.equals("-inf") ) { exponent = NINF; continue; } + try { exponent = Integer.parseInt(line); } catch ( NumberFormatException e ) { System.err.println("incorrect value (option -g)"); System.exit(1); } continue; } } //### testing mandatory options -i, and -o if ( infile.toString().equals(NOTHING) ) { System.err.println("input file not specified (option -i)"); System.exit(1); } @@ -409,7 +429,7 @@ public class MSTclust { //########################################################################################################## //### estimating silhouette indexes and sorting clusters ### //########################################################################################################## - silh = silhouette(clust, dm); + silh = silhouette(clust, dm, exponent); silhouette = avg(silh); outcaption = outcaption.append("\tsilhouette"); //## NOTE: updating outtab outstats = outstats.append(String.format(Locale.US, "\t%.6f", silhouette)); //## NOTE: updating outtab @@ -530,7 +550,7 @@ public class MSTclust { sdm = noisedm(dm, nloc, aurand); // noised distance matrix noiclust = mstc(sdm, cutoff); // clustering from the noised distances mm = abcd(contingency(noiclust, clust)); // mismatch matrix between the two partitions: noised vs. original - rsi[r] = avg(silhouette(noiclust, sdm)); + rsi[r] = avg(silhouette(noiclust, sdm, exponent)); aw = awal(mm); rw1[r] = aw[0]; rw2[r] = aw[1]; @@ -585,7 +605,7 @@ public class MSTclust { clustsub = mstc(sdm, cutoff); // clustering of the subsampled elements subclust = subpartition(clust, subsample); // restriction of the original clustering to the subsample elements mm = abcd(contingency(clustsub, subclust)); // mismatch matrix between the two partitions, clustsub vs. original - rsi[r] = (1 + avg(silhouette(clustsub, sdm))) / 2; //## NOTE: AUC are computed on (silhouette+1)/2, therefore varying between 0 and 1 + rsi[r] = (1 + avg(silhouette(clustsub, sdm, exponent))) / 2; //## NOTE: AUC are computed on (silhouette+1)/2, therefore varying between 0 and 1 aw = awal(mm); rw1[r] = aw[0]; rw2[r] = aw[1]; @@ -678,12 +698,6 @@ public class MSTclust { final static float dp(final short[] p1, final short[] p2) { int a1, a2, m = ((a1=p1.length) < (a2=p2.length)) ? a1 : a2, dn = m, up = 0; while ( --m >= 0 ) up += ( (a1=p1[m]) == EMPTY || (a2=p2[m]) == EMPTY ) ? (--dn) * 0 : ( a1 != a2 ) ? 0b1 : 0; //## fast - //while ( --m >= 0 ) dn = ( (a1=p1[m]) == EMPTY || (a2=p2[m]) == EMPTY ) ? (--dn) : ( a1 == a2 ) ? dn : dn + (++up) * 0; //## slow - //while ( --m >= 0 ) dn -= ( (a1=p1[m]) == EMPTY || (a2=p2[m]) == EMPTY ) ? 0b1 : ( a1 != a2 ) ? (++up) * 0 : 0; //## less fast - //while ( --m >= 0 ) up += ( ( ( (a1=p1[m]) != EMPTY && (a2=p2[m]) != EMPTY ) || --dn < 0 ) && a1 != a2 ) ? 0b1 : 0; //## less fast - //while ( --m >= 0 && ( ( (a1=p1[m]) == EMPTY || (a2=p2[m]) == EMPTY ) : --dn > 0 : ( a1 != a2 ) ? ++up > 0 : true ) ) {} //## quite fast - //while ( --m >= 0 && ( ( ( (a1=p1[m]) == EMPTY || (a2=p2[m]) == EMPTY ) && --dn > 0 ) || ( a1 == a2 ) || ++up > 0 ) ) {} //## less fast - //while ( --m >= 0 && ( (a1=p1[m]) != EMPTY && (a2=p2[m]) != EMPTY && ( a1 == a2 || ++up > 0 ) || --dn > 0 ) ) {} //## quite fast return (float) (up / (double) dn); } @@ -807,32 +821,6 @@ public class MSTclust { return dn; } - // returns a matrix of noised distances from the specified distance matrix dm - // noised distances dn ~ N(dm, vm), where: - // + dm is the original pairwise distance estimated from the specified no. loci (lgt) - // + vm = dm(1-dm)/lgt - // therefore dn = dm + sqrt(vm) * x, where x ~ N(0, 1) - final static float[][] noisedm(final float[][] dm, final int lgt, final long seed) { - Random rand = new Random(seed); - int n = dm.length; - float[][] dn = new float[n][]; - double dij, d, se; int j, i = n; - while ( (j=--i) >= 0 ) { - dn[i] = new float[i]; - while ( --j >= 0 ) - if ( (dij=dm[i][j]) != 0 ) { - se = Math.sqrt(dij*(1-dij)/lgt); - d = dij + se * rand.nextGaussian(); if ( d >= 0 ) { dn[i][j] = (float) d; continue; } - d = dij + se * rand.nextGaussian(); if ( d >= 0 ) { dn[i][j] = (float) d; continue; } - d = dij + se * rand.nextGaussian(); if ( d >= 0 ) { dn[i][j] = (float) d; continue; } - d = dij + se * rand.nextGaussian(); if ( d >= 0 ) { dn[i][j] = (float) d; continue; } - d = dij + se * rand.nextGaussian(); if ( d >= 0 ) { dn[i][j] = (float) d; continue; } - dn[i][j] = 0; - } - } - return dn; - } - // returns an array containing rate*size sorted integers randomly drawn in [0,size[ final static int[] subsample(final int size, final double rate, final long seed) { Random rand = new Random(seed); @@ -885,31 +873,50 @@ public class MSTclust { return cs; } - // returns the silhouette index for every element for the specified partition array from the specified distance matrix - final static double[] silhouette(final int[] partition, final float[][] dm) { + // returns the silhouette index for every element using the generalized mean for the specified partition array from the specified distance matrix + final static double[] silhouette(final int[] partition, final float[][] dm, final int exponent) { int n = partition.length; // no. elements double[] s = new double[n]; int k = psize(partition); // no. classes if ( k == 1 ) return s; int[] csize = csizes(partition); // size of each class - double[][] d = new double[n][k]; // d[i][c] = avg distance between element i and class c - double ai, bi, dic; - int c, ci, cj, j, i = n; + double[][] d = new double[n][k]; // d[i][c] = mean distance between element i and class c + double ai, bi, dic; int c, ci, cj, j, i = n; while ( (j=--i) >= 0 ) { + switch ( exponent ) { + case NINF: Arrays.fill(d[i], Double.MAX_VALUE); break; + case PINF: Arrays.fill(d[i], Double.MIN_VALUE); break; + } ci = partition[i]; while ( --j >= 0 ) { - cj = partition[j]; - d[i][cj] += (dic=dm[i][j]); - d[j][ci] += dic; + cj = partition[j]; dic = dm[i][j]; + switch ( exponent ) { + case -1: d[i][cj] += (ai=(1.0/dic)); d[j][ci] += ai; break; //## NOTE: harmonic mean + //case 0: d[i][cj] *= dic; d[j][ci] *= dic; break; //## NOTE: geometric mean + case 0: d[i][cj] += (ai=Math.log(dic+DELTA)); d[j][ci] += ai; break; //## NOTE: geometric mean + case 1: d[i][cj] += dic; d[j][ci] += dic; break; //## NOTE: arithmetic mean + case 2: d[i][cj] += (ai=(dic*dic)); d[j][ci] += ai; break; //## NOTE: quadratic mean + case NINF: d[i][cj] = ( (ai=d[i][cj]) < dic ) ? ai : dic; d[j][ci] = ( (ai=d[j][ci]) < dic ) ? ai : dic; break; //## NOTE: minimum + case PINF: d[i][cj] = ( (ai=d[i][cj]) > dic ) ? ai : dic; d[j][ci] = ( (ai=d[j][ci]) > dic ) ? ai : dic; break; //## NOTE: maximum + default: d[i][cj] += (ai=Math.pow(dic, exponent)); d[j][ci] += ai; break; //## NOTE: power mean + } } } i = n; while ( --i >= 0 ) { ci = partition[i]; if ( csize[ci] == 1 ) continue; - ai = d[i][ci] / (csize[ci] -1); - bi = Double.POSITIVE_INFINITY; c = k; - while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=d[i][c]/csize[c]) ) ? dic : bi; + ai = bi = Double.POSITIVE_INFINITY; c = k; + switch ( exponent ) { + case -1: ai = (csize[ci]-1) / d[i][ci]; while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=(csize[c]/d[i][c])) ) ? dic : bi; break; //## NOTE: harmonic mean + //case 0: ai = Math.pow(d[i][ci], 1.0/(csize[ci]-1)); while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=Math.pow(d[i][c], 1.0/csize[c])) ) ? dic : bi; break; //## NOTE: geometric mean + case 0: ai = Math.exp(d[i][ci]/(csize[ci]-1)) - DELTA; while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=(Math.exp(d[i][c]/csize[c])-DELTA)) ) ? dic : bi; break; //## NOTE: geometric mean + case 1: ai = d[i][ci] / (csize[ci]-1); while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=(d[i][c]/csize[c])) ) ? dic : bi; break; //## NOTE: arithmetic mean + case 2: ai = Math.sqrt(d[i][ci] / (csize[ci]-1)); while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=Math.sqrt(d[i][c]/csize[c])) ) ? dic : bi; break; //## NOTE: quadratic mean + case NINF: + case PINF: ai = d[i][ci]; while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=d[i][c]) ) ? dic : bi; break; //## NOTE: maximum/minimum + default: ai = Math.pow(d[i][ci]/(csize[ci]-1), 1.0/exponent); while ( --c >= 0 ) if ( c != ci ) bi = ( bi > (dic=Math.pow(d[i][c]/csize[c], 1.0/exponent)) ) ? dic : bi; break; //## NOTE: power mean + } s[i] = (ai == bi) ? 0 : (ai < bi) ? 1 - ai/bi : bi/ai - 1; } csize = null; @@ -1001,112 +1008,5 @@ public class MSTclust { return auc(x, y) / ( x[x.length-1] - x[0] ); } - - /* - // computes a single-linkage clustering with specified cutoff - // returns the partitions within an array - final static int[] slinkHC(final float[][] dm, final double cutoff) { - int i, n = dm.length; // size - float[][] d = Arrays.copyOf(dm, n); // dissimilarity copy - BitSet b = new BitSet(n); b.set(0, n); // true if cluster c(i) is non-empty - ArrayList<TreeSet<Integer>> c = new ArrayList<TreeSet<Integer>>(n); // clusters - int[] p = new int[n]; // partition array - i = -1; while ( ++i < n ) { c.add(new TreeSet<Integer>()); c.get(i).add(i); } - double dij, min, minmin = 0; int j, x = -1, y = -1; - double up, dn; int z; - while ( --n > 0 ) { - //### searching for x,y to agglomerate - min = cutoff; - i = -1; - while ( (i=b.nextSetBit(++i)) != (j=-1) && min > minmin ) - while ( (j=b.nextSetBit(++j)) < i && min > minmin ) - min = ( d[i][j] < min ) ? d[x=i][y=j] : min; - minmin = ( min >= minmin ) ? min : minmin; - if ( min == cutoff ) break; - //### updating d between x+y and every i != x,y - i = -1; - while ( (i=b.nextSetBit(++i)) != -1 ) - if ( i != x && i != y ) d[(i<x)?x:i][(i<x)?i:x] = Math.min(d[(i<x)?x:i][(i<x)?i:x], d[(i<y)?y:i][(i<y)?i:y]); - //### adding y into x, and clearing y - c.get(x).addAll(c.get(y)); - b.clear(y); - //c.get(y).clear(); // <= useless - } - i = x = -1; - while ( (i=b.nextSetBit(++i)) != -1 && ++x >= 0 ) - for (Integer e: c.get(i)) p[e] = x; - b = null; - c = null; - d = null; - //System.out.println(c.toString()); - //System.out.println(Arrays.toString(p)); - return p; - } - */ - /* - // computes the single-linkage clustering with specified cutoff using Kruskall algorithm - // returns the partitions within an array - final static int[] slinkK(final float[][] dm, final double cutoff) { - int n = dm.length; // no. elements - ArrayList<Set<Integer>> c = new ArrayList<Set<Integer>>(n); // clusters - int[] p = new int[n]; // partition id for each element - int ne = (--n) * (++n) / 2; // no. edges - float[][] e = new float[ne][3]; // edge list - int j, x = -1, i = -1; - while ( (j=++i) < n ) { - c.add(new HashSet<Integer>()); - c.get(i).add(i); - p[i] = i; - while ( --j >= 0 ) { - if ( (e[++x][2]=dm[i][j]) > cutoff ) { --x; continue; } - e[x][0] = i; - e[x][1] = j; - } - } - e = Arrays.copyOf(e, (ne=++x)); - Arrays.sort(e, Comparator.comparingDouble(a -> a[2])); - //## Kruskall - BitSet b = new BitSet(n); b.set(0, n); // active classes - x = -1; - while ( ++x < ne ) - if ( (i=p[(int)e[x][0]]) != (j=p[(int)e[x][1]]) ) { - for (int y: c.get(i)) p[y] = j; - c.get(j).addAll(c.get(i)); - b.clear(i); - c.get(i).clear(); - } - c = null; - e = null; - //System.out.println(Arrays.toString(p)); - int[] rwr = new int[b.cardinality()]; - i = j = -1; while ( (i=b.nextSetBit(++i)) >= 0 ) rwr[++j] = i; - while ( --n >= 0 ) p[n] = Arrays.binarySearch(rwr, p[n]); - b = null; - rwr = null; - //System.out.println(Arrays.toString(p)); - return p; - } - */ - /* - // returns a float array of specified size by splitting the specified String using blank space field separator and getting only specified fields - final static float[] splitRaw(final String line, final int size, final BitSet fields) { - float[] a = new float[size]; - StringBuilder sb = new StringBuilder(""); - int l = line.length(), i = -1, j = 0; - for (char ch: line.toCharArray()) { - if ( ch == ' ' ) { - if ( fields.get(++i) ) { - a[j] = (float) Double.parseDouble(sb.toString()); - if ( ++j == size ) break; - } - sb = sb.delete(0, sb.length()); - continue; - } - sb = sb.append(ch); - } - if ( j < size ) a[j] = (float) Double.parseDouble(sb.toString()); - return a; - } - */ }