GIPhy
MSTclust

Repository

javac MSTclust.java
echo Main-Class: MSTclust > MANIFEST.MF
jar -cmvf MANIFEST.MF MSTclust.jar MSTclust.class
rm MANIFEST.MF MSTclust.class
java -jar MSTclust.jar [options]
javac MSTclust.java
native-image MSTclust MSTclust
rm MSTclust.class
./MSTclust [options]
 MSTclust

 Minimum Spanning Tree-based clustering

 USAGE:  MSTclust  -i <infile>  -o <basename>  [options]

 OPTIONS:

 -i <infile>    input file containing  either tab-delimited  profiles or a
                lower-triangular distance matrix (mandatory)
 -o <basename>  basename for output files (mandatory)
 -r <string>    selecting only specified rows (default: "1-")
 -l <string>    (input tab-delimited profiles) field(s) containing profile
                labels (default: "1")
 -p <string>    (input tab-delimited profiles)  fields containing profiles
                (default: "2-")
 -e <float>     (input tab-delimited profiles)  maximum allowed proportion
                of empty entries per profile (default: 0.25)
 -c <float>     inclusive cutoff to define cluster(s) (default: not set)
 -S <integer>   seed value for data perturbation  and subsampling analyses
                (default: 0)
 -L <integer>   profile length  to carry  out data  perturbation  analysis
                (default: not set)
 -B <integer>   number of  bins to  carry out  data  subsampling  analyses
                (default: not set)
 -R <integer>   number of replicates for data perturbation and subsampling
                analyses (default: 100)
 -h             writing a single-linkage  hierarchical classification tree
                into an output file (default: not set)
 -m             writing  a  minimum  spanning  tree  into an  output  file
                (default: not set)
MSTclust  -i profiles.L1.P2-2039.tsv  -o data  -l 1  -p 2-2039  -r 2-  -e 0.05  -h  -m
no. fields              2038
reading file ...        [ok]
no. elements            414
8.19% missing entries   discarding profile 5 (5)
remaining elements      413
computing distances ... [ok]
distance matrix         data.d
hierarchical tree       data.nwk
minimum spanning tree   data.graphml
MSTclust  -i data.d  -o clust  -c 0.007
reading file ...        [ok]
no. elements            413
clustering ...          [ok]
no. classes             16
silhouette              0.552998
clustering info         clust.txt
MSTclust  -i data.d  -o clust  -c 0.007  -L 2038  -B 9  -R 1000
reading file ...        [ok]
no. elements            413
clustering ...          [ok]
no. classes             16
silhouette              0.552998
clustering info         clust.txt
noising ...             [ok]
noise silhouette        0.544606  [0.252816 , 0.877700]
noise aWallace1         0.702105  [0.306373 , 1.000000]
noise aWallace2         0.984209  [0.754643 , 1.000000]
subsampling 1/9 ...     10.0% (41)    S 0.5653 [0.0000 , 0.8534]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9987 [0.9863 , 1.0000]
subsampling 2/9 ...     20.0% (83)    S 0.5400 [0.4614 , 0.7296]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9985 [0.9933 , 1.0000]
subsampling 3/9 ...     30.0% (124)   S 0.5263 [0.4750 , 0.6392]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9987 [0.9953 , 1.0000]
subsampling 4/9 ...     40.0% (165)   S 0.5260 [0.4812 , 0.6056]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9989 [0.9961 , 1.0000]
subsampling 5/9 ...     50.0% (206)   S 0.5307 [0.4914 , 0.5882]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9991 [0.9971 , 1.0000]
subsampling 6/9 ...     60.0% (248)   S 0.5335 [0.4959 , 0.5843]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9992 [0.9977 , 1.0000]
subsampling 7/9 ...     70.0% (289)   S 0.5398 [0.5029 , 0.5821]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9995 [0.9983 , 1.0000]
subsampling 8/9 ...     80.0% (330)   S 0.5439 [0.5093 , 0.5788]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9997 [0.9986 , 1.0000]
subsampling 9/9 ...     90.0% (372)   S 0.5493 [0.5169 , 0.5738]  aW1 1.0000 [1.0000 , 1.0000]  aW2 0.9998 [0.9993 , 1.0000]
nAUC silhouette         0.768597  [0.729724 , 0.813833]
nAUC aWallace1          1.000000  [1.000000 , 1.000000]
nAUC aWallace2          0.999118  [0.996141 , 1.000000]
grep "data" data.graphml | awk -F '[<>]' '{print$3}' | sort -g | uniq |
  while read c ; do  MSTclust  -i data.d  -o out  -c $c  -L 2038  -B 9  -t ; done  2>/dev/null
MSTclust  -i data.d  -o clust  -c 0.016691213  -L 2038  -B 9  -t  2>&1  |  tail -2
n   c           k  silhouette  noise silhouette [low avg up]  noise aWallace1 [low avg up]  noise aWallace2 [low avg up]  nAUC silhouette [low avg up]  nAUC aWallace1 [low avg up]  nAUC aWallace2 [low avg up]
413 0.016691213 3  0.829778    0.763112 0.838848 0.918818     0.444911 0.977796 1.000000    0.998107 0.999338 1.000000    0.825384 0.899548 0.938995    1.000000 1.000000 1.000000   0.828055 0.978866 1.000000
grep -F " s=" clust.txt | sed 's/ s=//' | sed 1d