package t_kmeans;
import java.io.*;
import java.util.*;
import java.text.DecimalFormat;
import java.util.List;
public class BasicKMeans {
static private String fname= "palsy.dat";
static List attributes = null;
static int maxNum = 0;
static DecimalFormat d2 = new DecimalFormat("##.####");
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// TODO code application logic here
// Get the data file name
// Read SVMlight format data (*.dat)
attributes = readSVMLightFile(fname);
if(attributes!=null){
System.out.println("Num of attributes : " + maxNum);
System.out.println("Num of instances : " + attributes.size());
Kmeans_cluster(attributes);
}
}// end of mains
// Implementing K-means
static void Kmeans_cluster(List attributes)
{
// Algorithm
// 1.Randomly generate k centroid points: C={C1,…,Ck}
// 2.Partition objects into k nonempty subsets (S1,…,Sk) by assigning each object xito the nearest centroid point by calculating distance metrics d(i,j) between the object xiand the centroid points Cj.
// 3.Stop if no more new assignment: output C, M(C), (S1,…,Sk)
// 4.Update k centroid points C1,…,Ck using the k subsets (S1,…,Sk)
// 5.Go back to Step 2
int k = 2;
double[][] centroids = new double[k][maxNum];
double[][] temporycentroids = new double[k][maxNum];
Vector[] cluster_Array = new Vector[k]; // creating k no of cluster
boolean flag = false;
// implementation
if (k > attributes.size())
System.exit(0);
System.out.println("The Total cluster number: " + k);
//1.Randomly generate k centroid points
centroids = randomCentroids(k);
// Step 2 and 3
// Calculate distance matric between each data point with centroids
do{
cluster_Array = cluster_Array(centroids,k);
//testing the number of k clusters
for(int idx = 0; idx < k; idx++)
{
System.out.println(cluster_Array[idx].size()+ " record " + " in Cluster "+ idx );
} // end of step 2 and 3
// searching new k number centroids points
int attribute_id = 1;
for (int idx = 0; idx < k; idx++)
{
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
temporycentroids [idx][idx1] = calculate_centroids(cluster_Array[idx],attribute_id);
attribute_id++;
}
attribute_id = 1;
}
for (int idx = 0; idx < k; idx++)
{
System.out.println("centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(temporycentroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
} // end of step 4
// Step 5
if (compare_centroids(centroids,temporycentroids,k,maxNum) == true)
{
flag = true;
System.out.println("Centroid points are same.");
for(int idx = 0; idx < k; idx++)
System.out.println(cluster_Array[idx].size()+ " record " + " in Cluster "+ idx );
for (int idx = 0; idx < k; idx++)
{
System.out.println("centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(temporycentroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
}
else // update centroids
{
System.out.println("centroid points are different.");
centroids = temporycentroids;
for (int idx = 0; idx < k; idx++)
{
System.out.println("updated centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(centroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
}
}while(flag == false);
}
// KmeanCluster
// calculate_centroids start
static double calculate_centroids(List attributes, int attribute_id){
int N = attributes.size();
double centroid = 0.00;
if(N > 0)
{
for(int idx = 0; idx < N; idx++)
centroid = centroid + Double.valueOf(d2.format(getAttributeValue(attributes,idx,attribute_id)));
centroid = Double.valueOf(d2.format(centroid / N));
}
return centroid;
} // end of calculate_centroids function
// start randomCentroids
static double[][] randomCentroids(int k){
int[] all = new int[k];
for (int idx=0; idx<k; idx++){
all[idx] = 0; // initialization
}
Random rd = new Random();
int i = 0;
while (i < k){
int selectedNumber = rd.nextInt(attributes.size());
if (adding(all,i,i+1))
i++;
}
System.out.println("The data point which is done random : " );
for( int idx1=0; idx1<k ; idx1++)
System.out.println(all[idx1]+1 + " ");
double [][] temp = new double [k][maxNum];
for(int row = 0; row< k; row++)
{ int attrib_id = 1;
for(int col = 0; col < maxNum; col++)
{
temp[row][col] = Double.valueOf(d2.format(getAttributeValue(attributes,all[row],attrib_id)));
attrib_id++;
}
}
for(int row = 0; row< k; row++)
{
System.out.print("centroid " + row + ": ");
for(int col = 0; col < maxNum; col++)
{
System.out.println(temp[row][col] + " ");
}
System.out.println();
}
return temp;
}
// end of randomCentroids function
// add function start
static boolean adding(int[] list, int size, int value){
for ( int idx=0; idx<size; idx++){
if (list[idx] == value) //Does random value exist or not in list[]?
return false; //if same
}
list[size] = value;
return true;
}
/*
* end of add function
*/
// start cluster_array function
static Vector[] cluster_Array(double [][] centroids, int k){
//create given number of cluster
Vector[] cluster_Array = new Vector[k]; // create k number of cluster array
int closet_cluster = 0; // to keep nearest centroid
double closet_distance = 0.0; // to keep nearest distance
for(int idx=0; idx < k; idx++)
cluster_Array[idx] = new Vector(); // insert each List into each cluster
for (int idx = 0; idx < attributes.size(); idx++)
{
List fv1 = (List)attributes.get(idx);
for (int idx1 = 0; idx1 < k; idx1++)
{
double[] fv2 = centroids[idx1];
double distance = Double.valueOf(d2.format(distanceMetric(fv1, fv2))); // round to 2 decimal double value
if (idx1==0)
{
closet_distance = distance;
closet_cluster = idx1;
}
else if (idx1 > 0)
{
if(distance <= closet_distance)
{
closet_distance = distance;
closet_cluster = idx1;
}
}
}
cluster_Array[closet_cluster].add((List)attributes.get(idx));
}
return cluster_Array;
}
/*
* Comparing clusters ... old cluster and new one
* implemented by Thu Thu Tun
*/
static boolean compare_centroids(double [][] c1, double[][] c2, int k, int maxNum){
for (int idx = 0; idx < k; idx++)
for (int idx1 = 0; idx1 < maxNum; idx1++)
if (c1[idx][idx1] != c2[idx][idx1])
return false;
System.out.println("temp centroid");
for (int idx = 0; idx < k; idx++)
{
System.out.println("temp centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(c1[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
return true;
}
/**
* Read SVMLight data file into Vector data structure
*
* @param fname
* @return List = a list of feature vectors = [ [ label, [id, value], [id, value],...], .... ]
* a feature vector = [ label, [id, value], [id, value],...]
* label = int
* id = double
* value = double
*/
private static List readSVMLightFile(String fname) {
try{
List featureV = new Vector();
// Reading input by lines:
BufferedReader in = new BufferedReader( new FileReader(fname) );
String str = new String("");
while ((str = in.readLine()) != null){
str = str.trim();
if(str.length() > 0){
String line[];
String fs[];
line = str.split("\\s",2);
String clabel = line[0].trim().replace("+", "");
fs = line[1].trim().split("\\s"); // feature list #:v ...
List fl = new Vector();
fl.add(Integer.parseInt(clabel));
for(int idx = 0; idx < fs.length; idx++)
{
double f[] = {0,0};
String attr[] = fs[idx].split(":");
int aid = Integer.valueOf(attr[0].trim());
if( aid > maxNum) maxNum = aid;
f[0] = aid; // attribute id
f[1] = Double.valueOf(attr[1].trim()); // attribute value
fl.add(f);
}
featureV.add(fl);
}
}
in.close();
return featureV;
}
catch(IOException e){
System.out.println("Error reading file: " + e);
return null;
}
}//end of read file
/**
* Calculate Euclidean distance metric between two sparse feature vectors
* @param i feature vector = [ label, [id, value], [id, value],...]
* @param j feature vector = [ label, [id, value], [id, value],...]
* modified by Thu Thu Tun
* @return
*/
static double distanceMetric(List fv1, double[] fv2){
double[] fv = new double[maxNum];
for(int idx = 0; idx < maxNum; idx++){
fv[idx] = 0;
}
Iterator itr = fv1.iterator();
int k = 0;
while(itr.hasNext())
{
if( k > 0){
double f[] = (double[])itr.next();
fv[(int)f[0]-1] = f[1];
}
else
itr.next();
k++;
}
for (int idx = 0; idx < maxNum; idx++)
{
fv[idx] -= fv2[idx];
}
double distance = 0;
for(int idx = 0; idx < maxNum; idx++)
{
distance += Double.valueOf(d2.format(fv[idx]*fv[idx]));
}
return Double.valueOf(d2.format(Math.sqrt(distance)));
}
/**
* Retrive the attribute value
* @param attributes
* @param data_index 0 based: 0,...,dataSize-1
* @param attribute_id 1 based: 1,...,dimension
* @return
*/
static double getAttributeValue(List attributes, int data_index, int attribute_id)
{
List fv = (List)attributes.get(data_index); // feature vector
Iterator itr = fv.iterator();
int k = 0;
while(itr.hasNext())
{
if( k > 0)
{
double f[] = (double[])itr.next();
if( attribute_id == (int)f[0] )
{
return f[1];
}
}
else
itr.next();
k++;
}
return 0;
}
/**
* Retrive the class label
* @param attributes
* @param data_index 0 based: 0,...,dataSize-1
* @return
*/
static int getClassLabel(List attributes, int data_index)
{
List fv = (List)attributes.get(data_index); // feature vector
return (Integer)fv.get(0);
}
}