Thursday 5 March 2009

K-means clustering in C++

The following program use in-line k-means algorithm to train the centroids of the K-means clustering.

Before using this code you have to change data loading function in loadTrainingData.

The beauty of this program is you do not need to worry about what is the best value of K. You could just run for range of K values (ExperimentBestKValue) and and choose the best perform K value to re-train and predict the your target (ExperimentBestK). A kind of Evolutionary algorithm... don't you agree?

// KMeanClustring.cpp : Defines the entry point for the console application.
// Author: Rudra Poudel

//include libraries
#include "stdafx.h"
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<float.h>
#include<conio.h>
#include<time.h>

//******************************
//** Setting input parameter
//******************************
#define EXPERIMENTAL_TRIALS 1000 //Numbers of trails to estimate the average value of K's success
#define EXPERIMENTAL_TRIALS_BEST_K 1 //10000 //Numbers of trails to estimate the best centroids with K centroids
#define EXPERIMENTAL_K_FROM 2 //Starting K value for experiment
#define EXPERIMENTAL_K_TO 100 //End K value for experiment
#define TESTING_SAMPLE_SIZE 50 //Testing sample size will be taken form the end of the input data
#define NUMBER_OF_TESTING_DATA 100 //Number of testing data
//#define K 20 // value of K
#define LEARNING_RATE 0.1 // Learning rate
#define MAX_TRAIN_ITERATION 50000 //Maximum nos of iteration to train the networks
#define INPUTS_COUNT 365 // Number of input patterns i.e. training sets
#define NN 6 // Number of inputs/neurons (6 Inputs )

#define PRINT_OPTION 2
//********************************

//Random generator parameters
#define IA 16807
#define IM 2147483647
#define AM (1. /IM)
#define IQ 127773
#define IR 2836
#define NTAB 32
#define NDIV (1+(IM-1)/NTAB)
#define EPS 1.2e-7
#define RNMX (1.0-EPS)
long idum;

enum PrintOptions
{
poNone = 0, //Dont print to DAT file
poTraining = 2 //Printing whole training process
};

//Function decleration
void ExperimentBestKValue();
void ExperimentBestK(int exp_k);
float getRand(long *idum);
bool loadTrainingData(double input[INPUTS_COUNT][NN], double targets[INPUTS_COUNT]);
void moveTowards(double centroid[NN], double datum[NN]);
int getClosestCentroid(double datum[NN], double** centroids, int maxK);
double getDistance(double datum[NN], double centroid[NN]);
bool isOldAndNewSame(double** oldCentroids, double** centroids, int maxK);
double assignClusterTarget(double inputs[INPUTS_COUNT][NN], double** centroids, double targets[INPUTS_COUNT], int maxK, int testingSampleSize);
double predictTarget(double inputs[INPUTS_COUNT][NN], double** centroids, double targets[INPUTS_COUNT], int maxK, int testingSampleSize);


///<FunctionName>_tmain</FunctionName>
///<arg name='argc'>Size of argv[]</arg>
///<arg name='argv[]'>Array inputs</arg>
///<Summary>Main entry point of the function. It will train the all possible target patterns for NN-1 inputs for 2-class problem.</Summary>
///<Output> 0 for success, 1 for fail.</Output>
int _tmain(int argc, _TCHAR* argv[])
{
//ExperimentBestKValue();
//ExperimentBestK(EXPERIMENTAL_TRIALS_BEST_K);
ExperimentBestK(23);
return 0;
}

///<FunctionName>ExperimentBestKValue</FunctionName>
///<Summary>Function to experiment.</Summary>
///<Output>SuccessPecentage.dat- which contains average success percent for different value of K</Output>
void ExperimentBestKValue(){
int i,j, epoch, higestSuccessPecntOfAllK=-1;
double sumTotalSuccessPecent,higestSuccessPecnt=-100, higestSuccessPecntForCurrentK=-100,tempSuccessPecent;
char filename[25];
FILE *pSuccessPec;

//Input patterns, last one is always 1 as it is input for bias
double inputs[INPUTS_COUNT][NN];
double targets[INPUTS_COUNT];
double **centroids;
double **oldCentroids;
//int centroidsTargets[K]; //0 or 1 which ever greater
//int clusterPoints[K][2]; //0= Numbers of points; i=miss classified

idum = -time(0);
if(!loadTrainingData(inputs,targets))
return;

if(PRINT_OPTION==poTraining){
sprintf(filename,"SuccessPecentage.dat");
pSuccessPec = fopen(filename,"w");
}

for(int exp_k=EXPERIMENTAL_K_FROM;exp_k<=EXPERIMENTAL_K_TO;exp_k++){
centroids = (double **) malloc(exp_k * sizeof(double *));
oldCentroids = (double **) malloc(exp_k * sizeof(double *) );
for(i=0;i<exp_k;i++) {
centroids[i] = (double *) malloc(NN * sizeof(double));
oldCentroids[i] = (double *) malloc(NN * sizeof(double));
}

sumTotalSuccessPecent = 0;
higestSuccessPecntForCurrentK = -100;
for(int exp_trials=0;exp_trials<EXPERIMENTAL_TRIALS;exp_trials++){

for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
centroids[i][j] = getRand(&idum);
}
}

epoch=0;
do{//START: do ... while loop
for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
oldCentroids[i][j] = centroids[i][j];
}
}

for (i=0;i<(INPUTS_COUNT-NUMBER_OF_TESTING_DATA-TESTING_SAMPLE_SIZE);i++) {
moveTowards(centroids[getClosestCentroid(inputs[i],centroids,exp_k)], inputs[i]);
}
epoch++;
//printf("epoch: %d\n",epoch);

}while(!isOldAndNewSame(oldCentroids,centroids,exp_k) && epoch<MAX_TRAIN_ITERATION);
//END: do ... while loop
tempSuccessPecent = assignClusterTarget(inputs, centroids, targets,exp_k, TESTING_SAMPLE_SIZE);
if(higestSuccessPecntForCurrentK<tempSuccessPecent){
higestSuccessPecntForCurrentK = tempSuccessPecent;
}
if(higestSuccessPecnt<tempSuccessPecent){
higestSuccessPecnt = tempSuccessPecent;
higestSuccessPecntOfAllK = exp_k;
}
sumTotalSuccessPecent += tempSuccessPecent;
}//END: for ... EXPERIMENTAL_TRIALS
if(PRINT_OPTION==poTraining){
fprintf(pSuccessPec,"%d\t%f\t%f\n",exp_k, sumTotalSuccessPecent/EXPERIMENTAL_TRIALS,higestSuccessPecntForCurrentK);
}
for(i=0;i<exp_k;i++) {
free(centroids[i]);
free(oldCentroids[i]);
}
free(centroids);
free(oldCentroids);

printf("K = %d finished.\n", exp_k);
}//END: for ... EXPERIMENTAL_K_FROM


if(PRINT_OPTION==poTraining){
fclose(pSuccessPec);

sprintf(filename,"BestKandPecentage.dat");
FILE* pFile = fopen(filename,"w");
fprintf(pFile,"%d %f\n",higestSuccessPecntOfAllK,higestSuccessPecnt);
fclose(pFile);
}

/*printf("\n");
printf("\n\nPress any to to close the program...");
getche();*/
}




///<FunctionName>ExperimentBestK</FunctionName>
///<arg name='exp_K'>Value of K</arg>
///<Summary>Function to experiment with multi starting point of K and choose the best predicted K.</Summary>
///<Output>Training_Function.dat- Best K for clustring</Output>
void ExperimentBestK(int exp_k){
int i,j, epoch;
double successPecent, bestSuccessPercent=-100;
char filename[25];
FILE *pFile;

//Input patterns, last one is always 1 as it is input for bias
double inputs[INPUTS_COUNT][NN];
double targets[INPUTS_COUNT];
double **centroids;
double **oldCentroids;
double **bestCentroids;
//int centroidsTargets[K]; //0 or 1 which ever greater
//int clusterPoints[K][2]; //0= Numbers of points; i=miss classified

idum = -time(0);
if(!loadTrainingData(inputs,targets))
return;

if(PRINT_OPTION==poTraining){
sprintf(filename,"KMeanCentroids.dat");
pFile = fopen(filename,"w");
}

centroids = (double **) malloc(exp_k * sizeof(double *));
oldCentroids = (double **) malloc(exp_k * sizeof(double *) );
bestCentroids = (double **) malloc(exp_k * sizeof(double *) );
for(i=0;i<exp_k;i++) {
centroids[i] = (double *) malloc(NN * sizeof(double));
oldCentroids[i] = (double *) malloc(NN * sizeof(double));
bestCentroids[i] = (double *) malloc(NN * sizeof(double));
}

for(int exp_trials=0;exp_trials<EXPERIMENTAL_TRIALS_BEST_K;exp_trials++){

for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
centroids[i][j] = getRand(&idum);
}
}

epoch=0;
do{//START: do ... while loop
for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
oldCentroids[i][j] = centroids[i][j];
}
}

for (i=0;i<(INPUTS_COUNT-NUMBER_OF_TESTING_DATA);i++) {
moveTowards(centroids[getClosestCentroid(inputs[i],centroids,exp_k)], inputs[i]);
}
epoch++;
//printf("epoch: %d\n",epoch);

}while(!isOldAndNewSame(oldCentroids,centroids,exp_k) && epoch<MAX_TRAIN_ITERATION);
//END: do ... while loop
successPecent = assignClusterTarget(inputs, centroids, targets,exp_k,INPUTS_COUNT-NUMBER_OF_TESTING_DATA);
if(bestSuccessPercent<successPecent){
bestSuccessPercent = successPecent;
for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
bestCentroids[i][j] = centroids[i][j];
}
}

}
}//END: for ... EXPERIMENTAL_TRIALS

if(PRINT_OPTION==poTraining){

fprintf(pFile,"%d %f\n\n",exp_k, bestSuccessPercent);
//Printing Centroids
for(i=0;i<exp_k;i++){
for(j=0;j<NN;j++){
fprintf(pFile,"%f\t",bestCentroids[i][j]);
}
printf("\n");
}
}

successPecent = predictTarget(inputs, bestCentroids, targets,exp_k, INPUTS_COUNT-NUMBER_OF_TESTING_DATA);
if(PRINT_OPTION==poTraining){

fprintf(pFile,"\n%d %f",exp_k, successPecent);
printf("\n");
fclose(pFile);
}
for(i=0;i<exp_k;i++) {
free(centroids[i]);
free(oldCentroids[i]);
free(bestCentroids[i]);
}
free(centroids);
free(oldCentroids);
free(bestCentroids);

printf("\n");
printf("\n\nPress any to to close the program...");
getche();
}


///<FunctionName>getRand</FunctionName>
///<arg name='*idum'>Seed for random number generator</arg>
///<Summary>Function follows the Random Number Generators from Numerical Recipes in C, v2.0 (C) Copr. 1986-92 Numerical Recipes Software.</Summary>
///<Output>Random number between 0-1 range.</Output>
float getRand(long *idum)
{
int j;
long k;
static long iy=0;
static long iv[NTAB];
float temp;

if (*idum <= 0 || !iy) {
if (-(*idum) < 1) *idum = 1;
else *idum = -(*idum);
for (j=NTAB+7; j>=0; j--) {
k = (*idum)/IQ;
*idum=IA*(*idum-k*IQ)-IR*k;
if (*idum <0) *idum += IM;
if (j < NTAB) iv[j] = *idum;
}
iy = iv[0];
}
k = (*idum)/IQ;
*idum=IA*(*idum-k*IQ)-IR*k;
if (*idum < 0) *idum += IM;
j = iy/NDIV;
iy=iv[j];
iv[j] = *idum;
if ((temp=AM*iy) > RNMX) return RNMX;
else return temp;
}


///<FunctionName>loadTrainingData</FunctionName>
///<arg name='input[INPUTS_COUNT][NN]'>Training data</arg>
///<arg name='targets[INPUTS_COUNT]'>Target class for each input with same index number</arg>
///<Summary>Load the training data form txt file.</Summary>
///<Output>ture for success, false for fail.</Output>
bool loadTrainingData(double input[INPUTS_COUNT][NN], double targets[INPUTS_COUNT]){
int tempInt;
float tempFloat;
FILE *pData;

pData = fopen("ex1-data.txt","r");
if(pData==NULL) {
printf("Error: can't access data file.\n");
return false; // terminate with error
}

for(int i=0;i<INPUTS_COUNT;i++){
for(int j=0;j<NN;j++){
fscanf (pData, "%f", &tempFloat);
input[i][j]= tempFloat;
}
fscanf (pData, "%d", &tempInt);
targets[i]= tempInt;
}
fclose(pData);

return true;
}

///<FunctionName>getClosestCentroid</FunctionName>
///<arg name='datum[NN]'>One training data</arg>
///<arg name='** centroids'>K Centroids</arg>
///<Summary>Retrun the closest centroids of the given training data among K-centroids.</Summary>
///<Output>Index of closest centroids.</Output>
int getClosestCentroid(double datum[NN], double** centroids, int maxK) {
double distance;
double min = 99999999999;
int k = -1;
for (int i=0;i<maxK;i++) {
distance = getDistance(datum, centroids[i]);
if (distance < min) { k = i; min = distance; }
}
return(k);
}

///<FunctionName>getDistance</FunctionName>
///<arg name='datum[NN]'>One training data</arg>
///<arg name='centroid[NN]'>One Centroid</arg>
///<Summary>Retrun the distance betweent given training data and centroid.</Summary>
///<Output>Distance betweent given training data and centroid.</Output>
double getDistance(double datum[NN], double centroid[NN]) {
double distance = 0.0;
for (int i=0;i<NN;i++) {
distance += pow(datum[i] - centroid[i], 2);
}
return(sqrt(distance));
}


///<FunctionName>moveTowards</FunctionName>
///<arg name='centroid[NN]'>One Centroid</arg>
///<arg name='datum[NN]'>One training data</arg>
///<Summary>Change centroid postion according to distance between points and centroid and global learning rate.</Summary>
///<Output>Void. Node centroid changed by reference</Output>
void moveTowards(double centroid[NN], double datum[NN]) {
for (int i=0;i<NN;i++) {
centroid[i] += ((datum[i] - centroid[i]) * LEARNING_RATE); }
}

///<FunctionName>isOldAndNewSame</FunctionName>
///<arg name='** oldCentroids'>Old centroids</arg>
///<arg name='** centroids'>New centroids</arg>
///<arg name='maxK'>Number of K</arg>
///<Summary>Compare whether all old centroids and new centroids are same or not.</Summary>
///<Output>True if both centroids are same else false.</Output>
bool isOldAndNewSame(double** oldCentroids, double** centroids, int maxK) {
for(int i=0;i<maxK;i++){
for(int j=0;j<NN;j++){
if(oldCentroids[i][j]!=centroids[i][j])
return false;
}
}
return true;
}


///<FunctionName>assignClusterTarget</FunctionName>
///<arg name='inputs[INPUTS_COUNT][NN]'>Input values</arg>
///<arg name='** centroids'>Centroids</arg>
///<arg name='targets[INPUTS_COUNT]'>Targets</arg>
///<arg name='maxK'>Number of K</arg>
///<arg name='testingSampleSize'>Size of testing sample</arg>
///<Summary>Calculate the centroids targets.</Summary>
///<Output>Return the correct classified percent of the last TESTING_SAMPLE_SIZE data.</Output>
double assignClusterTarget(double inputs[INPUTS_COUNT][NN], double** centroids, double targets[INPUTS_COUNT], int maxK,int testingSampleSize){
int totalCorrectClassified=0, totalMissClassified=0;
int** centroidsTargetCounter; //0= Target 0; 1= Target 1
int* centroidsTarget; //Target as eighter 0 or 1

centroidsTargetCounter = (int**) malloc(sizeof(int *) * maxK);
centroidsTarget = (int*) malloc(sizeof(int) * maxK);
for(int i=0;i<maxK;i++) {
centroidsTargetCounter[i] = (int*) malloc(sizeof(double) * 2);
}

for(int i=0;i<maxK;i++){
for(int j=0;j<2;j++){
centroidsTargetCounter[i][j] = 0;
}
}
int testingAdjust;
if((INPUTS_COUNT-NUMBER_OF_TESTING_DATA)==testingSampleSize)
testingAdjust = 0;
else
testingAdjust = testingSampleSize;

for (int i=0;i<(INPUTS_COUNT-NUMBER_OF_TESTING_DATA-testingAdjust);i++) {
int nearestCentroid = getClosestCentroid(inputs[i],centroids,maxK);
if(targets[i]==0)
centroidsTargetCounter[nearestCentroid][0]++;
else
centroidsTargetCounter[nearestCentroid][1]++;
}

for (int i=0;i<maxK;i++) {
if(centroidsTargetCounter[i][0]>centroidsTargetCounter[i][1]){
centroidsTarget[i]=0;
}
else{
centroidsTarget[i]=1;
}
}

for (int i=(INPUTS_COUNT-NUMBER_OF_TESTING_DATA-testingSampleSize);i<(INPUTS_COUNT-NUMBER_OF_TESTING_DATA);i++) {
int nearestCentroid = getClosestCentroid(inputs[i],centroids, maxK);
if(targets[i]==centroidsTarget[nearestCentroid])
totalCorrectClassified++;
else
totalMissClassified++;
}

for(int i=0;i<maxK;i++) {
free(centroidsTargetCounter[i]);
}
free(centroidsTargetCounter);
free(centroidsTarget);

//printf("%f\n",( ((double)totalCorrectClassified/testingSampleSize) * 100));

return ( ( (((double)totalCorrectClassified)/testingSampleSize) * 100));
}

///<FunctionName>predictTarget</FunctionName>
///<arg name='inputs[INPUTS_COUNT][NN]'>Input values</arg>
///<arg name='** centroids'>Centroids</arg>
///<arg name='targets[INPUTS_COUNT]'>Targets</arg>
///<arg name='maxK'>Number of K</arg>
///<arg name='testingSampleSize'>Size of testing sample</arg>
///<Summary>Calculate the centroids targets.</Summary>
///<Output>Void.</Output>
double predictTarget(double inputs[INPUTS_COUNT][NN], double** centroids, double targets[INPUTS_COUNT], int maxK, int testingSampleSize){
int totalCorrectClassified=0, totalMissClassified=0;
int** centroidsTargetCounter; //0= Target 0; 1= Target 1
int* centroidsTarget; //Target as eighter 0 or 1

centroidsTargetCounter = (int**) malloc(sizeof(int *) * maxK);
centroidsTarget = (int*) malloc(sizeof(int) * maxK);
for(int i=0;i<maxK;i++) {
centroidsTargetCounter[i] = (int*) malloc(sizeof(double) * 2);
}

for(int i=0;i<maxK;i++){
for(int j=0;j<2;j++){
centroidsTargetCounter[i][j] = 0;
}
}

int testingAdjust;
if(INPUTS_COUNT-NUMBER_OF_TESTING_DATA==testingSampleSize)
testingAdjust = 0;
else
testingAdjust = testingSampleSize;

for (int i=0;i<INPUTS_COUNT-NUMBER_OF_TESTING_DATA-testingAdjust;i++) {
int nearestCentroid = getClosestCentroid(inputs[i],centroids,maxK);
if(targets[i]==0)
centroidsTargetCounter[nearestCentroid][0]++;
else
centroidsTargetCounter[nearestCentroid][1]++;
}

for (int i=0;i<maxK;i++) {
if(centroidsTargetCounter[i][0]>centroidsTargetCounter[i][1]){
centroidsTarget[i]=0;
totalCorrectClassified +=centroidsTargetCounter[i][0];
}
else{
centroidsTarget[i]=1;
totalCorrectClassified +=centroidsTargetCounter[i][1];
}
}


char filename[20];
FILE* pFile;
sprintf(filename,"Prediction.txt");
pFile = fopen(filename,"w");
for (int i=(INPUTS_COUNT-NUMBER_OF_TESTING_DATA);i<INPUTS_COUNT;i++) {
fprintf(pFile,"%d\n", centroidsTarget[getClosestCentroid(inputs[i],centroids, maxK)]);
}
fclose(pFile);

for(int i=0;i<maxK;i++) {
free(centroidsTargetCounter[i]);
}
free(centroidsTargetCounter);
free(centroidsTarget);

//printf("%f\n",( ((double)totalCorrectClassified/testingSampleSize) * 100));

return ( ((double)totalCorrectClassified/testingSampleSize) * 100);
}

No comments:

Post a Comment