K-Means聚类算法C++实现

2022-06-16 14:28:53 浏览数 (2)

说明:将10组列表分为4类

测试函数

代码语言:javascript复制
#include <iostream>
#include "KMeans.h"
using namespace std;

int main()
{
    double data[] = {
        0.0, 0.2, 0.4,
        0.3, 0.2, 0.4,
        0.4, 0.2, 0.4,
        0.5, 0.2, 0.4,
        5.0, 5.2, 8.4,
        6.0, 5.2, 7.4,
        4.0, 5.2, 4.4,
        10.3, 10.4, 10.5,
        10.1, 10.6, 10.7,
        11.3, 10.2, 10.9
    };

    const int size = 10; //样本数目
    const int dim = 3;   //三维矩阵
    const int cluster_num = 4; //分成4类

    KMeans* kmeans = new KMeans(dim,cluster_num);//初始化构造函数
    int* labels = new int[size];
    kmeans->SetInitMode(KMeans::InitUniform);
  kmeans->Cluster(data,size,labels);

  for(int i = 0; i < size;   i)
  {
      printf("%f, %f, %f belongs to %d clustern", data[i*dim 0], data[i*dim 1], data[i*dim 2], labels[i]);
  }

  delete []labels;
  delete kmeans;

    return 0;
}

函数声明

代码语言:javascript复制

#pragma once
#include <fstream>
#include <string.h>
#include <assert.h>

class KMeans
{
public:

  enum InitMode
  {
    InitRandom,
    InitManual,
    InitUniform,
  };

  KMeans(int dimNum = 1, int clusterNum = 1);
  ~KMeans();
  
  void SetInitMode(int i){ m_initMode = i; }

  void Init(double *data, int N);
  void Cluster(double *data, int N, int *Label);

private:
  int m_dimNum;
  int m_clusterNum;
  double** m_means;

  int m_initMode;
  int m_maxIterNum;  
  double m_endError;    

  double GetLabel(const double* x, int* label);
  double CalcDistance(const double* x, const double* u, int dimNum);
};

具体实现

代码语言:javascript复制

#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include "KMeans.h"
using namespace std;


KMeans::KMeans(int dimNum, int clusterNum)
{
  m_dimNum = dimNum;
  m_clusterNum = clusterNum;

  m_means = new double*[m_clusterNum];//1*4的数组 []

  for (int i = 0; i < m_clusterNum; i  )
  {
    m_means[i] = new double[m_dimNum];
    memset(m_means[i], 0, sizeof(double) * m_dimNum);
  }

  m_initMode = InitRandom;
  m_maxIterNum = 100;
  m_endError = 0.001;
}

KMeans::~KMeans()
{
  for (int i = 0; i < m_clusterNum; i  )
  {
    delete[] m_means[i];
  }
  delete[] m_means;
}


void KMeans::Cluster(double *data, int N, int *Label)
{
  int size = 0;
  size=N;

  assert(size >= m_clusterNum);

  // 初始化模型
  Init(data,N);

  // 循环递归
  double* x = new double[m_dimNum];  // 样本数据
  int label = -1;    // 类别索引
  double iterNum = 0;
  double lastCost = 0;
  double currCost = 0;
  int unchanged = 0;
  bool loop = true;
  int* counts = new int[m_clusterNum];
  double** next_means = new double*[m_clusterNum];  // 再次分类新的模型
  //初始化新模型
  for (int i = 0; i < m_clusterNum; i  )
  {
    next_means[i] = new double[m_dimNum];
  }
    
  //迭代次数
  while (loop)
  {
    memset(counts, 0, sizeof(int) * m_clusterNum);
    for (int i = 0; i < m_clusterNum; i  )
    {
      memset(next_means[i], 0, sizeof(double) * m_dimNum);
    }

    lastCost = currCost;
    currCost = 0;

    // 进行初次分类
    for (int i = 0; i < size; i  )
    {
      for(int j = 0; j < m_dimNum; j  )
        x[j] = data[i*m_dimNum j];

      currCost  = GetLabel(x, &label);

      counts[label]  ;
      for (int d = 0; d < m_dimNum; d  )
      {
        next_means[label][d]  = x[d];
      }
    }
    currCost /= size;//平均距离,为再次分类做准备

    // 重新分类
    for (int i = 0; i < m_clusterNum; i  )
    {
      if (counts[i] > 0)
      {
        for (int d = 0; d < m_dimNum; d  )
        {
          next_means[i][d] /= counts[i];
        }
        memcpy(m_means[i], next_means[i], sizeof(double) * m_dimNum);
      }
    }

    // 迭代终止条件
    iterNum  ;
    if (fabs(lastCost - currCost) < m_endError * lastCost)
    {
      unchanged  ;
    }
    if (iterNum >= m_maxIterNum || unchanged >= 3)
    {
      loop = false;
    }

    
    cout << "Iter: " << iterNum << ", Average Cost: " << currCost << endl;
  }

  // 跳出迭代,输出结果
  for (int i = 0; i < size; i  )
  {
    for(int j = 0; j < m_dimNum; j  )
        x[j] = data[i*m_dimNum j];
    GetLabel(x, &label);
    Label[i] = label;
  }
  delete[] counts;
  delete[] x;
  for (int i = 0; i < m_clusterNum; i  )
  {
    delete[] next_means[i];
  }
  delete[] next_means;
}

//初始化模型
void KMeans::Init(double *data, int N)
{
  int size = N;

  if (m_initMode ==  InitRandom)
  {
    int inteval = size / m_clusterNum;//每个分类里有几组
    double* sample = new double[m_dimNum];

    // Seed the random-number generator with current time
    srand((unsigned)time(NULL));

    for (int i = 0; i < m_clusterNum; i  )
    {
      int select = inteval * i   (inteval - 1) * rand() / RAND_MAX;
      for(int j = 0; j < m_dimNum; j  )
        sample[j] = data[select*m_dimNum j];
      memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
    }

    delete[] sample;
  }
  else if (m_initMode == InitUniform)
  {
    double* sample = new double[m_dimNum];

    for (int i = 0; i < m_clusterNum; i  )
    {
      int select = i * size / m_clusterNum;
      for(int j = 0; j < m_dimNum; j  )
        sample[j] = data[select*m_dimNum j];
      memcpy(m_means[i], sample, sizeof(double) * m_dimNum);
    }

    delete[] sample;
  }
  else if (m_initMode == InitManual)
  {
    // Do nothing
  }
}

//返回分类标准,以距离为准
double KMeans::GetLabel(const double* sample, int* label)
{
  double dist = -1;
  for (int i = 0; i < m_clusterNum; i  )
  {
    double temp = CalcDistance(sample, m_means[i], m_dimNum);
    if (temp < dist || dist == -1)
    {
      dist = temp;
      *label = i;
    }
  }
  return dist;
}

//计算街区距离
double KMeans::CalcDistance(const double* x, const double* u, int dimNum)
{
  double temp = 0;
  for (int d = 0; d < dimNum; d  )
  {
    temp  = (x[d] - u[d]) * (x[d] - u[d]);
  }
  return sqrt(temp);
}


博主:菜鸟程序员

初衷:学习资料,程序设计,视觉算法,求职经验,工作心得

0 人点赞