K-均值聚类算法的MATLAB的实现。

kmeans_test.m

代码语言：javascript复制

%% (C) Copyright 2012. All rights reserved. Sotiris L Karavarsamis.
% Contact author at sokar@aiia.csd.auth.gr
% 
% This is an implementation of the k-means algorithm straight from the
% pseudocode description based on the book 'Introduction to Information
% Retrieval' by Manning, Schutze, Raghavan.

close all;

for i=1:10
    % clear workspace
    clear all;
    
    % set algorithm parameters
    TOL = 0.0004;
    ITER = 30;
    kappa = 4;
    
    % generate random data
    X = [1000*randn(1000,2)   1000; 2000*randn(1000,2)   5000];
    
    % run k-Means on random data
    tic;
    [C, I, iter] = myKmeans(X, kappa, ITER, TOL);
    toc
    
    % show number of iteration taken by k-means
    disp(['k-means instance took ' int2str(iter) ' iterations to complete']);
    
    % available colos for the points in the resulting clustering plot
    colors = {'red', 'green', 'blue', 'black'};
    
    % show plot of clustering
    figure;
    for i=1:kappa
       hold on, plot(X(find(I == i), 1), X(find(I == i), 2), '.', 'color', colors{i});
    end
    
    % wait key
    pause;
end

% pause and close all windows
pause;
close all;

myKmeans.m

代码语言：javascript复制

%% (C) Copyright 2012. All rights reserved. Sotiris L Karavarsamis.
%  Contact the author at <sokar@aiia.csd.auth.gr>
% 
%  This is my implementation on the k-means algorithm straight from the
%  pseudocode description of the very same algorithm on the book
%  'Introduction to Information Retrieval' by Manning, Schutze
%  and Raghavan.

function [C, I, iter] = myKmeans(X, K, maxIter, TOL)

% number of vectors in X
[vectors_num, dim] = size(X);

% compute a random permutation of all input vectors
R = randperm(vectors_num);

% construct indicator matrix (each entry corresponds to the cluster
% of each point in X)
I = zeros(vectors_num, 1);

% construct centers matrix
C = zeros(K, dim);

% take the first K points in the random permutation as the center sead
for k=1:K
    C(k,:) = X(R(k),:);
end

% iteration count
iter = 0;

% compute new clustering while the cumulative intracluster error in kept
% below the maximum allowed error, or the iterative process has not
% exceeded the maximum number of iterations permitted
while 1
    % find closest point
    for n=1:vectors_num
        % find closest center to current input point
        minIdx = 1;
        minVal = norm(X(n,:) - C(minIdx,:), 1);
        for j=1:K
            dist = norm(C(j,:) - X(n,:), 1);
            if dist < minVal
                minIdx = j;
                minVal = dist;
            end
        end
        
        % assign point to the closter center
        I(n) = minIdx;
    end
    
    % compute centers
    for k=1:K
        C(k, :) = sum(X(find(I == k), :));
        C(k, :) = C(k, :) / length(find(I == k));
    end
    
    % compute RSS error
    RSS_error = 0;
    for idx=1:vectors_num
        RSS_error = RSS_error   norm(X(idx, :) - C(I(idx),:), 2);
    end
    RSS_error = RSS_error / vectors_num;
    
    % increment iteration
    iter = iter   1;
    
    % check stopping criteria
    if 1/RSS_error < TOL
        break;
    end
    
    if iter > maxIter
        iter = iter - 1;
        break;
    end
end

disp(['k-means took ' int2str(iter) ' steps to converge']);

matlab 聚类算法

0 人点赞