k-means Algorithm Idea and Code Implementation

Published: January 18, 2024
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time


# Compute the Euclidean distance from every sample to every centroid
def getDistance(dataSet, centroids, k):
    clalist = []
    for data in dataSet:
        diff = np.tile(data, (k, 1)) - centroids  # subtract: np.tile(a, (2, 1)) repeats a once along the x axis (unchanged, still [0, 1, 2]), then twice along the y axis, giving array([[0, 1, 2], [0, 1, 2]])
        squaredDiff = diff ** 2  # square
        squaredDist = np.sum(squaredDiff, axis=1)  # sum over each row (axis=1)
        distance = squaredDist ** 0.5  # square root
        clalist.append(distance)
    clalist = np.array(clalist)  # a len(dataSet) x k array of distances from each sample to each centroid
    return clalist
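
A quick way to see what getDistance returns (my own illustrative check, not part of the original post; demo_points and demo_centroids are made-up names): with 3 samples and 2 centroids the result is a 3 x 2 array of Euclidean distances.

# Illustrative check of getDistance (not from the original post)
demo_points = [[0, 0], [3, 4], [1, 1]]
demo_centroids = np.array([[0, 0], [3, 4]])
print(getDistance(demo_points, demo_centroids, 2))
# expected, up to float formatting:
# [[0.         5.        ]
#  [5.         0.        ]
#  [1.41421356 3.60555128]]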


# Assign each sample to its nearest centroid and recompute the centroids
def classify(dataSet, centroids, k):
    # distances from every sample to every centroid
    clalist = getDistance(dataSet, centroids, k)
    # group the samples and compute the new centroids
    minDistIndices = np.argmin(clalist, axis=1)  # axis=1: index of the row minimum, i.e. the nearest centroid for each sample
    newCentroids = pd.DataFrame(dataSet).groupby(
        minDistIndices).mean()  # DataFrame(dataSet) wraps the samples, groupby() groups them by nearest-centroid index, mean() averages each group
    newCentroids = newCentroids.values

    # how much each centroid moved
    changed = newCentroids - centroids
    return changed, newCentroids
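
One caveat worth flagging (my observation, not from the original post): pandas groupby only creates groups for centroid indices that actually receive samples, so if a centroid ends up with no points, newCentroids has fewer than k rows and the subtraction newCentroids - centroids breaks. A minimal sketch of a more defensive update, with classify_safe as a hypothetical name, keeps the previous centroid for an empty cluster:

# Sketch of a centroid update that tolerates empty clusters (classify_safe is not in the original post)
def classify_safe(dataSet, centroids, k):
    clalist = getDistance(dataSet, centroids, k)
    minDistIndices = np.argmin(clalist, axis=1)    # nearest centroid for each sample
    data = np.asarray(dataSet, dtype=float)
    old = np.asarray(centroids, dtype=float)
    newCentroids = old.copy()
    for i in range(k):
        members = data[minDistIndices == i]        # samples assigned to centroid i
        if len(members) > 0:
            newCentroids[i] = members.mean(axis=0) # average of the assigned samples
        # an empty cluster keeps its previous centroid
    changed = newCentroids - old
    return changed, newCentroids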

def step(dataSet, k, newCentroids):

    centroids = sorted(newCentroids.tolist())  # tolist() converts the array to a list; sorted() orders the centroids

    # build each cluster from the current centroids
    cluster = []
    clalist = getDistance(dataSet, centroids, k)  # Euclidean distances
    minDistIndices = np.argmin(clalist, axis=1)
    for i in range(k):
        cluster.append([])
    for i, j in enumerate(minDistIndices):  # enumerate() yields the index and the element together
        cluster[j].append(dataSet[i])

    print('Centroids: %s' % centroids)
    print('Clusters: %s' % cluster)
    for i in range(len(cluster)):
        for j in range(len(cluster[i])):
            plt.scatter(cluster[i][j][0], cluster[i][j][1], marker='o', color=colorSet[i], s=40, label='sample')
        #  marker shape, color, point size, label
    for i in range(len(centroids)):
        plt.scatter(centroids[i][0], centroids[i][1], marker='x', color=colorSet[i], s=50, label='centroid')
    plt.show()
    time.sleep(2)

# Cluster the data with k-means
def kmeans(dataSet, k):
    # randomly pick k samples as the initial centroids
    centroids = random.sample(dataSet, k)
    # or use a fixed initialization:
    # centroids = [[4, 9], [3, 3], [2, 1], [6, 3]]
    print("Initial k centroids:", centroids)

    # plot the raw samples, then wait for Enter
    for i in range(len(dataSet)):
        plt.scatter(dataSet[i][0], dataSet[i][1], marker='o', color='green', s=50, label='sample')
    plt.show()
    input()

    # plot the samples together with the initial centroids
    for i in range(len(dataSet)):
        plt.scatter(dataSet[i][0], dataSet[i][1], marker='o', color='green', s=50, label='sample')
    for i in range(len(centroids)):
        plt.scatter(centroids[i][0], centroids[i][1], marker='x', color=colorSet[i], s=100, label='centroid')
    plt.show()

    # iteration counter
    n = 1
    # update the centroids until none of them moves
    changed, newCentroids = classify(dataSet, centroids, k)
    while np.any(changed != 0):
        input()  # pause until Enter is pressed
        print("Round", n)
        n = n + 1
        changed, newCentroids = classify(dataSet, newCentroids, k)
        step(dataSet, k, newCentroids)

    centroids = sorted(newCentroids.tolist())  # tolist() converts the array to a list; sorted() orders the centroids

    # build each cluster from the final centroids
    cluster = []
    clalist = getDistance(dataSet, centroids, k)  # Euclidean distances
    minDistIndices = np.argmin(clalist, axis=1)
    for i in range(k):
        cluster.append([])
    for i, j in enumerate(minDistIndices):  # enumerate() yields the index and the element together
        cluster[j].append(dataSet[i])

    return centroids, cluster
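
As a small usage note (my addition, not from the original post): the centroids that kmeans returns can also be used to assign a brand-new point to its nearest cluster by reusing getDistance; nearestCluster below is a hypothetical helper name.

# Hypothetical helper: index of the nearest centroid for one new point
def nearestCluster(point, centroids):
    dists = getDistance([point], np.array(centroids), len(centroids))
    return int(np.argmin(dists))
# e.g. nearestCluster([5, 5], centroids) after kmeans() has run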


# Create the data set
def createDataSet():
    return [[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4], [10, 2], [4, 4], [3, 7], [4, 9], [5, 3], [2, 9],
            [5, 8], [7, 0], [8, 8], [3, 3]]


if __name__ == '__main__':
    dataset = createDataSet()
    colorSet = ['grey', 'red', 'blue', 'orange']
    centroids, cluster = kmeans(dataset, 4)
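
For comparison (my addition and only a sketch: it assumes scikit-learn is installed and is not part of the original script), the same dataset can be clustered with scikit-learn's KMeans; the centroids need not match exactly, since both implementations start from random initializations.

# Optional cross-check with scikit-learn (assumes scikit-learn is installed and that
# `dataset` from the main block above is in scope)
from sklearn.cluster import KMeans
km = KMeans(n_clusters=4, n_init=10).fit(np.array(dataset))
print("sklearn centroids:", km.cluster_centers_)
print("sklearn labels:", km.labels_)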

Source: https://blog.csdn.net/qq_35500719/article/details/135654726