This post is part 5.4 of the Machine Learning series and covers density-based clustering: DBSCAN and MDCA, each with a reference implementation.
1. Density Clustering Methods

Density-based clustering treats a cluster as a connected region of high sample density separated from other clusters by regions of low density. Unlike centroid-based methods such as K-Means, it does not need the number of clusters in advance and can discover clusters of arbitrary shape.
2. DBSCAN
2.1 Key Concepts of the DBSCAN Algorithm

- eps-neighborhood: all points within distance eps of a given point.
- Core point: a point whose eps-neighborhood contains more than min_samples points (itself included).
- Border point: a non-core point that falls inside the eps-neighborhood of some core point.
- Noise point: a point that is neither a core point nor a border point.
- Density-reachable / density-connected: two points belong to the same cluster if they can be linked through a chain of core points whose eps-neighborhoods overlap.
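As a quick illustration of these definitions, here is a minimal self-contained sketch (the toy data and thresholds are my own choices, not from the post's experiments) that counts each point's eps-neighborhood and flags core-point candidates:

import numpy as np

# Toy data: three points close together and one far away (hypothetical example)
data = np.array([[0.0, 0.0], [0.0, 0.1], [0.1, 0.0], [5.0, 5.0]])
eps, min_samples = 0.5, 3
# Pairwise Euclidean distances
d = np.linalg.norm(data[:, None, :] - data[None, :, :], axis=2)
# Size of each point's eps-neighborhood (the point itself included)
neighbor_counts = (d < eps).sum(axis=1)
print(neighbor_counts >= min_samples)  # [ True  True  True False]: the last point is noise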
2.2 DBSCAN Algorithm Flow

1. For every point, collect all points within distance eps (its eps-neighborhood).
2. For every core point (strictly more than min_samples points in its neighborhood, itself included), merge the point and its whole neighborhood into one connected component.
3. Each connected component that contains a core point becomes a cluster; points belonging to no component are labeled as noise.
The implementation uses a union-find (disjoint-set) structure to merge density-connected points; see reference [1] for a detailed, illustrated explanation of union-find.
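For readers unfamiliar with the structure, here is a minimal self-contained union-find sketch (toy elements of my own choosing; the UnionFindSet class in the code below follows the same idea, plus path compression):

# Minimal union-find: each element starts as its own root
prev = {i: i for i in range(5)}

def find(x):
    while prev[x] != x:  # walk parent pointers up to the root
        x = prev[x]
    return x

def union(x, y):
    rx, ry = find(x), find(y)
    if rx != ry:
        prev[ry] = rx  # attach one root under the other

union(0, 1)
union(1, 2)
print(find(2) == find(0))  # True: 0, 1 and 2 now share a root
print(find(3) == find(0))  # False: 3 is still in its own set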
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import sklearn.datasets as ds
from sklearn.preprocessing import StandardScaler
import math


def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


def euler_distance(point1: np.ndarray, point2: list) -> float:
    """Euclidean distance between two points; works in any dimension."""
    distance = 0.0
    for a, b in zip(point1, point2):
        distance += math.pow(a - b, 2)
    return math.sqrt(distance)


class UnionFindSet(object):
    def __init__(self, data_num=0):
        self.data_num = data_num
        self.prev = {}
        self.init_uf()

    def init_uf(self):
        # Every element starts as its own root
        for i in range(self.data_num):
            self.prev[i] = i

    def union_data(self, x, y):
        x_root = self.find_data(x)
        y_root = self.find_data(y)
        if x_root != y_root:
            self.prev[y_root] = x_root

    def find_data(self, x):
        x_root = self.prev[x]
        while self.prev[x_root] != x_root:
            x_root = self.prev[x_root]
        # Path compression; optional, only speeds up later lookups
        x_copy = x
        while x_copy != x_root:
            temp = self.prev[x_copy]
            self.prev[x_copy] = x_root
            x_copy = temp
        return x_root

    def get_uf_set(self):
        return self.prev


class DBSCAN(object):
    def __init__(self, eps, min_samples):
        self.eps = eps
        self.min_samples = min_samples

    def get_cluster(self, data):
        eps = self.eps
        min_samples = self.min_samples
        data_num, feature_num = data.shape
        # sim[i] collects the indices of all points within eps of point i
        sim = [[] for i in range(data_num)]
        for i in range(data_num):
            for j in range(data_num):
                d = euler_distance(data[i, :], data[j, :])
                if d < eps:
                    sim[i].append(j)
        # Union every core point (more than min_samples neighbors) with its neighbors
        uf = UnionFindSet(data_num)
        for i in range(data_num):
            single_data = sim[i]
            if len(single_data) <= min_samples:
                continue
            for j in single_data:
                uf.union_data(i, j)
        # Assign a cluster id to each connected component that contains a core point
        cluster_cls = {}
        cls_num = 0
        for i in range(data_num):
            r = uf.find_data(i)
            if len(sim[i]) <= min_samples:
                continue
            if r not in cluster_cls:
                cls_num += 1
                cluster_cls[r] = cls_num
        # Label every point; label 0 is reserved for noise
        data_cls = [0] * data_num
        for i in range(data_num):
            r = uf.find_data(i)
            if r != i or len(sim[i]) > min_samples:
                data_cls[i] = cluster_cls[r]
        return data_cls


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    data = StandardScaler().fit_transform(data)
    params = ((0.19, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    eps, min_samples = params[0]
    dbscan = DBSCAN(eps, min_samples)
    data_cls = dbscan.get_cluster(data)
    y_hat = np.array(data_cls)
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    # plt.subplot(2, 3, i + 1)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    print(clrs)
    for k, clr in zip(y_unique, clrs):
        cur = (y_hat == k)
        if k == 0:
            # Noise points are drawn in black
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, c=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1],
                    s=30, c=clr, marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.title(r'$\epsilon$ = %.1f m = %d, clusters: %d' % (eps, min_samples, n_clusters), fontsize=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
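As a sanity check, the labels can be compared against scikit-learn's built-in implementation, run on the same standardized data produced by the script above. Note two small differences: sklearn labels noise as -1 (the code above reserves 0), and its core-point rule is "at least min_samples" rather than "more than min_samples", so the results may not match exactly.

from sklearn.cluster import DBSCAN as SklearnDBSCAN

sk = SklearnDBSCAN(eps=0.19, min_samples=5).fit(data)  # `data` from the script above
print(np.unique(sk.labels_))  # -1 marks noise; cluster ids start at 0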

3. MDCA
MDCA (Maximum Density Clustering Application) brings the density idea into partition-based clustering: cluster membership is judged by density rather than by initial centroids, so the method can determine the number of clusters automatically and discover clusters of arbitrary shape. MDCA generally keeps no noise set, which avoids discarding large numbers of objects when the threshold is chosen badly.
My implementation below reproduces the description in reference [2]. I am not sure it is entirely correct, since the clustering results differ from what I expected.
3.1 Core Idea
1. Maximum-density point: density can be expressed through the reciprocal of the sum of the K-nearest-neighbor distances,

   $\rho(x) = \dfrac{K}{\sum_{i=1}^{K} \mathrm{dist}(x, x_i)}$

   where $x_1, \dots, x_K$ are the K nearest neighbors of $x$ (the implementation below uses squared Euclidean distances in the sum). A small sketch of steps 1, 2 and 5 follows after this list.
2. Density curve: sort the whole dataset by the Euclidean distance of every object to the maximum-density point $x$; plotting density along this ordering yields the density curve.
3. Assign all objects before the first valley of the density curve to one cluster, then remove them from the dataset.
4. Repeat steps 1–3 until no remaining point has density above the threshold $\rho_0$ (in the code, the find_cls recursion stops once the maximum remaining density falls below density_min).
5. For two clusters $C_i$ and $C_j$, use the distance between their nearest samples as the inter-cluster distance:

   $d(C_i, C_j) = \min_{x \in C_i,\, y \in C_j} \mathrm{dist}(x, y)$

6. Merge two clusters when their inter-cluster distance falls below the threshold $d_0$.
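To make steps 1, 2 and 5 concrete, here is a small self-contained sketch (the toy data and the knn_density helper are my own, not from reference [2]):

import numpy as np

def knn_density(data, k=3):
    # Squared Euclidean distance matrix, as in the implementation below
    d = ((data[:, None, :] - data[None, :, :]) ** 2).sum(axis=2)
    # Sum of distances to the k nearest neighbors (the zero self-distance is skipped)
    knn_sum = np.sort(d, axis=1)[:, 1:k + 1].sum(axis=1)
    return k / knn_sum, d

data = np.random.RandomState(0).rand(20, 2)
p, d = knn_density(data)
root = np.argmax(p)               # step 1: maximum-density point
order = np.argsort(d[root])       # step 2: sort everything by distance to it
curve = p[order]                  # densities along that ordering = density curve

# Step 5: nearest-sample distance between two (here arbitrary) index sets
ci, cj = order[:10], order[10:]
print(d[np.ix_(ci, cj)].min())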
3.2 Code in Practice
import os
import numpy as np
import sklearn.datasets as ds
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


def find_cls(p, y_pred, distance, density_min, cls_num):
    # Work only on points that have not been assigned to a cluster yet
    candidate_indices = np.where(y_pred == 0)[0]
    cur_p = p[candidate_indices]
    # Row of the distance matrix for the current maximum-density point.
    # np.argmax(cur_p) indexes into the candidate subset, so it must be
    # mapped back to the original index before indexing `distance`
    # (the original code indexed `distance` with the subset position directly,
    # which is wrong on recursive calls).
    max_idx = candidate_indices[np.argmax(cur_p)]
    cur_distance = distance[max_idx, candidate_indices]
    # Sort the unassigned points by distance to the maximum-density point
    cur_distance_sort = np.argsort(cur_distance)
    cur_curve = cur_p[cur_distance_sort]                # density curve
    ori_indices = candidate_indices[cur_distance_sort]  # map back to original indices
    # plt.plot(cur_distance[cur_distance_sort], cur_curve)
    # plt.show()
    if np.max(cur_curve) > density_min:
        loc = np.where(cur_curve <= density_min)[0]
        loc_length = len(loc)
        if loc_length >= 2:
            # Find the first valley: two nearby low-density positions on the curve
            for i in range(loc_length - 1):
                if loc[i + 1] - loc[i] <= 2:
                    break
            # Everything before the valley becomes one cluster
            y_pred[ori_indices[:loc[i + 1]]] = cls_num
            # Recurse on the remaining unassigned points
            y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num + 1)
    return y_pred, cls_num


if __name__ == "__main__":
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.25, 0.25, 0.25, 0.25], random_state=0)
    # data = StandardScaler().fit_transform(data)
    max_data = np.max(data)
    min_data = np.min(data)
    data = (data - min_data) / (max_data - min_data)
    # Number of neighbors checked when estimating density
    k = 3
    # Minimum density threshold. Density is defined through the reciprocal of the
    # sum of K-nearest-neighbor distances; how to pick this value well is unclear to me.
    density_min = 10000
    # Minimum inter-cluster distance threshold
    distance_min = 0.008
    # Number of samples and features
    sample_num, feature_num = data.shape
    # Predicted labels; 0 means "not assigned yet"
    y_pred = np.zeros(sample_num)
    # p holds the density of every sample
    p = np.zeros(sample_num)
    # distance[i, j] is the squared Euclidean distance from sample i to sample j
    distance = np.zeros((sample_num, sample_num))
    # The k-nearest-neighbor definition of density works better here, since it rarely
    # yields many points with the same density. Counting points within a fixed radius
    # can split a single region into many clusters.
    for i in range(sample_num):
        distance[i, :] = np.sum((data[i, :] - data) ** 2, axis=1)
        # temp_sort = np.sort(distance[i, :])
        # p[i] = k / np.sum(distance[i, distance[i, :] <= temp_sort[k]])
        temp_sort = np.argsort(distance[i, :])
        p[i] = k / np.sum(distance[i, temp_sort[:k + 1]])
    cls_num = 1
    # Determine the initial clusters
    y_pred, cls_num = find_cls(p, y_pred, distance, density_min, cls_num)
    # Merge clusters whose nearest samples are close enough; re-scan after every merge
    while True:
        merged = False
        for i in range(1, cls_num):
            for j in range(i + 1, cls_num):
                a = np.where(y_pred == i)[0]
                b = np.where(y_pred == j)[0]
                if np.min(distance[a, :][:, b]) <= distance_min:
                    # Relabel cluster j as i and shift the ids above j down by one
                    y_pred[y_pred == j] = i
                    y_pred[y_pred > j] -= 1
                    cls_num -= 1
                    merged = True
                    break  # leave the inner for loop
            if merged:
                break  # leave the outer for loop and re-scan
        if not merged:  # a full pass without any merge: done
            break
    # Attach leftover scattered points to the nearest cluster (or keep them at 0)
    scatter_indices = np.where(y_pred == 0)[0]
    loc_indices = np.where(y_pred > 0)[0]
    scatter_cls_distance = distance[scatter_indices, :][:, loc_indices]
    scatter_cls_min_distance = np.min(scatter_cls_distance, axis=1)
    scatter_cls_min_indices = np.argmin(scatter_cls_distance, axis=1)
    y_pred[scatter_indices] = (y_pred[loc_indices[scatter_cls_min_indices]]
                               * (scatter_cls_min_distance <= distance_min))
    # Plot the result
    y_hat = np.array(y_pred)
    core_indices = np.zeros_like(y_hat, dtype=bool)
    core_indices[np.where(y_hat >= 1)] = True
    y_unique = np.unique(y_hat)
    n_clusters = y_unique.size - (1 if 0 in y_hat else 0)
    print(y_unique, 'number of clusters:', n_clusters)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
    for k, clr in zip(y_unique, clrs):
        cur = (y_hat == k)
        if k == 0:
            plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
            continue
        plt.scatter(data[cur, 0], data[cur, 1], s=15, c=clr, edgecolors='k')
        plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1],
                    s=30, c=clr, marker='o', edgecolors='k')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True, ls=':', color='#606060')
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
    print("end")
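Since the comments above admit that density_min is hard to choose, one small diagnostic (my own suggestion, not part of reference [2]) is to look at the distribution of the densities first and place the threshold at a visible gap or elbow:

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
data = rng.rand(500, 2)
# Same density definition as in the script above (k = 3, squared distances)
d = ((data[:, None, :] - data[None, :, :]) ** 2).sum(axis=2)
p = 3 / np.sort(d, axis=1)[:, 1:4].sum(axis=1)
plt.hist(p, bins=50)  # look for a gap or elbow to place density_min
plt.xlabel('density')
plt.ylabel('count')
plt.show()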
3.3 Experimental Results

3.4 Performance Comparison (personally I find MDCA hard to use; the parameters are very difficult to tune)
- Advantages:
  - Insensitive to noisy data
  - Does not depend on the choice of initial points
  - Can produce clusters of arbitrary shape
- Disadvantages:
  - The algorithm is complex and clustering is slow
  - The density threshold must be fixed before running
  - Distance metrics degrade on high-dimensional data (see the numeric sketch after this list)
  - Unsuitable when densities vary widely across the dataset, or when the overall density is essentially uniform
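The high-dimensional point can be made concrete with a quick numeric sketch (my own illustration): as the dimension grows, the nearest and farthest neighbors of a point become almost equidistant, which undermines distance-based density estimates.

import numpy as np

rng = np.random.RandomState(0)
for dim in (2, 10, 100, 1000):
    x = rng.rand(500, dim)
    d = np.linalg.norm(x[1:] - x[0], axis=1)  # distances from one reference point
    print(dim, round(d.min() / d.max(), 3))   # ratio climbs toward 1 with dimension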
References

1. 并查集详解——图文解说,简单易懂(转) (union-find explained with illustrations), mjiansun's blog, CSDN
2. 密度最大值聚类(MDCA) (maximum-density clustering), GitHub