Fork me on GitHub

python实现基于MapReduce的K-means聚类算法

python实现基于MapReduce的K-means聚类算法

话不多说,直接上代码。
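本文没有提供数据源;如果想在本地试运行,可以把 sys.stdin 替换成本地数据文件(例如 open('data.txt')),或者直接用管道把数据传给脚本:cat data.txt | python mapper.py | sort | python combiner.py | python reducer.py。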

Mapper:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
import sys
import numpy as np


def ComputeDist(instance, center):
    """Return the Euclidean distance between a sample point and a center.

    Args:
        instance: String holding a Python literal for the point's
            coordinates, e.g. ``"(1.5, -2)"``.
        center: Sequence of numeric coordinates with the same length as the
            point.

    Returns:
        The Euclidean distance as a NumPy float scalar.
    """
    # SECURITY: eval() executes arbitrary expressions, and the mapper feeds it
    # raw stdin text. Only run this on trusted data, or switch to
    # ast.literal_eval.
    # The builtin ``float`` replaces the ``np.float`` alias, which was removed
    # in NumPy 1.24 and raises AttributeError there.
    point = np.array(eval(instance)).astype(float)
    target = np.array(center).astype(float)
    return np.sqrt(np.sum(np.square(point - target)))


def main(separator='\t'):
    """Assign every input point to its nearest initial center (mapper step).

    Reads one record per line from standard input, takes the first
    ``separator``-delimited field as the point, and prints
    ``<center index><separator><point>`` for each record.

    Args:
        separator: Field delimiter used for both input and output records.
    """
    # Initial centers; consider passing them in as a parameter instead.
    centers = [(1, 1), (-1, -1), (1, -1), (-1, 1)]
    for line in sys.stdin:
        # Preprocessing: adapt this to the real data layout.
        instance = line.split(separator)[0].strip('\n')
        if not instance.strip():
            continue  # nothing to evaluate on a blank line
        # The running minimum must restart for every point; carrying it over
        # from the previous record would compare this point against a stale
        # distance and could leave it attached to the wrong center.
        min_dis = float('inf')
        index = -1
        for i, center in enumerate(centers):
            dis = ComputeDist(instance, center)
            if dis < min_dis:
                min_dis = dis
                index = i
        print("%d%s%s" % (index, separator, instance))


# Run the mapper only when this file is executed as a script.
if __name__ == "__main__":
    main()

Combiner:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import sys
import numpy as np


def main(separator='\t'):
    """Pre-aggregate mapper output into per-center partial sums (combiner step).

    Reads ``<center index><separator><point>`` records from standard input and
    prints one ``<center index><separator><coordinate sum><separator><count>``
    record per center index, in the order the indexes were first seen.

    Args:
        separator: Field delimiter used for both input and output records.
    """
    sums = {}
    counts = {}
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue  # a blank line cannot be split into key and value
        # Center index and point, as emitted by the mapper.
        key, value = line.split(separator, 1)
        # SECURITY: eval() executes arbitrary expressions; only feed this
        # trusted mapper output, or switch to ast.literal_eval.
        # The builtin ``float`` replaces the ``np.float`` alias removed in
        # NumPy 1.24.
        point = np.array(eval(value)).astype(float)
        # Seeding the sum with the first point, not a fixed (0, 0) vector,
        # lets the combiner handle points of any dimension.
        sums[key] = sums[key] + point if key in sums else point
        counts[key] = counts.get(key, 0) + 1
    for key in sums:
        # tolist() yields plain Python floats, so the printed tuple keeps the
        # "(x, y)" text format; NumPy 2 scalars would otherwise print as
        # "np.float64(...)".
        coords = tuple(sums[key].tolist())
        print("%s%s%s%s%s" % (key, separator, str(coords), separator, counts[key]))


# Run the combiner only when this file is executed as a script.
if __name__ == '__main__':
    main()

Reducer:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/usr/bin/env python
import sys
import numpy as np


def main(separator='\t'):
    """Merge partial sums into new cluster centers (reducer step).

    Reads ``<center index><separator><coordinate sum><separator><count>``
    records from standard input and prints one
    ``<center index><separator>(c1, c2, ...)`` line per center index, with
    each coordinate of the mean rounded to two decimals.

    Args:
        separator: Field delimiter used for both input and output records.
    """
    totals = {}
    counts = {}
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue  # a blank line cannot be split into three fields
        key, value, num = line.split(separator, 2)
        # SECURITY: eval() executes arbitrary expressions; only feed this
        # trusted combiner output, or switch to ast.literal_eval.
        partial = np.array(eval(value)).astype(float)  # partial coordinate sum
        totals[key] = totals[key] + partial if key in totals else partial
        counts[key] = counts.get(key, 0) + int(num)  # partial point count
    # Iterate the dict rather than a per-line key list, so each center is
    # printed exactly once even when several partial records share a key.
    for key, total in totals.items():
        center = total / counts[key]
        coords = ', '.join('{:.2f}'.format(float(c)) for c in center)
        print('{}{}({})'.format(key, separator, coords))


# Run the reducer only when this file is executed as a script.
if __name__ == '__main__':
    main()
欢迎投喂,你的支持就是对我最好的回馈。