-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimhash.py
More file actions
91 lines (75 loc) · 2.31 KB
/
simhash.py
File metadata and controls
91 lines (75 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
@Time : 2018/5/16 20:44
@Author : fazhanzhang
@Function : 采用simhash实现两篇文章的相似度
"""
import jieba
import jieba.analyse
def cut_words_weights(content):
    """
    Extract keywords and integer weights from a text with jieba.

    :param content: text handed to ``jieba.analyse.extract_tags``
    :return: list of ``(keyword, weight)`` tuples; up to 20 keywords are
             requested (``topK=20``) and each weight is jieba's weight
             multiplied by 10 and truncated with ``int()``
    """
    # Optional: register a stop-word file before extracting keywords.
    # jieba.analyse.set_stop_words('path_of_stopwords')
    scored_keywords = jieba.analyse.extract_tags(content, topK=20, withWeight=True)

    weighted_tags = []
    for word, score in scored_keywords:
        # int() truncates, so any score below 0.1 ends up with weight 0.
        weighted_tags.append((word, int(score * 10)))
    return weighted_tags
def hash_keyword_add_weight(keyword_weight, len_hash=64):
    """
    Build a simhash fingerprint from weighted keywords.

    Each keyword is hashed, and for every bit position the keyword's weight
    is added when the hash bit is 1 and subtracted when it is 0. The sign of
    each accumulated total then decides the fingerprint bit.

    The keyword hash is taken from a SHA-256 digest rather than the built-in
    ``hash()``: string hashes from ``hash()`` are salted per interpreter
    process, so fingerprints would differ from one run to the next and could
    not be stored or compared across processes.

    :param keyword_weight: iterable of ``(keyword, weight)`` pairs
    :param len_hash: number of bits in the fingerprint (default 64); the
                     low-order ``len_hash`` bits of the digest are used, and
                     positions beyond the 256 digest bits are treated as 0
    :return: string of ``len_hash`` characters, each ``"1"`` or ``"0"``,
             most significant bit first
    """
    import hashlib

    totals = [0] * len_hash
    for keyword, weight in keyword_weight:
        digest = hashlib.sha256(str(keyword).encode("utf-8")).digest()
        fingerprint = int.from_bytes(digest, "big")
        for position in range(len_hash):
            # position 0 is the most significant of the len_hash bits kept.
            bit = (fingerprint >> (len_hash - 1 - position)) & 1
            totals[position] += weight if bit else -weight

    # A total of exactly 0 (including the no-keyword case) maps to "1".
    return "".join("1" if total >= 0 else "0" for total in totals)
def cal_hamming_distance(hash_file1, hash_file2):
    """
    Count the positions at which two fingerprints differ (Hamming distance).

    Iteration runs over the length of ``hash_file1``; ``hash_file2`` is
    indexed at the same positions.

    :param hash_file1: first fingerprint (indexable sequence)
    :param hash_file2: second fingerprint (indexable sequence)
    :return: number of differing positions
    """
    return sum(
        1
        for position, symbol in enumerate(hash_file1)
        if symbol != hash_file2[position]
    )
def run_simhash(str1, str2):
    """
    Entry point: compare two texts through their simhash fingerprints.

    Keywords are extracted from both texts, each keyword list is turned into
    a fingerprint, and the Hamming distance between the two fingerprints is
    returned.

    :param str1: first text
    :param str2: second text
    :return: Hamming distance between the two fingerprints
    """
    keyword_lists = [cut_words_weights(text) for text in (str1, str2)]
    fingerprints = [hash_keyword_add_weight(tags) for tags in keyword_lists]
    return cal_hamming_distance(*fingerprints)
if __name__ == "__main__":
    # Demo: print the simhash Hamming distance between two similar commands.
    print(
        run_simhash(
            "turn off the light in sitting room",
            "turn off the smart light in the sitting room",
        )
    )
    # Alternative sample pair:
    # print(run_simhash("reduce the brightness of the light in the sitting room", "reduce the brightness of sitting room"))