七叶笔记 » golang编程 » SimHash算法

SimHash算法

SimHash算法 可计算文本间的相似度,实现文本去重。文本相似度的计算,可以使用向量空间模型(VSM),即先对文本分词,提取特征,根据特征建立文本向量,把文本之间相似度的计算转化为特征向量距离的计算,如欧式距离、余弦夹角等。但这样做的缺点是复杂度会很高。

基于VSM的文本相似度计算,对于小量数据处理是可以的,但对于百度,google这样的搜索引擎,爬虫每天爬取的网页数目大得惊人,为了防止网页的重复,需要进行判重处理。对于这样的数据,VSM无能为力,google提出了SimHash算法,大大减少了计算量。具体步骤如下

算法流程如下

每篇文档算出签名S后,再计算两个签名的海明距离,海明距离是这两个二进制数异或后1的个数。根据经验,对于64位的SimHash,海明距离在3以内的都可以认为相似度比较高,认为这两个文档基本相同。以64位的签名来说,把它以每16位划分块,如果两个签名海明距离小于等于3,则至少有一个16位块完全相同。但是我们不知道具体是哪两个块相同,所以要进行枚举,这里只需要枚举4次。问题抽象如下

有10亿个不重复的64位01串,给定一个64位的01串,如何快速从这10亿个串中找出与给定串的海明距离小于等于3的字符串。

以8位数字签名01100011说明。划分为4块,每块两位,即01 10 00 11,然后在所有8位二进制签名中分别查找以01,10,00,11开始的签名。在所有的8位二进制签名中,按前两位进行索引,例如把01000000和01111111放在一个簇中,若找不到以01,10,00,11开始的二进制签名,说明海明距离肯定大于3,这样就直接可以判断这两个文本不相似,否则再比较后面的部分,大大减少计算量。

​# include  <iostream>
#include <sstream>
#include <algorithm>
#include <string.h>
#include <stdio.h>
#include <string>
#include <vector>
using namespace std;
struct Token {
 string token;
 double weight;
};
// Murmurhash 32bit.
uint32_t MurmurHash32(const void * key, int len, uint32_t seed) {
 const uint32_t m = 0x5bd1e995;
 const int r = 24;
 uint32_t h = seed ^ len;
 const unsigned char * data = (const unsigned char *)key;
 while (len >= 4) {
 uint32_t k = *reinterpret_cast<const uint32_t *>(data);
 k *= m;
 k ^= k >> r;
 k *= m;
 h *= m;
 h ^= k;
 data += 4;
 len -= 4;
 }
 switch (len) {
 // From the case beginning on the match, all
 // you need to perform back, so there is no break.
 case 3:
 h ^= data[2] << 16;
 case 2:
 h ^= data[1] << 8;
 case 1:
 h ^= data[0];
 h *= m;
 }
 h ^= h >> 13;
 h *= m;
 h ^= h >> 15;
 return h;
}
// Murmur_hash 64bit.
uint64_t MurmurHash64(const void *key, int len, uint32_t seed) {
 const uint64_t m = 0xc6a4a7935bd1e995;
 const int r = 47;
 uint64_t h = seed ^ (len * m);
 const uint64_t * data = (const uint64_t *)key;
 const uint64_t * end = data + (len >> 3);
 while (data != end) {
 uint64_t k = *data++;
 k *= m;
 k ^= k >> r;
 k *= m;
 h ^= k;
 h *= m;
 }
 const unsigned char * data2 = (const unsigned char*)data;
 switch (len & 7) {
 // From the case beginning on the match, all
 // you need to perform back, so there is no break.
 case 7:
 h ^= (uint64_t)(data2[6]) << 48;
 case 6:
 h ^= (uint64_t)(data2[5]) << 40;
 case 5:
 h ^= (uint64_t)(data2[4]) << 32;
 case 4:
 h ^= (uint64_t)(data2[3]) << 24;
 case 3:
 h ^= (uint64_t)(data2[2]) << 16;
 case 2:
 h ^= (uint64_t)(data2[1]) << 8;
 case 1:
 h ^= (uint64_t)(data2[0]);
 h *= m;
 }
 h ^= h >> r;
 h *= m;
 h ^= h >> r;
 return h;
}
vector<string>  Split (const string& sentence, const string& separator) {
 vector<string> res;
 string  segment  = sentence + separator;
 int size = segment.size();
 for (int i = 0; i < size; i++) {
 int p = segment.find(separator, i);
 if (p < size) {
 string s = segment.substr(i, p - i);
 i = p + separator.size() - 1;
 res.push_back(s);
 }
 }
 return res;
}
vector<Token> WordSegment(const string& document) {
 vector<Token> tokens;
 const auto& sentence = Split(document, " ");
 for (const auto& str : sentence) {
 const auto& word_num = Split(str, "^");
 Token wordInfo;
 wordInfo.token = word_num[0];
 wordInfo.weight = atof(word_num[1].c_str());
 tokens.push_back(wordInfo);
 }
 return tokens;
}
uint32_t SimHash32(const string& document) {
 double bits[32];
  memset (bits, 0, sizeof(bits));
 vector<Token> tokens = WordSegment(document);
 for (const auto& it : tokens) {
 uint32_t t = MurmurHash32((it.token).c_str(), (it.token). length (), 999999937);
 for (int i = 0; i < 32; i++) {
 uint32_t mask = (uint32_t)1 << i;
 if (t & mask)
 bits[i] += it.weight;
 else
 bits[i] -= it.weight;
 }
 }
 uint32_t hash = 0;
 for (int i = 0; i < 32; i++) {
 if (bits[i] > 0)
 hash |= (uint32_t)1 << i;
 }
 return hash;
}
uint64_t SimHash64(const string& document) {
 double bits[64];
 memset(bits, 0, sizeof(bits));
 vector<Token> tokens = WordSegment(document);
 for (const auto& it : tokens) {
 uint64_t t = MurmurHash64((it.token).c_str(), (it.token).length(), 999999937);
 for (int i = 0; i < 64; i++) {
 uint64_t mask = (uint64_t)1 << i;
 if (t & mask)
 bits[i] += it.weight;
 else
 bits[i] -= it.weight;
 }
 }
 uint64_t hash = 0;
 for (int i = 0; i < 64; i++) {
 if (bits[i] > 0)
 hash |= (uint64_t)1 << i;
 }
 return hash;
}
int HammingDist32(const uint32_t hash1, const uint32_t hash2) {
 uint32_t hash = hash1 ^ hash2;
 int d = 0;
 while (hash) {
 d++;
 hash &= hash - 1;
 }
 return d;
}
int HammingDist64(const uint64_t hash1, const uint64_t hash2) {
 uint64_t hash = hash1 ^ hash2;
 int d = 0;
 while (hash) {
 d++;
 hash &= hash - 1;
 }
 return d;
}
string Binary32(const uint32_t sign) {
 string str;
 uint32_t hash = sign;
 while (hash) {
 if (hash & 1)
 str += '1';
 else
 str += '0';
 hash >>= 1;
 }
 int rem = 32 - str.length();
 while (rem--)
 str += '0';
  reverse (str.begin(), str.end());
 return str;
}
string Binary64(const uint64_t sign) {
 string str;
 uint64_t hash = sign;
 while (hash) {
 if (hash & 1)
 str += '1';
 else
 str += '0';
 hash >>= 1;
 }
 int rem = 64 - str.length();
 while (rem--)
 str += '0';
 reverse(str.begin(), str.end());
 return str;
}
int main() {
 uint32_t _sign1 = SimHash32("中国^2 知乎^1 读者^2");
 uint32_t _sign2 = SimHash32("中国^2 世界^1 读者^2");
 int dist_32 = HammingDist32(_sign1, _sign2);
 std::cout << Binary32(_sign1) << std::endl;
 std::cout << Binary32(_sign2) << std::endl;
 std::cout << (32 - dist_32) / 32.0 << std::endl;
 uint64_t sign1 = SimHash64("中国^2 知乎^1 读者^2");
 uint64_t sign2 = SimHash64("中国^2 知乎^1 读者^1");
 int dist_64 = HammingDist64(sign1, sign2);
 std::cout << Binary64(sign1) << std::endl;
 std::cout << Binary64(sign2) << std::endl;
 std::cout << (64 - dist_64) / 64.0 << std::endl;
 return 0;
}
 

相关文章