SimHash算法 可计算文本间的相似度,实现文本去重。文本相似度的计算,可以使用向量空间模型(VSM),即先对文本分词,提取特征,根据特征建立文本向量,把文本之间相似度的计算转化为特征向量距离的计算,如欧式距离、余弦夹角等。但这样做的缺点是复杂度会很高。
基于VSM的文本相似度计算,对于小量数据处理是可以的,但对于百度,google这样的搜索引擎,爬虫每天爬取的网页数目大得惊人,为了防止网页的重复,需要进行判重处理。对于这样的数据,VSM无能为力,google提出了SimHash算法,大大减少了计算量。具体步骤如下
算法流程如下
每篇文档算出签名S后,再计算两个签名的海明距离,海明距离是这两个二进制数异或后1的个数。根据经验,对于64位的SimHash,海明距离在3以内的都可以认为相似度比较高,认为这两个文档基本相同。以64位的签名来说,把它以每16位划分块,如果两个签名海明距离小于等于3,则至少有一个16位块完全相同。但是我们不知道具体是哪两个块相同,所以要进行枚举,这里只需要枚举4次。问题抽象如下
有10亿个不重复的64位01串,给定一个64位的01串,如何快速从这10亿个串中找出与给定串的海明距离小于等于3的字符串。
以8位数字签名01100011说明。划分为4块,每块两位,即01 10 00 11,然后在所有8位二进制签名中分别查找以01,10,00,11开始的签名。在所有的8位二进制签名中,按前两位进行索引,例如把01000000和01111111放在一个簇中,若找不到以01,10,00,11开始的二进制签名,说明海明距离肯定大于3,这样就直接可以判断这两个文本不相似,否则再比较后面的部分,大大减少计算量。
# include <iostream> #include <sstream> #include <algorithm> #include <string.h> #include <stdio.h> #include <string> #include <vector> using namespace std; struct Token { string token; double weight; }; // Murmurhash 32bit. uint32_t MurmurHash32(const void * key, int len, uint32_t seed) { const uint32_t m = 0x5bd1e995; const int r = 24; uint32_t h = seed ^ len; const unsigned char * data = (const unsigned char *)key; while (len >= 4) { uint32_t k = *reinterpret_cast<const uint32_t *>(data); k *= m; k ^= k >> r; k *= m; h *= m; h ^= k; data += 4; len -= 4; } switch (len) { // From the case beginning on the match, all // you need to perform back, so there is no break. case 3: h ^= data[2] << 16; case 2: h ^= data[1] << 8; case 1: h ^= data[0]; h *= m; } h ^= h >> 13; h *= m; h ^= h >> 15; return h; } // Murmur_hash 64bit. uint64_t MurmurHash64(const void *key, int len, uint32_t seed) { const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; uint64_t h = seed ^ (len * m); const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len >> 3); while (data != end) { uint64_t k = *data++; k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; } const unsigned char * data2 = (const unsigned char*)data; switch (len & 7) { // From the case beginning on the match, all // you need to perform back, so there is no break. case 7: h ^= (uint64_t)(data2[6]) << 48; case 6: h ^= (uint64_t)(data2[5]) << 40; case 5: h ^= (uint64_t)(data2[4]) << 32; case 4: h ^= (uint64_t)(data2[3]) << 24; case 3: h ^= (uint64_t)(data2[2]) << 16; case 2: h ^= (uint64_t)(data2[1]) << 8; case 1: h ^= (uint64_t)(data2[0]); h *= m; } h ^= h >> r; h *= m; h ^= h >> r; return h; } vector<string> Split (const string& sentence, const string& separator) { vector<string> res; string segment = sentence + separator; int size = segment.size(); for (int i = 0; i < size; i++) { int p = segment.find(separator, i); if (p < size) { string s = segment.substr(i, p - i); i = p + separator.size() - 1; res.push_back(s); } } return res; } vector<Token> WordSegment(const string& document) { vector<Token> tokens; const auto& sentence = Split(document, " "); for (const auto& str : sentence) { const auto& word_num = Split(str, "^"); Token wordInfo; wordInfo.token = word_num[0]; wordInfo.weight = atof(word_num[1].c_str()); tokens.push_back(wordInfo); } return tokens; } uint32_t SimHash32(const string& document) { double bits[32]; memset (bits, 0, sizeof(bits)); vector<Token> tokens = WordSegment(document); for (const auto& it : tokens) { uint32_t t = MurmurHash32((it.token).c_str(), (it.token). length (), 999999937); for (int i = 0; i < 32; i++) { uint32_t mask = (uint32_t)1 << i; if (t & mask) bits[i] += it.weight; else bits[i] -= it.weight; } } uint32_t hash = 0; for (int i = 0; i < 32; i++) { if (bits[i] > 0) hash |= (uint32_t)1 << i; } return hash; } uint64_t SimHash64(const string& document) { double bits[64]; memset(bits, 0, sizeof(bits)); vector<Token> tokens = WordSegment(document); for (const auto& it : tokens) { uint64_t t = MurmurHash64((it.token).c_str(), (it.token).length(), 999999937); for (int i = 0; i < 64; i++) { uint64_t mask = (uint64_t)1 << i; if (t & mask) bits[i] += it.weight; else bits[i] -= it.weight; } } uint64_t hash = 0; for (int i = 0; i < 64; i++) { if (bits[i] > 0) hash |= (uint64_t)1 << i; } return hash; } int HammingDist32(const uint32_t hash1, const uint32_t hash2) { uint32_t hash = hash1 ^ hash2; int d = 0; while (hash) { d++; hash &= hash - 1; } return d; } int HammingDist64(const uint64_t hash1, const uint64_t hash2) { uint64_t hash = hash1 ^ hash2; int d = 0; while (hash) { d++; hash &= hash - 1; } return d; } string Binary32(const uint32_t sign) { string str; uint32_t hash = sign; while (hash) { if (hash & 1) str += '1'; else str += '0'; hash >>= 1; } int rem = 32 - str.length(); while (rem--) str += '0'; reverse (str.begin(), str.end()); return str; } string Binary64(const uint64_t sign) { string str; uint64_t hash = sign; while (hash) { if (hash & 1) str += '1'; else str += '0'; hash >>= 1; } int rem = 64 - str.length(); while (rem--) str += '0'; reverse(str.begin(), str.end()); return str; } int main() { uint32_t _sign1 = SimHash32("中国^2 知乎^1 读者^2"); uint32_t _sign2 = SimHash32("中国^2 世界^1 读者^2"); int dist_32 = HammingDist32(_sign1, _sign2); std::cout << Binary32(_sign1) << std::endl; std::cout << Binary32(_sign2) << std::endl; std::cout << (32 - dist_32) / 32.0 << std::endl; uint64_t sign1 = SimHash64("中国^2 知乎^1 读者^2"); uint64_t sign2 = SimHash64("中国^2 知乎^1 读者^1"); int dist_64 = HammingDist64(sign1, sign2); std::cout << Binary64(sign1) << std::endl; std::cout << Binary64(sign2) << std::endl; std::cout << (64 - dist_64) / 64.0 << std::endl; return 0; }