一个普通技术宅的点点滴滴

0%

学习总结第三十六天——余弦定理判断文章相似度

拜读过《数学之美》中的余弦定理应用的相关章节,并学习了符号表的实现以后,一直希望自己能够做一个能够判断文章相似度类似的东西。虽然基础有限,但是仍然取得了不错的效果。
分词相关对于我来说数学基础还是有些不够,所以我决定暂时不进行分词,对英文文章这种天生的分词进行处理就好。
首先是实现一个符号表,在C++中声明一个模板类,实现如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#ifndef SILIB_H_
#define STLIB_H_

template <typename Key, typename Value> class BinarySearchST
{
private:
Key *keys;
Value *values;
int st_size;
int count;
int resize(int capacity);
public:
BinarySearchST(int capacity);
int size();
bool isempty();
Value get(Key key);
int rank(Key key);
int put(Key key, Value val);
void del(Key key);
Key min();
Key max();
Key select(int k);
Key ceiling(Key key);
Key floor(Key key);
~BinarySearchST();
};
template <typename Key, typename Value>int BinarySearchST<Key,Value>::resize(int capacity)
{
if (capacity < count + 1)
{
return -1;//capacity is too small...
}
Key *keys_new = new Key[capacity];
Value *values_new = new Value[capacity];
if (!keys_new || !values_new)
{
return -2;//allocate memory error...
}
for (int i = 0;i < count;i++)
{
keys_new[i] = keys[i];
values_new[i] = values[i];
}
delete[] keys;
delete[] values;
keys = keys_new;
values = values_new;
st_size = capacity;
return 0;
}
template <typename Key, typename Value>BinarySearchST<Key,Value>::BinarySearchST(int capacity):st_size(capacity),count(0)
{
keys = new Key[capacity];
values = new Value[capacity];
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::size()
{
return count;
}
template <typename Key, typename Value>bool BinarySearchST<Key,Value>::isempty()
{
return !count;
}
template <typename Key, typename Value>Value BinarySearchST<Key,Value>::get(Key key)
{
if (isempty())
{
return 0;
}
int i = rank(key);
if (i < count && keys[i] == key)
{
return values[i];
}
return 0;
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::rank(Key key)
{
int lo = 0, hi = count-1;
while (lo <= hi)
{
int mid = lo + (hi-lo)/2;
if (keys[mid] > key)
{
hi = mid - 1;
}
else if (keys[mid] < key)
{
lo = mid + 1;
}
else
{
return mid;
}
}
return lo;//if not found, return the index of lower than search key.
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::put(Key key, Value val)
{
int i = rank(key);
if (i < count && keys[i] == key)
{
values[i] = val;
return 0;
}
if (count+1 > st_size)
{
int ret_code=resize(st_size*2);
if (ret_code != 0)
{
return ret_code;
}
}
for (int j = count; j > i;j--)
{
keys[j] = keys[j-1];
values[j] = values[j-1];
}
keys[i] = key;
values[i] = val;
count++;
return 0;
}

template <typename Key, typename Value>void BinarySearchST<Key,Value>::del(Key key)
{
int i = rank(key);
if (i < count && keys[i] == key)
{
for (int j = i;j < --count;j++)
{
keys[j] = keys[j+1];
keys[j] = keys[j+1];
}
}
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::min()
{
return keys[0];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::max()
{
return keys[count-1];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::select(int k)
{
return keys[k];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::ceiling(Key key)
{
int i = rank(key);
return keys[i];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::floor(Key key)
{
int i = rank(key);
if (keys[i] > 0 && keys[i] == key)
{
return keys[i];
}
return keys[i+1];
}

template <typename Key, typename Value> BinarySearchST<Key,Value>::~BinarySearchST()
{
delete[] keys;
delete[] values;
}

#endif

这里统计词频率时,Key为string,Value为int,我们可以设计一个函数,能够将文章读入符号表。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
BinarySearchST<string, int> *getCountedObj(std::ifstream& ifst)
{
BinarySearchST<string, int> *st = new BinarySearchST<string, int>(1);
while (!ifst.eof())
{
string word;
ifst >> word;
if (word != "")
{
st->put(word, st->get(word)+1);
}
}
return st;
}

这时可以得到两个文章向量,各维的坐标值对应了相应的词频。使用以下函数可以计算出这两个文章向量夹角的余弦值,即文章相似度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
double calculateSimialarity(BinarySearchST<string, int> *st1, BinarySearchST<string, int> *st2)
{
int i=0, j=0;
long long product = 0;
double abs1 = 0,abs2 = 0;
while (i<st1->size()&&j<st2->size())
{
string key1 = st1->select(i);
string key2 = st2->select(j);
if (key1 > key2)
{
abs2 += pow(st2->get(key2), 2);
j++;
}
else if (key1 < key2)
{
abs1 += pow(st1->get(key1), 2);
i++;
}
else
{
product += st1->get(key1) * st2->get(key2);
abs1 += pow(st1->get(key1), 2);
abs2 += pow(st2->get(key2), 2);
j++;
i++;
}
}
abs1 = sqrt(abs1);
abs2 = sqrt(abs2);
double cosed_degree = product/(abs1*abs2);
return cosed_degree;
}

使用这个网站这个中的小说作为实验,发现同为福尔摩斯的两篇小说基本上相似度在0.9以上,不同题材小说仅有0.6左右的相似度,较好地符合了预期,实验大成功!!