学习总结第三十六天——余弦定理判断文章相似度

拜读过《数学之美》中的余弦定理应用的相关章节，并学习了符号表的实现以后，一直希望自己能够做一个能够判断文章相似度类似的东西。虽然基础有限，但是仍然取得了不错的效果。
分词相关对于我来说数学基础还是有些不够，所以我决定暂时不进行分词，对英文文章这种天生的分词进行处理就好。
首先是实现一个符号表，在C++中声明一个模板类，实现如下：

#ifndef SILIB_H_
#define STLIB_H_

template <typename Key, typename Value> class BinarySearchST
{
private:
  Key *keys;
  Value *values;
  int st_size;
  int count;
  int resize(int capacity);
public:
  BinarySearchST(int capacity);
  int size();
  bool isempty();
  Value get(Key key);
  int rank(Key key);
  int put(Key key, Value val);
  void del(Key key);
  Key min();
  Key max();
  Key select(int k);
  Key ceiling(Key key);
  Key floor(Key key);
  ~BinarySearchST();
};
template <typename Key, typename Value>int BinarySearchST<Key,Value>::resize(int capacity)
{
  if (capacity < count + 1)
  {
    return -1;//capacity is too small...
  }
  Key *keys_new = new Key[capacity];
  Value *values_new = new Value[capacity];
  if (!keys_new || !values_new)
  {
    return -2;//allocate memory error...
  }
  for (int i = 0;i < count;i++)
  {
    keys_new[i] = keys[i];
    values_new[i] = values[i];
  }
  delete[] keys;
  delete[] values;
  keys = keys_new;
  values = values_new;
  st_size = capacity;
  return 0;
}
template <typename Key, typename Value>BinarySearchST<Key,Value>::BinarySearchST(int capacity):st_size(capacity),count(0)
{
  keys = new Key[capacity];
  values = new Value[capacity];
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::size()
{
  return count;
}
template <typename Key, typename Value>bool BinarySearchST<Key,Value>::isempty()
{
  return !count;
}
template <typename Key, typename Value>Value BinarySearchST<Key,Value>::get(Key key)
{
  if (isempty())
  {
    return 0;
  }
  int i = rank(key);
  if (i < count && keys[i] == key)
  {
    return values[i];
  }
  return 0;
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::rank(Key key)
{
  int lo = 0, hi = count-1;
  while (lo <= hi)
  {
    int mid = lo + (hi-lo)/2;
    if (keys[mid] > key)
    {
      hi = mid - 1;
    }
    else if (keys[mid] < key)
    {
      lo = mid + 1;
    }
    else
    {
      return mid;
    }
  }
  return lo;//if not found, return the index of lower than search key.
}
template <typename Key, typename Value>int BinarySearchST<Key,Value>::put(Key key, Value val)
{
  int i = rank(key);
  if (i < count && keys[i] == key)
  {
    values[i] = val;
    return 0;
  }
  if (count+1 > st_size)
  {
    int ret_code=resize(st_size*2);
    if (ret_code != 0)
    {
      return ret_code;
    }
  }
  for (int j = count; j > i;j--)
  {
    keys[j] = keys[j-1];
    values[j] = values[j-1];
  }
  keys[i] = key;
  values[i] = val;
  count++;
  return 0;
}

template <typename Key, typename Value>void BinarySearchST<Key,Value>::del(Key key)
{
  int i = rank(key);
  if (i < count && keys[i] == key)
  {
    for (int j = i;j < --count;j++)
    {
      keys[j] = keys[j+1];
      keys[j] = keys[j+1];
    }
  }
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::min()
{
  return keys[0];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::max()
{
  return keys[count-1];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::select(int k)
{
  return keys[k];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::ceiling(Key key)
{
  int i = rank(key);
  return keys[i];
}
template <typename Key, typename Value>Key BinarySearchST<Key,Value>::floor(Key key)
{
  int i = rank(key);
  if (keys[i] > 0 && keys[i] == key)
  {
    return keys[i];
  }
  return keys[i+1];
}

template <typename Key, typename Value> BinarySearchST<Key,Value>::~BinarySearchST()
{
  delete[] keys;
  delete[] values;
}

#endif

这里统计词频率时，Key为string，Value为int，我们可以设计一个函数，能够将文章读入符号表。

BinarySearchST<string, int> *getCountedObj(std::ifstream& ifst)
{
  BinarySearchST<string, int> *st = new BinarySearchST<string, int>(1);
  while (!ifst.eof())
  {
    string word;
    ifst >> word;
    if (word != "")
    {
      st->put(word, st->get(word)+1);
    }
  }
  return st;
}

这时可以得到两个文章向量，各维的坐标值对应了相应的词频。使用以下函数可以计算出这两个文章向量夹角的余弦值，即文章相似度

double calculateSimialarity(BinarySearchST<string, int> *st1, BinarySearchST<string, int> *st2)
{
  int i=0, j=0;
  long long product = 0;
  double abs1 = 0,abs2 = 0;
  while (i<st1->size()&&j<st2->size())
  {
    string key1 = st1->select(i);
    string key2 = st2->select(j);
    if (key1 > key2)
    {
      abs2 += pow(st2->get(key2), 2);
      j++;
    }
    else if (key1 < key2)
    {
      abs1 +=  pow(st1->get(key1), 2);
      i++;
    }
    else
    {
      product += st1->get(key1) * st2->get(key2);
      abs1 += pow(st1->get(key1), 2);
      abs2 += pow(st2->get(key2), 2);
      j++;
      i++;
    }
  }
  abs1 = sqrt(abs1);
  abs2 = sqrt(abs2);
  double cosed_degree = product/(abs1*abs2);
  return cosed_degree;
}

使用这个网站和这个中的小说作为实验，发现同为福尔摩斯的两篇小说基本上相似度在0.9以上，不同题材小说仅有0.6左右的相似度，较好地符合了预期，实验大成功！！