编写正排索引
继续编写index.hpp
#pragma once
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "util.hpp"
namespace ns_index{
// One parsed document; this is the element type stored in the forward index.
struct DocInfo{
    std::string title;   // document title
    std::string content; // document content after the tags have been stripped
    std::string url;     // URL of the document on the official site
    uint64_t doc_id = 0; // document ID (named doc_id because that is the name the index code reads and writes)
};
// One node of an inverted list: ties a keyword to a document and carries a weight.
struct InvertedElem{
    uint64_t doc_id = 0; // ID of the document the keyword belongs to
    std::string word;    // the keyword itself
    int weight = 0;      // weight of this keyword/document pair
};
// Inverted list: the sequence of InvertedElem nodes kept for one keyword.
using InvertedList = std::vector<InvertedElem>;
// Search index made of two parts:
//   - a forward index:  doc_id  -> DocInfo
//   - an inverted index: keyword -> InvertedList
class Index{
private:
    // Forward index stored in a vector; an element's subscript is its document ID.
    std::vector<DocInfo> forward_index;
    // Inverted index: every keyword maps to one inverted list of InvertedElem nodes.
    std::unordered_map<std::string, InvertedList> inverted_index;
public:
    Index(){}
    ~Index(){}
public:
    // Look up a document by its ID.
    // Returns a pointer to the stored DocInfo, or nullptr (after logging to stderr)
    // when doc_id is not a valid subscript. The pointer refers to a vector element,
    // so it stays valid only until forward_index grows.
    DocInfo *GetForwardIdex(uint64_t doc_id)
    {
        if(doc_id >= forward_index.size()){
            std::cerr << "doc_id out range, error" << std::endl;
            return nullptr;
        }
        return &forward_index[doc_id];
    }
    // Look up the inverted list of a keyword.
    // Returns a pointer to the list, or nullptr (after logging to stderr) when the
    // keyword is not present in the inverted index.
    InvertedList *GetInvertedList(const std::string &word)
    {
        auto iter = inverted_index.find(word);
        if(iter == inverted_index.end()){
            std::cerr << word << " have no InvertedList" << std::endl;
            return nullptr;
        }
        return &(iter->second);
    }
    // Build the forward and inverted indexes from the tag-stripped, formatted
    // document file written by the parser (e.g. data/raw_html/raw.txt).
    // Each line of the file is treated as one document. Lines that cannot be turned
    // into a DocInfo are logged and skipped.
    // Returns false only when the file cannot be opened.
    bool BuildIndex(const std::string &input)
    {
        std::ifstream in(input, std::ios::in | std::ios::binary);
        if(!in.is_open()){
            std::cerr << "sorry, " << input << " open error" << std::endl;
            return false;
        }
        std::string line;
        while(std::getline(in, line)){
            DocInfo * doc = BuildForwardIndex(line);
            if(nullptr == doc){
                std::cerr << "build " << line << " error" << std::endl; //for debug
                continue;
            }
            BuildInvertedIndex(*doc);
        }
        return true;
    }
private:
    // Parse one line into a DocInfo and append it to the forward index.
    // Returns a pointer to the stored element, or nullptr when the line does not
    // split into exactly three fields (title, content, url).
    DocInfo *BuildForwardIndex(const std::string &line)
    {
        // 1. Split the line: line -> title, content, url.
        std::vector<std::string> results;
        // In-line field separator. It has to be a string literal: std::string cannot
        // be copy-initialised from a single char.
        const std::string sep = "\3";
        ns_util::StringUtil::CutString(line, &results, sep);
        if(results.size() != 3){
            return nullptr;
        }
        // 2. Fill a DocInfo from the three fields.
        DocInfo doc;
        doc.title = std::move(results[0]);   // title
        doc.content = std::move(results[1]); // content
        doc.url = std::move(results[2]);     // url
        // Take the ID before inserting: it equals the subscript the document will occupy.
        doc.doc_id = forward_index.size();
        // 3. Store it in the forward index and hand back the stored copy.
        forward_index.push_back(std::move(doc));
        return &forward_index.back();
    }
    // Build the inverted-index entries for one document.
    // TODO: not implemented yet; currently reports success without touching inverted_index.
    bool BuildInvertedIndex(const DocInfo &doc)
    {
        (void)doc; // unused until the inverted index is implemented
        return true;
    }
};
}
切分字符串
打开util.hpp
#pragma once
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <boost/algorithm/string.hpp>
namespace ns_util{
// File helpers.
class FileUtil{
public:
    // Read the file at file_path and append its text to *out, one line at a time.
    // std::getline strips each line terminator and nothing is added back, so the
    // lines end up concatenated. Existing content of *out is kept.
    // Returns false (after logging to stderr) when the file cannot be opened.
    static bool ReadFile(const std::string &file_path, std::string *out)
    {
        std::ifstream in(file_path, std::ios::in);
        if(!in.is_open()){
            std::cerr << "open file" << file_path << " error" << std::endl;
            return false;
        }
        std::string line;
        // std::getline returns a reference to the stream; the while condition needs a
        // bool, and the stream type provides that conversion, which yields false once
        // no further line can be read.
        while(std::getline(in, line)){
            *out += line;
        }
        in.close();
        return true;
    }
};
// String helpers.
class StringUtil{
public:
    // Split target on any character contained in sep and store the pieces in *out.
    // Token compression is switched on, so a run of adjacent separators is treated
    // as a single separator.
    static void CutString(const std::string &target, std::vector<std::string> *out, const std::string &sep)
    {
        const auto is_separator = boost::is_any_of(sep);
        boost::split(*out, target, is_separator, boost::token_compress_on);
    }
};
}
打开index.hpp
// Parse one line into a DocInfo and append it to the forward index.
// Returns a pointer to the stored element, or nullptr when the line does not
// split into exactly three fields (title, content, url).
DocInfo *BuildForwardIndex(const std::string &line)
{
    // 1. Split the line: line -> title, content, url.
    std::vector<std::string> results;
    // In-line field separator. It has to be a string literal: std::string cannot
    // be copy-initialised from a single char.
    const std::string sep = "\3";
    ns_util::StringUtil::CutString(line, &results, sep);
    if(results.size() != 3){
        return nullptr;
    }
    // 2. Fill a DocInfo from the three fields (moved, the pieces are not needed afterwards).
    DocInfo doc;
    doc.title = std::move(results[0]);   // title
    doc.content = std::move(results[1]); // content
    doc.url = std::move(results[2]);     // url
    // Take the ID before inserting: it equals the subscript the document will occupy.
    doc.doc_id = forward_index.size();
    // 3. Store it in the forward index and hand back the stored copy.
    forward_index.push_back(std::move(doc));
    return &forward_index.back();
}
// Build the inverted-index entries for one document.
// Input:  DocInfo{title, content, url, doc_id}
// Goal:   word -> inverted list
// The body is still a placeholder and always reports success.
bool BuildInvertedIndex(const DocInfo &doc)
{
    static_cast<void>(doc); // not used by the placeholder
    const bool built = true;
    return built;
}
倒排索引原理
// One node of an inverted list: ties a keyword to a document and carries a weight.
struct InvertedElem{
    uint64_t doc_id = 0; // ID of the document the keyword belongs to
    std::string word;    // the keyword itself
    int weight = 0;      // weight of this keyword/document pair
};
倒排拉链
using InvertedList = std::vector<InvertedElem>;
倒排索引一定是一个关键字和一组(个)InvertedElem对应[关键字和倒排拉链的映射关系]
// Inverted index: maps a keyword to its inverted list.
std::unordered_map<std::string, InvertedList> inverted_index;
我们拿到的文档内容
// One parsed document.
struct DocInfo{
    std::string title;   // document title
    std::string content; // document content after the tags have been stripped
    std::string url;     // URL of the document on the official site
    uint64_t doc_id = 0; // document ID; no need to understand it in depth for now
};
文档:
title : 吃葡萄
content: 吃葡萄不吐葡萄皮
url: http://XXXX
doc_id: 123
根据文档内容,形成一个或者多个InvertedElem(倒排拉链的节点)
因为当前我们是一个一个文档进行处理的,一个文档会包含多个“词”,都应当对应到当前的doc_id
1. 需要对 title 和 content 都先进行分词 -- 使用jieba分词
title: 吃/葡萄/吃葡萄(title_word)
content:吃/葡萄/不吐/葡萄皮(content_word)
词和文档的相关性(词频:在标题中出现的词,可以认为相关性更高一些,在内容中出现相关性低一些)
2. 词频统计
// Occurrence counters for one word of the document being processed.
struct word_cnt{
    int title_cnt = 0;   // how many times the word occurs in the title
    int content_cnt = 0; // how many times the word occurs in the content
};
// word -> its counters. The variable keeps the same name as the struct; C++ allows
// a variable to share (and from here on hide) a struct name, and the template
// argument in this declaration still refers to the struct.
std::unordered_map<std::string, word_cnt> word_cnt;
// Count every word produced by segmenting the title.
for(const auto &word : title_word){
    word_cnt[word].title_cnt++; // example title: 吃(1)/葡萄(1)/吃葡萄(1)
}
// Count every word produced by segmenting the content.
for(const auto &word : content_word){
    word_cnt[word].content_cnt++; // example content: 吃(1)/葡萄(1)/不吐(1)/葡萄皮(1)
}
知道了在文档中,标题和内容每个词出现的次数
3. 构建倒排拉链节点,自定义相关性
// Turn every counted word into one inverted-list node for the example document.
for(const auto &word : word_cnt){
    // This node records how one specific word relates to document 123. When several
    // different words point at the same document, the weight (relevance) decides
    // which result is shown first.
    InvertedElem elem;
    elem.doc_id = 123; // ID of the example document
    elem.word = word.first;
    // Custom relevance: an occurrence in the title counts ten times as much as one in the content.
    elem.weight = 10*word.second.title_cnt + word.second.content_cnt;
    inverted_index[word.first].push_back(elem);
}
jieba的安装和使用 -- cppjieba
克隆,复制网址链接
获取链接: git clone https://gitcode.net/mirrors/yanyiwu/cppjieba.git
如何使用:注意细节,我们需要自己执行:
cd cppjieba; cp -rf deps/limonp include/cppjieba/, 不然会编译报错
[whb@VM-0-3-centos test]$ ll
total 372
-rwxrwxr-x 1 whb whb 366424 Mar 28 12:11 a.out
drwxrwxr-x 8 whb whb 4096 Mar 28 12:01 cppjieba
-rw-rw-r-- 1 whb whb 856 Mar 28 12:11 demo.cpp
lrwxrwxrwx 1 whb whb 13 Mar 28 12:05 dict -> cppjieba/dict
lrwxrwxrwx 1 whb whb 16 Mar 28 12:06 inc -> cppjieba/include
-rw-rw-r-- 1 whb whb 365 Mar 28 10:21 test.cc
[whb@VM-0-3-centos test]$ cat demo.cpp
#include "inc/cppjieba/Jieba.hpp"
#include <iostream>
#include <string>
#include <vector>
using namespace std;
// Dictionary files handed to the cppjieba::Jieba constructor, relative to the working directory.
constexpr const char* DICT_PATH      = "./dict/jieba.dict.utf8";
constexpr const char* HMM_PATH       = "./dict/hmm_model.utf8";
constexpr const char* USER_DICT_PATH = "./dict/user.dict.utf8";
constexpr const char* IDF_PATH       = "./dict/idf.utf8";
constexpr const char* STOP_WORD_PATH = "./dict/stop_words.utf8";
// Demo: segment one sample sentence with cppjieba's CutForSearch and print the
// words joined by '/'.
int main(int argc, char** argv) {
    (void)argc; // command-line arguments are not used by the demo
    (void)argv;
    cppjieba::Jieba jieba(DICT_PATH,
                          HMM_PATH,
                          USER_DICT_PATH,
                          IDF_PATH,
                          STOP_WORD_PATH);
    vector<string> words;
    string s;
    // The sentence is written with standard CJK ideographs. The previous text used
    // look-alike Kangxi-radical code points for several characters, which are
    // different characters as far as the segmenter's dictionary is concerned.
    s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造";
    cout << s << endl;
    cout << "[demo] CutForSearch" << endl;
    jieba.CutForSearch(s, words);
    cout << limonp::Join(words.begin(), words.end(), "/") << endl;
    return EXIT_SUCCESS;
}
//编写倒排索引的代码
//注意:建立倒排索引的时候,要忽略大小写!!
引入jieba到项目
把jieba作为工具,写入到util.hpp中
ln -s ./test/cppjieba/include/cppjieba cppjieba
ln -s ./test/cppjieba/dict dict
如果想取消链接可以使用unlink
打开util.hpp
#pragma once
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <boost/algorithm/string.hpp>
#include "cppjieba/Jieba.hpp"
namespace ns_util{
// File helpers.
class FileUtil{
public:
    // Append the text of the file at file_path to *out, one line at a time.
    // std::getline strips each line terminator and nothing is added back, so the
    // lines end up concatenated. Existing content of *out is kept.
    // Returns false (after logging to stderr) when the file cannot be opened.
    static bool ReadFile(const std::string &file_path, std::string *out)
    {
        std::ifstream reader(file_path, std::ios::in);
        if(reader.is_open()){
            // std::getline returns a reference to the stream; used as the loop
            // condition, the stream converts to bool and yields false once no
            // further line can be read.
            for(std::string row; std::getline(reader, row); ){
                out->append(row);
            }
            reader.close();
            return true;
        }
        std::cerr << "open file" << file_path << " error" << std::endl;
        return false;
    }
};
// String helpers.
class StringUtil{
public:
    // Split target on any character contained in sep and store the pieces in *out.
    // Token compression is switched on, so a run of adjacent separators is treated
    // as a single separator.
    static void CutString(const std::string &target, std::vector<std::string> *out, const std::string &sep)
    {
        const auto is_separator = boost::is_any_of(sep);
        boost::split(*out, target, is_separator, boost::token_compress_on);
    }
};
// Dictionary files used to construct the shared cppjieba::Jieba tokenizer,
// relative to the working directory.
constexpr const char* DICT_PATH      = "./dict/jieba.dict.utf8";
constexpr const char* HMM_PATH       = "./dict/hmm_model.utf8";
constexpr const char* USER_DICT_PATH = "./dict/user.dict.utf8";
constexpr const char* IDF_PATH       = "./dict/idf.utf8";
constexpr const char* STOP_WORD_PATH = "./dict/stop_words.utf8";
class JiebaUtil{
private:
static cppjieba::Jieba jieba;
public:
static void CutString(const std::string &src, std::vector<std::string> *out)
{
jieba.CutForSearch(src, *out);
}
};
cppjieba::Jieba JiebaUtil::jieba(DICT_PATH, HMM_PATH, USER_DICT_PATH, IDF_PATH, STOP_WORD_PATH);
}