C++ 使用正则表达式处理多语言文本,根据标点符号分割句子
支持中文、日文、英文三种语言的文本处理:根据标点符号将文本分割为句子,输出结果中每行为一个完整的句子。
#include <iostream>
#include <vector>
#include <string>
#include <regex>
#include<fstream>
#include <codecvt>
using namespace std;
// Set of punctuation characters that isPunctuation() searches when the
// language is LANGUAGE_ENG.
#define PUNCTUATION_ENG ",.!?;\""
// Set of punctuation characters that isPunctuation() searches for every
// language other than LANGUAGE_ENG.
// NOTE(review): only the ideographic full stop and the ideographic comma are
// CJK forms here; the remaining marks appear to be plain ASCII - confirm
// whether full-width forms were intended.
#define PUNCTUATION_CHN ",。!?;:、"
// Narrow regex pattern (raw string literal) matching one English punctuation
// mark; splitEngStr() uses it as the token delimiter.
#define REGEXP_SYMBOL_ENG (R"([,.!?;\"])")
// Wide-character regex pattern matching one punctuation mark; splitJapanStr()
// uses it as the token delimiter for both Chinese and Japanese input.
#define REGEXP_SYMBOL_CHN (L"([,。!?;:、])")
// Language selector passed to the splitting and punctuation helpers.
enum E_LANGUAGE_TYPE
{
    LANGUAGE_ENG,   // English text
    LANGUAGE_CHN,   // Chinese text
    LANGUAGE_JAPAN, // Japanese text
};
// Reports whether `str` is exactly one punctuation mark of the given language.
//
// str       - candidate token.
// lang_type - LANGUAGE_ENG selects PUNCTUATION_ENG; every other value selects
//             PUNCTUATION_CHN.
// Returns 1 when `str` is byte-for-byte equal to one complete character of the
// selected set, 0 otherwise (including for an empty token).
//
// The set is walked one UTF-8 character at a time and the whole token is
// compared against each character. A byte-wise search is not sufficient here:
// multi-byte characters that merely share a lead or continuation byte with a
// punctuation mark would be misclassified as punctuation.
// NOTE(review): assumes the string literals and `str` are UTF-8 encoded -
// confirm against the compiler's execution character set.
int isPunctuation(string str, E_LANGUAGE_TYPE lang_type)
{
    const string punct_set = (lang_type == LANGUAGE_ENG) ? PUNCTUATION_ENG : PUNCTUATION_CHN;
    if (str.empty())
    {
        return 0;
    }

    string::size_type pos = 0;
    while (pos < punct_set.size())
    {
        // Derive the length of the current UTF-8 sequence from its lead byte.
        const unsigned char lead = static_cast<unsigned char>(punct_set[pos]);
        string::size_type len = 1;
        if (lead >= 0xF0)
        {
            len = 4;
        }
        else if (lead >= 0xE0)
        {
            len = 3;
        }
        else if (lead >= 0xC0)
        {
            len = 2;
        }
        // Never read past the end of the set if it ends in a truncated sequence.
        if (len > punct_set.size() - pos)
        {
            len = punct_set.size() - pos;
        }

        if ((str.size() == len) && (punct_set.compare(pos, len, str) == 0))
        {
            return 1;
        }
        pos += len;
    }
    return 0;
}
// Strips leading and trailing blank characters from a line.
//
// input - text to trim.
// Returns a copy of `input` without leading/trailing spaces, tabs, line feeds
// and carriage returns; returns an empty string when `input` is empty or
// consists only of blank characters. Blank characters inside the text are kept.
string crop_blank_char(string input)
{
    // Space, tab, line feed, carriage return.
    static const char kBlankChars[] = " \t\n\r";

    // Positions are kept as size_type so that npos compares correctly and long
    // strings are not truncated.
    const string::size_type first = input.find_first_not_of(kBlankChars);
    if (first == string::npos)
    {
        return string();
    }
    const string::size_type last = input.find_last_not_of(kBlankChars);
    return input.substr(first, last - first + 1);
}
// Prepares the last text fragment of an English line for joining with the
// next line.
//
// input - fragment to adjust.
// Returns:
//   - `input` unchanged when it is empty;
//   - `input` without its final character when that character is '-'
//     (a word broken across lines is glued back together);
//   - otherwise `input` followed by a single space, so the next line's first
//     word does not run into this one.
// Only the final character is inspected: a hyphen elsewhere in the fragment
// (for example in "A-1") belongs to the text and must be kept.
string crop_hyphen_char(string input)
{
    if (input.empty())
    {
        return input;
    }
    if (input.back() == '-')
    {
        input.pop_back();
    }
    else
    {
        input += ' ';
    }
    return input;
}
// Converts a wide string to a UTF-8 encoded narrow string.
//
// ws - wide text to convert.
// Returns the UTF-8 bytes produced by wstring_convert; an empty input yields
// an empty string without constructing a converter.
string WString2String(wstring ws)
{
    if (ws.empty())
    {
        return string();
    }
    wstring_convert<codecvt_utf8<wchar_t>> utf8_codec;
    return utf8_codec.to_bytes(ws);
}
// Converts a UTF-8 encoded narrow string to a wide string.
//
// s - UTF-8 bytes to convert.
// Returns the wide text produced by wstring_convert; an empty input yields an
// empty wide string without constructing a converter.
wstring String2WString(string s)
{
    if (s.empty())
    {
        return wstring();
    }
    wstring_convert<codecvt_utf8<wchar_t>> utf8_codec;
    return utf8_codec.from_bytes(s);
}
// Splits wide text at the punctuation marks matched by REGEXP_SYMBOL_CHN.
//
// ws     - wide text to split.
// result - receives, in order of appearance, every non-blank piece converted
//          to a narrow string. The token iterator is asked for sub-match -1
//          (text between delimiters) and sub-match 0 (the delimiter itself),
//          so punctuation marks are emitted as separate entries.
// Always returns 0.
int splitJapanStr(wstring &ws, vector<string> &result)
{
    const wregex delimiter(REGEXP_SYMBOL_CHN);
    const wsregex_token_iterator finish;
    for (wsregex_token_iterator token(ws.begin(), ws.end(), delimiter, {-1,0}); token != finish; ++token)
    {
        const string piece = crop_blank_char(WString2String(*token));
        if (piece.empty())
        {
            continue; // nothing left after trimming
        }
        result.push_back(piece);
    }
    return 0;
}
// Splits one English line at the punctuation marks matched by
// REGEXP_SYMBOL_ENG.
//
// s      - line to split.
// result - receives, in order of appearance, every non-blank piece. Text
//          between delimiters (sub-match -1) and the delimiters themselves
//          (sub-match 0) are emitted as separate entries.
// The last token of the line gets special treatment: when it is text rather
// than punctuation it is passed through crop_hyphen_char() so it can be joined
// with the following line.
// Always returns 0.
int splitEngStr(string& s, vector<string> &result)
{
    const regex delimiter(REGEXP_SYMBOL_ENG);
    const sregex_token_iterator first(s.begin(), s.end(), delimiter, {-1,0});
    const sregex_token_iterator last;

    // First pass: count the tokens so the final one can be recognised.
    int total = 0;
    for (sregex_token_iterator probe = first; probe != last; ++probe)
    {
        ++total;
    }

    // Second pass: trim each token and collect the non-empty ones.
    int index = 0;
    for (sregex_token_iterator token = first; token != last; ++token, ++index)
    {
        const bool at_line_tail = (index == total - 1);
        if (at_line_tail)
        {
            printf("line tail\n");
        }

        string piece = crop_blank_char(token->str());
        if (piece.empty())
        {
            continue;
        }
        if (at_line_tail && !isPunctuation(piece, LANGUAGE_ENG))
        {
            piece = crop_hyphen_char(piece);
        }
        // The hyphen handling may leave nothing behind.
        if (!piece.empty())
        {
            result.push_back(piece);
        }
    }
    return 0;
}
// Trims a line and splits it into text fragments and punctuation marks using
// the splitter that matches the language.
//
// line      - raw line of text (taken by value; trimmed locally).
// result    - receives the fragments appended by the language-specific splitter.
// lang_type - LANGUAGE_ENG uses splitEngStr(); LANGUAGE_CHN and LANGUAGE_JAPAN
//             convert the line to a wide string and use splitJapanStr().
// Returns the status of the splitter that ran, or -1 when `lang_type` is not
// one of the handled values (in which case `result` is left untouched).
int splitStrByPunct(string line, vector<string> &result, E_LANGUAGE_TYPE lang_type)
{
    line = crop_blank_char(line);
    if (lang_type == LANGUAGE_ENG)
    {
        return splitEngStr(line, result);
    }
    if ((lang_type == LANGUAGE_JAPAN) || (lang_type == LANGUAGE_CHN))
    {
        wstring ws = String2WString(line);
        return splitJapanStr(ws, result);
    }
    // Every path must return a value: falling off the end of a non-void
    // function is undefined behaviour.
    return -1;
}
int main()
{
vector<string> g_sentence;
int i = 0;
int line_num = 0;
int whole_sentence_falg = 0;
E_LANGUAGE_TYPE language_type;
string lineStr;
ifstream file;
string fileName = "test.txt";
//getline(cin, fileName);
file.open(fileName.c_str(), ios::in);
if(!file.is_open())
{
printf("open %s failed\n", fileName.c_str());
return -1;
}
while(getline(file, lineStr))
{
vector<string> result;
cout << "line_" << line_num++ << ":" << lineStr << endl;
language_type = LANGUAGE_ENG;
splitStrByPunct(lineStr, result, language_type);
cout<<"split_count:"<<result.size()<<endl;
i = 0;
for (const auto& str : result)
{
cout << "i=" << i++ << "," << str << " len:"<< str.size() << endl;
if(isPunctuation(str, language_type)) //是否是标点符号
{
if(whole_sentence_falg == 1)//如果上一句已经匹配有标点符号 则丢掉此标点符号
{
continue;
}
else //如果上一句未匹配标点符号
{
if(!g_sentence.empty())
{
cout << "line:" << __LINE__ << "," << g_sentence.back() << "|" << str << endl;
g_sentence.back() = g_sentence.back() + str;//在上个句子末尾加上符号
whole_sentence_falg = 1;
}
}
}
else //非标点符号
{
if(whole_sentence_falg == 1)//上个句子已经匹配有符号 可以直接增加新的句子文本
{
g_sentence.push_back(str);
whole_sentence_falg = 0;
}
else//上个句子未匹配标点符号 继续在该句子后边追加文本(对于英文要插入一个空格)
{
if(!g_sentence.empty())
{
cout << "line:" << __LINE__ << "," << g_sentence.back() << "|" << str << endl;
g_sentence.back() = g_sentence.back() + str;
}
else
{
g_sentence.push_back(str);
}
}
}
}
}
i = 0;
for(auto &str : g_sentence)
{
cout << "i=" << i++ << "," << str << endl;
}
file.close();
return 0;
}