检测敏感词功能
今天策划给了我一个任务 —— 实现检测昵称中是否含有敏感词的功能,然后丢给我两个压缩包,我解压一看:
有的txt文件是一行一个词:
有的txt文件是按逗号分隔开:
不管是什么格式的总之量非常多,把我这辈子脏话都囊括了🥶
读取TXT文件数据
首先要把这些txt文件处理、转换成我们能用的格式。一开始我直接用for循环逐个查找昵称中是否含有敏感词,后来查资料时发现了DFA算法。
using System;
using System.Text;
using System.Collections.Generic;
using System.IO;
/// <summary>
/// Console demo that loads sensitive-word lists from text files, echoes them as
/// quoted, comma-terminated literals, and checks a nickname against the list
/// with a plain substring scan.
/// </summary>
public class Program
{
    static void Main()
    {
        // Word list stored one word per line.
        List<string> list = LineFeed();
        // Word list stored as comma-separated values (only echoed to the console here).
        Comma();
        string name = "假如这是敏感词";
        // Check whether the nickname contains a sensitive word.
        CensorText(name, list);
        Console.Read();
    }

    /// <summary>
    /// Prints a rejection message if <paramref name="text"/> contains any word of
    /// <paramref name="list"/> as a substring.
    /// </summary>
    /// <param name="text">The nickname to check.</param>
    /// <param name="list">The sensitive words to look for.</param>
    static void CensorText(string text, List<string> list)
    {
        if (string.IsNullOrEmpty(text) || list == null)
        {
            return;
        }

        foreach (string word in list)
        {
            // An empty entry would match every nickname, because
            // Contains("") is always true; skip it.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            if (text.Contains(word))
            {
                Console.WriteLine("昵称中存在无法使用的字符,请修改后再次确认");
                // One hit is enough; do not repeat the message for every matching word.
                return;
            }
        }
    }

    /// <summary>
    /// Loads the newline-separated word file, echoes its entries to the console
    /// and returns them.
    /// </summary>
    /// <returns>The words read from the file (empty if the file could not be read).</returns>
    static List<string> LineFeed()
    {
        string filePath = "E:\\C#Project\\PBZ\\反动词库.txt"; // Replace with the path of your txt file.
        List<string> lines = ReadTxtFile(filePath);
        Console.WriteLine(FormatAsQuotedList(lines));
        return lines;
    }

    /// <summary>
    /// Reads a text file and returns its non-blank lines with surrounding
    /// whitespace removed.
    /// </summary>
    /// <param name="filePath">Path of the file to read.</param>
    /// <returns>
    /// The lines read so far. On a read error the message is printed and whatever
    /// was collected before the error is returned.
    /// </returns>
    static List<string> ReadTxtFile(string filePath)
    {
        List<string> lines = new List<string>();
        try
        {
            using (StreamReader sr = new StreamReader(filePath))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    string trimmed = line.Trim();
                    // Blank lines carry no word and would later match everything.
                    if (trimmed.Length > 0)
                    {
                        lines.Add(trimmed);
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Best effort: report the problem and continue with what we have.
            Console.WriteLine("读取文件时出现错误: " + e.Message);
        }
        return lines;
    }

    /// <summary>
    /// Loads the comma-separated word file and echoes its entries to the console.
    /// </summary>
    static void Comma()
    {
        string filePath = "E:\\C#Project\\PBZ\\GFW补充词库.txt"; // Replace with the path of your txt file.
        List<string> elements = ReadTxtFile1(filePath);
        Console.WriteLine(FormatAsQuotedList(elements));
    }

    /// <summary>
    /// Reads a text file whose words are separated by commas and returns the
    /// individual words.
    /// </summary>
    /// <param name="filePath">Path of the file to read.</param>
    /// <returns>All non-empty, trimmed entries from every line of the file.</returns>
    static List<string> ReadTxtFile1(string filePath)
    {
        List<string> elements = new List<string>();
        // Process every line, not just the first one, so a file that wraps its
        // comma-separated list over several lines is read completely.
        foreach (string line in ReadTxtFile(filePath))
        {
            foreach (string element in line.Split(','))
            {
                string word = element.Trim();
                // "a,b," or "a,,b" would otherwise produce empty entries.
                if (word.Length > 0)
                {
                    elements.Add(word);
                }
            }
        }
        return elements;
    }

    /// <summary>
    /// Formats the words as <c>"w1","w2",...</c> (each entry quoted and followed by
    /// a comma) so the output can be pasted into a collection initializer.
    /// </summary>
    static string FormatAsQuotedList(List<string> words)
    {
        StringBuilder builder = new StringBuilder();
        foreach (string word in words)
        {
            builder.Append('"').Append(word).Append("\",");
        }
        return builder.ToString();
    }
}
这样处理过后的数据就是List<string>,也可以处理成数组或其他集合
我把处理出来的数据放在HashSet中
/// <summary>
/// Sensitive-word lexicon. The entries below are placeholders for the words
/// extracted from the txt files.
/// </summary>
public static HashSet<string> MaskWord = new HashSet<string>
{
    "敏感词1","敏感词2","敏感词3","..."
};
C#版DFA算法
然后通过C#版的DFA算法判断昵称中是否含有敏感词返回bool型放在工具类中使用:
java实现敏感词过滤(DFA算法) - AlanLee-Java - 博客园
敏感词管理(DFA算法实现)_dfa算法初始化map-CSDN博客
敏感词过滤-DFA算法-CSDN博客
/// <summary>
/// Checks whether <paramref name="text"/> contains any word of the
/// <c>MaskWord</c> lexicon as a substring.
/// </summary>
/// <param name="text">The text to check, e.g. a nickname. Null or empty text is treated as clean.</param>
/// <returns>
/// <c>true</c> if at least one lexicon word occurs in the text; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// The trie is rebuilt from <c>MaskWord</c> on every call, so changes to the
/// lexicon are always picked up. For a large lexicon that never changes,
/// callers may prefer to build the trie once and reuse it.
/// </remarks>
public static bool CheckSensitiveWords(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return false;
    }

    SensitiveWordNode root = BuildSensitiveWordTrie(MaskWord);

    // Walk the trie from every start offset. Restarting at the next offset
    // (instead of just resetting the state on a mismatch) is what lets
    // "aab" match the word "ab".
    for (int start = 0; start < text.Length; start++)
    {
        SensitiveWordNode node = root;
        for (int i = start; i < text.Length; i++)
        {
            if (!node.Next.TryGetValue(text[i], out node))
            {
                break; // No word continues with this character from this offset.
            }

            if (node.IsWordEnd)
            {
                return true; // A complete sensitive word ends here.
            }
        }
    }

    return false; // No sensitive word found.
}

/// <summary>
/// Builds a character trie containing every non-empty word of <paramref name="words"/>.
/// </summary>
private static SensitiveWordNode BuildSensitiveWordTrie(IEnumerable<string> words)
{
    SensitiveWordNode root = new SensitiveWordNode();
    if (words == null)
    {
        return root;
    }

    foreach (string word in words)
    {
        // An empty word would mark the root as terminal and match everything.
        if (string.IsNullOrEmpty(word))
        {
            continue;
        }

        SensitiveWordNode node = root;
        foreach (char c in word)
        {
            SensitiveWordNode next;
            if (!node.Next.TryGetValue(c, out next))
            {
                next = new SensitiveWordNode();
                node.Next[c] = next;
            }
            node = next;
        }

        // Only the node reached by the LAST character terminates a word.
        node.IsWordEnd = true;
    }

    return root;
}

/// <summary>
/// One trie state: outgoing transitions per character, plus a flag telling
/// whether a lexicon word ends at this state.
/// </summary>
private sealed class SensitiveWordNode
{
    public readonly Dictionary<char, SensitiveWordNode> Next = new Dictionary<char, SensitiveWordNode>();
    public bool IsWordEnd;
}