日期:2014-05-18  浏览次数:20446 次

过滤敏感词组
1. 请教如何过滤敏感词组,
如把骂人的话操你妈,傻逼,或一些反动的词组过滤掉。
2. 谁有这类词的集合?

------解决方案--------------------
/// <summary>
/// Detects whether a text contains any word from a configured bad-word list.
/// Per-character bit tables reject most candidate substrings cheaply; only
/// candidates that pass every table are looked up in the hash set.
/// </summary>
public class BadWordsFilter
{
    // The tables are indexed directly by char value, so they need one slot for
    // every possible char: 0..char.MaxValue inclusive.
    private const int CharTableSize = char.MaxValue + 1;

    // Highest bit of a table byte; it is shared by every position/length >= 7.
    private const int MaxBit = 7;

    // All registered words of length >= 2 (single-char words live in charCheck).
    private readonly HashSet<string> hash = new HashSet<string>();

    // fastCheck[c] bit p is set when c occurs at position p of some word
    // (positions >= 7 all share bit 7).
    private readonly byte[] fastCheck = new byte[CharTableSize];

    // fastLength[c] bit (len - 2) is set when a word of length len starts with c
    // (lengths >= 9 all share bit 7).
    private readonly byte[] fastLength = new byte[CharTableSize];

    // charCheck[c] is true when the single character c is itself a word.
    private readonly BitArray charCheck = new BitArray(CharTableSize);

    // endCheck[c] is true when some word of length >= 2 ends with c.
    private readonly BitArray endCheck = new BitArray(CharTableSize);

    private int maxWordLength = 0;
    private int minWordLength = int.MaxValue;

    /// <summary>
    /// Creates an empty filter. Call <see cref="Init"/> to register words.
    /// </summary>
    public BadWordsFilter()
    {
    }

    /// <summary>
    /// Registers the given words. May be called more than once; words accumulate.
    /// </summary>
    /// <param name="badwords">Words to detect. Null or empty entries are ignored.</param>
    /// <exception cref="ArgumentNullException"><paramref name="badwords"/> is null.</exception>
    public void Init(string[] badwords)
    {
        if (badwords == null)
        {
            throw new ArgumentNullException("badwords");
        }

        foreach (string word in badwords)
        {
            // An empty word has no first/last character to index the tables with.
            if (string.IsNullOrEmpty(word))
            {
                continue;
            }

            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            // Record at which positions each character may legally appear.
            for (int i = 0; i < word.Length; i++)
            {
                fastCheck[word[i]] |= (byte)(1 << Math.Min(i, MaxBit));
            }

            if (word.Length == 1)
            {
                charCheck[word[0]] = true;
            }
            else
            {
                fastLength[word[0]] |= (byte)(1 << Math.Min(MaxBit, word.Length - 2));
                endCheck[word[word.Length - 1]] = true;
                hash.Add(word);
            }
        }
    }

    /// <summary>
    /// Not implemented: always throws.
    /// </summary>
    /// <exception cref="NotImplementedException">Always.</exception>
    public string Filter(string text, string mask)
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Reports whether <paramref name="text"/> contains at least one registered word.
    /// </summary>
    /// <param name="text">Text to scan. Null or empty text contains no words.</param>
    /// <returns>True when any registered word occurs as a substring of the text.</returns>
    public bool HasBadWord(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        // Every position is considered as a possible word start; skipping ahead
        // by more than one position can jump over a real match (e.g. "ab" in "aab").
        for (int index = 0; index < text.Length; index++)
        {
            char begin = text[index];

            // Bit 0 set means some word starts with this character.
            if ((fastCheck[begin] & 1) == 0)
            {
                continue;
            }

            if (charCheck[begin])
            {
                return true;
            }

            // A candidate text[index..index + j] has length j + 1, so j never
            // needs to exceed maxWordLength - 1 or run past the end of the text.
            int limit = Math.Min(maxWordLength - 1, text.Length - index - 1);

            for (int j = 1; j <= limit; j++)
            {
                char current = text[index + j];

                // No word has this character at position j: no longer candidate
                // starting at index can match either.
                if ((fastCheck[current] & (1 << Math.Min(j, MaxBit))) == 0)
                {
                    break;
                }

                // Only pay for Substring + hash lookup when the length, first
                // character and last character are all plausible.
                if (j + 1 >= minWordLength
                    && (fastLength[begin] & (1 << Math.Min(j - 1, MaxBit))) != 0
                    && endCheck[current]
                    && hash.Contains(text.Substring(index, j + 1)))
                {
                    return true;
                }
            }
        }

        return false;
    }
}
------解决方案--------------------
http://www.cnblogs.com/xingd/archive/2008/01/31/1060425.html .NET脏字过滤算法 

http://www.cnblogs.com/xingd/archive/2008/01/23/1050443.html .NET脏字过滤算法

http://www.cnblogs.com/goody9807/archive/2006/09/12/502094.html .NET脏字过滤算法

 

------解决方案--------------------
Mark,作个记号。
------解决方案--------------------
mark