禁用字檢測

發布時間：2020-07-13 18:18:22 來源：網絡閱讀：224 作者：zy20140925 欄目：編程語言

utf8編碼的數據可直接使用下面的代碼

最關鍵的步驟就是把字符串拆成單個字。UTF-8編碼的字，如果只有一個字節，則其最高二進制位為0；如果是多字節，其第一個字節從最高位開始連續為1的二進制位個數決定了該字占用的字節數，其余各字節均以10開頭。

早期的UTF-8定義最多可用到6個字節（RFC 3629之后的標準限制為最多4個字節）。

1字節 0xxxxxxx

2字節 110xxxxx 10xxxxxx

3字節 1110xxxx 10xxxxxx 10xxxxxx

4字節 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

5字節 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

6字節 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

其它就很簡單了

1、禁用字處理：禁用字拆分后，以第一個字為key保存

2、待測試字符串

a)、拆分成單字

b)、大寫轉小寫，字母和空格全角轉半角，去掉多余空格(英文字母后最多只會有一個空格，中文后不應該有空格)

c)、遍歷字符串的所有字，檢測每個字對應的禁用字組是否出現在待測字符串中

#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <string.h>
#include <stdio.h>

// Forbidden-word filter for UTF-8 text.
//
// Forbidden words are registered with AddOneDisableWord() and grouped by
// their first character; CheckStr() then reports whether a candidate string
// contains any registered word.
class CDisableWord
{
public:
    // Fills the character-conversion table (see m_mapSpecialWord).
    CDisableWord();

    // Registers one forbidden word.
    void AddOneDisableWord(const std::string& str);

    // Tests a string against the registered words.
    // Returns true when no forbidden word was found, false otherwise.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);

private:
    // One registered forbidden word.
    struct SDisableWord
    {
        std::string str;
    };
    typedef std::vector<SDisableWord> VDW;

    // Splits a byte string into single characters, appending them to
    // `output`; returns the resulting size of `output`.
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);

    // Returns the replacement registered for a character in the conversion
    // table, or NULL when the character has no replacement.
    const std::string* GetSpecialWord(const std::string& str);

    // First character of a forbidden word -> all words starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    // Set of forbidden words, consulted to skip duplicates.
    std::set<std::string> m_setAllDisableWord;
    // Character conversion table (special character -> replacement).
    std::map<std::string, std::string> m_mapSpecialWord;
};

// Builds the conversion table used to normalize text before matching:
// full-width capital letters, full-width small letters and ASCII capital
// letters all map to the ASCII small letter; the full-width (ideographic)
// space maps to an ASCII space.
CDisableWord::CDisableWord()
{
    // Full-width letters, index-aligned with the ASCII alphabets below.
    static const char* const kFullWidthUpper[26] = {"Ａ","Ｂ","Ｃ","Ｄ","Ｅ","Ｆ","Ｇ","Ｈ","Ｉ","Ｊ","Ｋ","Ｌ","Ｍ","Ｎ","Ｏ","Ｐ","Ｑ","Ｒ","Ｓ","Ｔ","Ｕ","Ｖ","Ｗ","Ｘ","Ｙ","Ｚ"};
    static const char* const kFullWidthLower[26] = {"ａ","ｂ","ｃ","ｄ","ｅ","ｆ","ｇ","ｈ","ｉ","ｊ","ｋ","ｌ","ｍ","ｎ","ｏ","ｐ","ｑ","ｒ","ｓ","ｔ","ｕ","ｖ","ｗ","ｘ","ｙ","ｚ"};
    const std::string asciiUpper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
    const std::string asciiLower("abcdefghijklmnopqrstuvwxyz");

    for(std::string::size_type idx = 0; idx < asciiLower.size(); ++idx)
    {
        // Every variant of a letter is replaced by its ASCII small form.
        const std::string target(1, asciiLower[idx]);

        m_mapSpecialWord[std::string(kFullWidthUpper[idx])] = target;
        m_mapSpecialWord[std::string(kFullWidthLower[idx])] = target;
        m_mapSpecialWord[std::string(1, asciiUpper[idx])] = target;
    }

    // Full-width space -> ASCII space.
    m_mapSpecialWord[std::string("　")] = " ";
}

// 把字符串拆分為單個字
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output)
{
    std::string ch;
    unsigned char byte;
    for(unsigned int i = 0, wlen = 0; i < len; i += wlen)
    {
        byte = (unsigned char)pSrc[i];
        if (byte >= 0xFC)
            wlen = 6;  
        else if (byte >= 0xF8)
            wlen = 5;
        else if (byte >= 0xF0)
            wlen = 4;
        else if (byte >= 0xE0)
            wlen = 3;
        else if (byte >= 0xC0)
            wlen = 2;
        else
            wlen = 1;

        if(i + wlen > len)
            break;

        ch.clear();
        for(unsigned int j = 0; j < wlen; j++)
            ch += pSrc[i+j];

        output.push_back(ch);
    }

    return output.size();
}

// 獲取特殊字對應的轉換字
const std::string* CDisableWord::GetSpecialWord(const std::string& str)
{
    std::map<std::string, std::string>::iterator miter = m_mapSpecialWord.find(str);
    if(miter == m_mapSpecialWord.end())
        return NULL;

    return &(miter->second);
}

void CDisableWord::AddOneDisableWord(const std::string& str)
{
    if(m_setAllDisableWord.find(str) != m_setAllDisableWord.end())
        return;

    std::vector<std::string> output;
    if(SplitWord(str.c_str(), str.size(), output) == 0 || output[0].size() == 0)
        return;

    std::map<std::string, VDW>::iterator miter = m_mapDisableWord.find(output[0]);
    if(miter == m_mapDisableWord.end())
    {
        m_mapDisableWord[output[0]] = VDW();
        miter = m_mapDisableWord.find(output[0]);
    }

    if(miter == m_mapDisableWord.end())
        return;

    SDisableWord sdw;
    sdw.str = str;
    miter->second.push_back(sdw);
}

bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if(len == 0)
        return true;

    std::string str(pSrc, len);
    return CheckStr(str);
}

bool CDisableWord::CheckStr(const std::string& str)
{
    if(str.size() == 0)
        return true;

    std::vector<std::string> output;
    if(SplitWord(str.c_str(), str.size(), output) == 0 || output[0].size() == 0)
        return false;

    // 大寫轉小寫  全角轉半角
    for(size_t i = 0; i < output.size(); ++i)
    {
        const std::string* pStr = GetSpecialWord(output[i]);
        if(pStr)
            output[i] = *pStr;
    }


    std::string StrSrc = "";        //轉換之后的字符串
    std::string StrDelSpace = "";	//刪除非英文之后的所有空格 所有大寫轉成小寫

    std::set<std::string> sonly;
    for(size_t i = 0; i < output.size(); ++i)
    {
        sonly.insert(output[i]);
        StrSrc += output[i];

        bool bnoadd = false;
        if(i > 0 && output[i] == " ")
        {
            bnoadd = true;
            for(int j = int(i - 1); j >= 0; --j)
            {
                if(output[j] == " ")
                    continue;

                if(output[j].size() > 1)
                    bnoadd = false;
                else if(j + 1 == int(i)) // 英文字符留一個空格
					bnoadd = false;

                break;
            }
        }

        if(!bnoadd)
       		StrDelSpace += output[i];
    }
    bool bSame = (StrDelSpace == StrSrc);

    std::set<std::string>::iterator siter = sonly.begin();
    for(; siter != sonly.end(); ++siter)
    {
        std::map<std::string, VDW>::iterator miter = m_mapDisableWord.find(*siter);
        if(miter == m_mapDisableWord.end())
            continue;

        for(size_t j = 0; j < miter->second.size(); ++j)
        {
            SDisableWord& sdw = miter->second[j];
            if(StrSrc.find(sdw.str) != std::string::npos)
                return false;
            else if(!bSame && StrDelSpace.find(sdw.str) != std::string::npos)
                return false;
        }
    }

    return true;
}

int main()
{
    CDisableWord cdw;

	// 設置禁用字
    std::string strdw[] = {"中文", "英文", "測試", "aabb", "測 試", "cc dd"};
    for(int i = 0; i < 6; i++)
        cdw.AddOneDisableWord(strdw[i]);

    while(1)
    {
        char s[51];
        std::cin.getline(s,50);

        if(cdw.CheckStr(s, strlen(s)))
            printf("收到:%s  沒有敏感字\n", s);
        else
            printf("收到:%s  敏感字 敏感字 敏感字\n", s);
    }

    return 0;
}

// g++ -g -o DisableWord DisableWord.cpp

向AI問一下細節

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

禁用字檢測

猜你喜歡

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

禁用字檢測

猜你喜歡

最新資訊

相關推薦

相關標簽