您好,登錄后才能下訂單哦!
utf8編碼的數據可直接使用下面的代碼
最關鍵的步驟就是把字符串拆成單個字。UTF-8編碼的字,如果只有一個字節,則其最高二進制位為0;如果是多字節,其第一個字節從最高位開始,連續的二進制位值為1的個數決定了其編碼的字節數,其餘各字節均以10開頭。
UTF-8最初的設計最多可用到6個字節(RFC 3629之後的標準將其限制為最多4個字節)。
1字節 0xxxxxxx
2字節 110xxxxx 10xxxxxx
3字節 1110xxxx 10xxxxxx 10xxxxxx
4字節 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字節 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字節 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
其它就很簡單了
1、禁用字處理:禁用字拆分後,以第一個字為key保存
2、待測試字符串
a)、拆分成單字
b)、大寫轉小寫,字母和空格全角轉半角,去掉多餘空格(英文字母後最多只會有一個空格,中文後不應該有空格)
c)、遍歷字符串的所有字 檢測每個字對應的禁用字組是否在待測字符串中
#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <string.h>
#include <stdio.h>

// Sensitive-word ("disabled word") filter for UTF-8 text.
//
// Disabled words are indexed by their first character. A candidate string is
// split into characters, normalized (upper case -> lower case, full-width
// letters/space -> half-width), and then every distinct character of the
// candidate is used to look up the disabled words that start with it.
class CDisableWord
{
    struct SDisableWord
    {
        std::string str; // normalized text of the disabled word
    };
    typedef std::vector<SDisableWord> VDW;

private:
    // First character of a disabled word -> all disabled words starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    // Every normalized disabled word registered so far (duplicate detection).
    std::set<std::string> m_setAllDisableWord;
    // Character folding table: upper-case / full-width form -> canonical form.
    std::map<std::string, std::string> m_mapSpecialWord;

private:
    // Split a byte buffer into UTF-8 characters, appending them to output.
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);
    // Look up the canonical replacement of a character; NULL if it has none.
    const std::string* GetSpecialWord(const std::string& str);
    // Replace every character that has a canonical form with that form.
    void NormalizeWords(std::vector<std::string>& words);

public:
    CDisableWord();
    // Register one disabled word.
    void AddOneDisableWord(const std::string& str);
    // Return true when the text contains no disabled word.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

// Build the folding table: ASCII upper case, full-width upper case and
// full-width lower case letters all map to the ASCII lower-case letter, and
// the ideographic (full-width) space maps to an ASCII space.
CDisableWord::CDisableWord()
{
    for(int i = 0; i < 26; i++)
    {
        const std::string lower(1, static_cast<char>('a' + i));
        const std::string upper(1, static_cast<char>('A' + i));

        // U+FF21..U+FF3A (full-width A-Z) encode as EF BC A1..BA.
        std::string fullUpper("\xEF\xBC");
        fullUpper += static_cast<char>(static_cast<unsigned char>(0xA1 + i));

        // U+FF41..U+FF5A (full-width a-z) encode as EF BD 81..9A.
        std::string fullLower("\xEF\xBD");
        fullLower += static_cast<char>(static_cast<unsigned char>(0x81 + i));

        m_mapSpecialWord[fullUpper] = lower;
        m_mapSpecialWord[fullLower] = lower;
        m_mapSpecialWord[upper] = lower;
    }
    // U+3000 IDEOGRAPHIC SPACE encodes as E3 80 80.
    m_mapSpecialWord["\xE3\x80\x80"] = std::string(" ");
}

// Split pSrc[0, len) into UTF-8 characters and append each one to output.
//
// The expected length of a character is taken from its lead byte (the legacy
// 5- and 6-byte forms are still accepted). When the sequence is truncated or
// one of the following bytes is not a 10xxxxxx continuation byte, the lead
// byte is emitted on its own and scanning resumes at the very next byte, so a
// stray lead byte cannot swallow the characters that follow it and no input
// byte is ever dropped.
//
// Returns the total number of elements in output after the call.
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output)
{
    if(pSrc == NULL)
        return output.size();

    unsigned int i = 0;
    while(i < len)
    {
        const unsigned char lead = static_cast<unsigned char>(pSrc[i]);

        unsigned int wlen = 1;
        if(lead >= 0xFC)
            wlen = 6;
        else if(lead >= 0xF8)
            wlen = 5;
        else if(lead >= 0xF0)
            wlen = 4;
        else if(lead >= 0xE0)
            wlen = 3;
        else if(lead >= 0xC0)
            wlen = 2;

        if(len - i < wlen)
        {
            // Truncated sequence: fall back to a single byte.
            wlen = 1;
        }
        else
        {
            for(unsigned int j = 1; j < wlen; ++j)
            {
                const unsigned char next = static_cast<unsigned char>(pSrc[i + j]);
                if((next & 0xC0) != 0x80)
                {
                    // Not a continuation byte: resynchronise on the next byte.
                    wlen = 1;
                    break;
                }
            }
        }

        output.push_back(std::string(pSrc + i, wlen));
        i += wlen;
    }
    return output.size();
}

// Return a pointer to the canonical form of str, or NULL when str has no
// entry in the folding table. The pointer refers to storage owned by
// m_mapSpecialWord.
const std::string* CDisableWord::GetSpecialWord(const std::string& str)
{
    std::map<std::string, std::string>::iterator miter = m_mapSpecialWord.find(str);
    if(miter == m_mapSpecialWord.end())
        return NULL;
    return &(miter->second);
}

// Apply the folding table to every character in words, in place.
void CDisableWord::NormalizeWords(std::vector<std::string>& words)
{
    for(size_t i = 0; i < words.size(); ++i)
    {
        const std::string* pStr = GetSpecialWord(words[i]);
        if(pStr)
            words[i] = *pStr;
    }
}

// Register a disabled word.
//
// The word is normalized with the same folding that CheckStr applies to its
// input, so a word written with upper-case or full-width letters still
// matches. Empty words and words already registered are ignored.
void CDisableWord::AddOneDisableWord(const std::string& str)
{
    std::vector<std::string> words;
    if(SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words) == 0)
        return;
    NormalizeWords(words);

    std::string normalized;
    for(size_t i = 0; i < words.size(); ++i)
        normalized += words[i];

    // insert() reports whether the word was new; skip duplicates.
    if(!m_setAllDisableWord.insert(normalized).second)
        return;

    SDisableWord sdw;
    sdw.str = normalized;
    m_mapDisableWord[words[0]].push_back(sdw);
}

// Check a raw byte buffer. Returns true when no disabled word is found; an
// empty or NULL buffer is reported as clean.
bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if(pSrc == NULL || len == 0)
        return true;
    std::string str(pSrc, len);
    return CheckStr(str);
}

// Check a string. Returns true when no disabled word is found, false when at
// least one disabled word occurs in it.
//
// Two views of the normalized text are searched:
//   - the text as-is, and
//   - the text with redundant spaces removed: a space is kept only when it
//     directly follows a single-byte character (so Latin words keep one
//     separating space), while spaces after multi-byte characters and runs of
//     extra spaces are dropped.
bool CDisableWord::CheckStr(const std::string& str)
{
    if(str.empty())
        return true;

    std::vector<std::string> words;
    if(SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words) == 0)
        return false;
    NormalizeWords(words);

    std::string strSrc;      // normalized text
    std::string strDelSpace; // normalized text without redundant spaces
    std::set<std::string> uniqueWords;
    for(size_t i = 0; i < words.size(); ++i)
    {
        uniqueWords.insert(words[i]);
        strSrc += words[i];

        bool keep = true;
        if(i > 0 && words[i] == " ")
        {
            const std::string& prev = words[i - 1];
            keep = (prev != " " && prev.size() == 1);
        }
        if(keep)
            strDelSpace += words[i];
    }

    // Skip the second search when removing spaces changed nothing.
    const bool bSame = (strDelSpace == strSrc);

    std::set<std::string>::const_iterator siter = uniqueWords.begin();
    for(; siter != uniqueWords.end(); ++siter)
    {
        std::map<std::string, VDW>::const_iterator miter = m_mapDisableWord.find(*siter);
        if(miter == m_mapDisableWord.end())
            continue;

        const VDW& candidates = miter->second;
        for(size_t j = 0; j < candidates.size(); ++j)
        {
            const std::string& word = candidates[j].str;
            if(strSrc.find(word) != std::string::npos)
                return false;
            if(!bSame && strDelSpace.find(word) != std::string::npos)
                return false;
        }
    }
    return true;
}

// Demo driver: registers a few disabled words, then checks every line read
// from standard input until end of input.
int main()
{
    CDisableWord cdw;

    // Disabled words used by the demo.
    const std::string strdw[] = {"中文", "英文", "測試", "aabb", "測 試", "cc dd"};
    const size_t count = sizeof(strdw) / sizeof(strdw[0]);
    for(size_t i = 0; i < count; i++)
        cdw.AddOneDisableWord(strdw[i]);

    // std::getline handles lines of any length and the loop stops at EOF or
    // on a stream error instead of spinning forever.
    std::string line;
    while(std::getline(std::cin, line))
    {
        if(cdw.CheckStr(line))
            printf("收到:%s 沒有敏感字\n", line.c_str());
        else
            printf("收到:%s 敏感字 敏感字 敏感字\n", line.c_str());
    }
    return 0;
}
// g++ -g -o DisableWord DisableWord.cpp
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。