UTF-8、UTF-16、UTF-32編碼相互轉換的方法

發布時間：2021-06-24 11:03:30 來源：億速云閱讀：1550 作者：chen 欄目：大數據

這篇文章主要介紹“UTF-8、UTF-16、UTF-32編碼相互轉換的方法”，在日常操作中，相信很多人在UTF-8、UTF-16、UTF-32編碼相互轉換的方法問題上存在疑惑，小編查閱了各式資料，整理出簡單好用的操作方法，希望對大家解答“UTF-8、UTF-16、UTF-32編碼相互轉換的方法”的疑惑有所幫助！接下來，請跟著小編一起來學習吧！

最近在考慮寫一個可以跨平臺的通用字符串類，首先需要搞定的就是編碼轉換問題。

vs默認保存代碼文件，使用的是本地code（中文即GBK，日文即Shift-JIS），也可以使用帶BOM的UTF-8。
gcc則是UTF-8，有無BOM均可（源代碼的字符集可以由參數-finput-charset指定）。
那么源代碼可以采用帶BOM的UTF-8來保存。而windows下的unicode是UTF-16編碼；Linux則使用UTF-8或UTF-32。因此不論在哪種系統里，程序在處理字符串時都需要考慮UTF編碼之間的相互轉換。

下面直接貼出算法代碼。算法上我借鑒了秦建輝（http://blog.csdn.net/jhqin）的UnicodeConverter，只是在外面增加了一些泛型處理，讓使用相對簡單。

核心算法（來自UnicodeConverter）：

namespace transform  
{  
    /* 
        UTF-32 to UTF-8 
    */  
   
    inline static size_t utf(uint32 src, uint8* des)  
    {  
        if (src == 0) return 0;  
   
        static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };  
        static const uint32 CODE_UP[] =  
        {  
            0x80,           // U+00000000 - U+0000007F  
            0x800,          // U+00000080 - U+000007FF  
            0x10000,        // U+00000800 - U+0000FFFF  
            0x200000,       // U+00010000 - U+001FFFFF  
            0x4000000,      // U+00200000 - U+03FFFFFF  
            0x80000000      // U+04000000 - U+7FFFFFFF  
        };  
   
        size_t i, len = sizeof(CODE_UP) / sizeof(uint32);  
        for(i = 0; i < len; ++i)  
            if (src < CODE_UP[i]) break;  
   
        if (i == len) return 0; // the src is invalid  
   
        len = i + 1;  
        if (des)  
        {  
            for(; i > 0; --i)  
            {  
                des[i] = static_cast<uint8>((src & 0x3F) | 0x80);  
                src >>= 6;  
            }  
            des[0] = static_cast<uint8>(src | PREFIX[len - 1]);  
        }  
        return len;  
    }  
   
    /* 
        UTF-8 to UTF-32 
    */  
   
    inline static size_t utf(const uint8* src, uint32& des)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        uint8 b = *(src++);  
   
        if (b < 0x80)  
        {  
            des = b;  
            return 1;  
        }  
   
        if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid  
   
        size_t len;  
   
        if (b < 0xE0)  
        {  
            des = b & 0x1F;  
            len = 2;  
        }  
        else  
        if (b < 0xF0)  
        {  
            des = b & 0x0F;  
            len = 3;  
        }  
        else  
        if (b < 0xF8)  
        {  
            des = b & 0x07;  
            len = 4;  
        }  
        else  
        if (b < 0xFC)  
        {  
            des = b & 0x03;  
            len = 5;  
        }  
        else  
        {  
            des = b & 0x01;  
            len = 6;  
        }  
   
        size_t i = 1;  
        for (; i < len; ++i)  
        {  
            b = *(src++);  
            if (b < 0x80 || b > 0xBF) return 0; // the src is invalid  
            des = (des << 6) + (b & 0x3F);  
        }  
        return len;  
    }  
   
    /* 
        UTF-32 to UTF-16 
    */  
   
    inline static size_t utf(uint32 src, uint16* des)  
    {  
        if (src == 0) return 0;  
   
        if (src <= 0xFFFF)  
        {  
            if (des) (*des) = static_cast<uint16>(src);  
            return 1;  
        }  
        else  
        if (src <= 0xEFFFF)  
        {  
            if (des)  
            {  
                des[0] = static_cast<uint16>(0xD800 + (src >> 10) - 0x40);  // high  
                des[1] = static_cast<uint16>(0xDC00 + (src & 0x03FF));      // low  
            }  
            return 2;  
        }  
        return 0;  
    }  
   
    /* 
        UTF-16 to UTF-32 
    */  
   
    inline static size_t utf(const uint16* src, uint32& des)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        uint16 w1 = src[0];  
        if (w1 >= 0xD800 && w1 <= 0xDFFF)  
        {  
            if (w1 < 0xDC00)  
            {  
                uint16 w2 = src[1];  
                if (w2 >= 0xDC00 && w2 <= 0xDFFF)  
                {  
                    des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);  
                    return 2;  
                }  
            }  
            return 0; // the src is invalid  
        }  
        else  
        {  
            des = w1;  
            return 1;  
        }  
    }  
}

上面這些算法都是針對單個字符的，并且是UTF-32和UTF-16/8之間的互轉。
通過上面的算法，可以得到UTF-16和UTF-8之間的單字符轉換算法：

namespace transform  
{  
    /* 
        UTF-16 to UTF-8 
    */  
   
    inline static size_t utf(uint16 src, uint8* des)  
    {  
        // make utf-16 to utf-32  
        uint32 tmp;  
        if (utf(&src, tmp) != 1) return 0;  
        // make utf-32 to utf-8  
        return utf(tmp, des);  
    }  
   
    /* 
        UTF-8 to UTF-16 
    */  
   
    inline static size_t utf(const uint8* src, uint16& des)  
    {  
        // make utf-8 to utf-32  
        uint32 tmp;  
        size_t len = utf(src, tmp);  
        if (len == 0) return 0;  
        // make utf-32 to utf-16  
        if (utf(tmp, &des) != 1) return 0;  
        return len;  
    }  
}

同樣，通過上面的單字符轉換算法，可以得到整個字符串的轉換算法：

namespace transform  
{  
    /* 
        UTF-X: string to string 
    */  
   
    template <typename T>  
    size_t utf(const uint32* src, T* des)   // UTF-32 to UTF-X(8/16)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        for(; *src; ++src)  
        {  
            size_t len = utf(*src, des);  
            if (len == 0) break;  
            if (des) des += len;  
            num += len;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
   
    template <typename T>  
    size_t utf(const T* src, uint32* des)   // UTF-X(8/16) to UTF-32  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        while(*src)  
        {  
            uint32 tmp;  
            size_t len = utf(src, tmp);  
            if (len == 0) break;  
            if (des)  
            {  
                (*des) = tmp;  
                ++des;  
            }  
            src += len;  
            num += 1;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
   
    template <typename T, typename U>  
    size_t utf(const T* src, U* des)    // UTF-X(8/16) to UTF-Y(16/8)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        while(*src)  
        {  
            // make utf-x to ucs4  
            uint32 tmp;  
            size_t len = utf(src, tmp);  
            if (len == 0) break;  
            src += len;  
            // make ucs4 to utf-y  
            len = utf(tmp, des);  
            if (len == 0) break;  
            if (des) des += len;  
            num += len;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
}

有了這些之后，我們已經可以完整的做UTF-8/16/32之間的相互轉換了，但是這些函數的使用仍然不是很方便。
比如我現在想把一個UTF-8字符串轉換成一個wchar_t*字符串，我得這樣寫：

const uint8* c = (uint8*)"こんにちわ、世界";  
size_t n = (sizeof(wchar_t) == 2) ?  
    transform::utf(c, (uint16*)0) :  
    transform::utf(c, (uint32*)0);  
wchar_t* s = new wchar_t[n];  
if (sizeof(wchar_t) == 2)  
    transform::utf(c, (uint16*)s);  
else  
    transform::utf(c, (uint32*)s);

這顯然是一件很抽搐的事情，因為wchar_t在不同的操作系統（windows/linux）里有不同的sizeof長度。
上面的類型強制轉換只是為了去適配合適的函數重載，當然我們也可以通過函數名來區分這些函數：比如分別叫utf8_to_utf32之類的。但是這改變不了寫if-else來適配長度的問題。

顯然這里可以通過泛型來讓算法更好用。
首先，需要被抽離出來的就是參數的類型大小和類型本身的依賴關系：

template <size_t X> struct utf_type;  
template <>         struct utf_type<1> { typedef uint8  type_t; };  
template <>         struct utf_type<2> { typedef uint16 type_t; };  
template <>         struct utf_type<4> { typedef uint32 type_t; };

然后，實現一個簡單的check算法，這樣后面就可以利用SFINAE的技巧篩選出合適的算法函數：

template <size_t X, typename T>  
struct check  
{  
    static const bool value =  
        ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);  
};

下面我們需要一個detail，即泛型適配的細節。從上面的算法函數參數中，我們可以很容易的觀察出一些規律：
只要是由大向小轉換（比如32->16，或16->8）的，其對外接口可以抽象成這兩種形式：

type_t utf(T src, U* des)  
type_t utf(const T* src, U* des)

而由小向大的轉換，則是下面這兩種形式：

type_t utf(const T* src, U& des)  
type_t utf(const T* src, U* des)

再加上第二個指針參數是可以給一個默認值（空指針）的，因此適配的泛型類就可以寫成這樣：

// Adapter selected by the code unit sizes X (source) and Y (destination).
// The two defaulted flags are derived from X and Y and pick the partial
// specialization; the primary template itself is never defined.
template <size_t X, size_t Y,
          bool SourceIsWider = (X > Y),
          bool SizesDiffer   = (X != Y)>
struct detail;
   
/* 
    UTF-X(32/16) to UTF-Y(16/8) 
*/  
   
template <size_t X, size_t Y>  
struct detail<X, Y, true, true>  
{  
    typedef typename utf_type<X>::type_t src_t;  
    typedef typename utf_type<Y>::type_t des_t;  
   
    template <typename T, typename U>  
    static typename enable_if<check<X, T>::value && check<Y, U>::value,  
    size_t>::type_t utf(T src, U* des)  
    {  
        return transform::utf((src_t)(src), (des_t*)(des));  
    }  
   
    template <typename T>  
    static typename enable_if<check<X, T>::value,  
    size_t>::type_t utf(T src)  
    {  
        return transform::utf((src_t)(src), (des_t*)(0));  
    }  
   
    template <typename T, typename U>  
    static typename enable_if<check<X, T>::value && check<Y, U>::value,  
    size_t>::type_t utf(const T* src, U* des)  
    {  
        return transform::utf((const src_t*)(src), (des_t*)(des));  
    }  
   
    template <typename T>  
    static typename enable_if<check<X, T>::value,  
    size_t>::type_t utf(const T* src)  
    {  
        return transform::utf((src_t)(src), (des_t*)(0));  
    }  
};  
   
/* 
    UTF-X(16/8) to UTF-Y(32/16) 
*/  
   
template <size_t X, size_t Y>  
struct detail<X, Y, false, true>  
{  
    typedef typename utf_type<X>::type_t src_t;  
    typedef typename utf_type<Y>::type_t des_t;  
   
    template <typename T, typename U>  
    static typename enable_if<check<X, T>::value && check<Y, U>::value,  
    size_t>::type_t utf(const T* src, U& des)  
    {  
        des_t tmp; // for disable the warning strict-aliasing from gcc 4.4  
        size_t ret = transform::utf((const src_t*)(src), tmp);  
        des = tmp;  
        return ret;  
    }  
   
    template <typename T, typename U>  
    static typename enable_if<check<X, T>::value && check<Y, U>::value,  
    size_t>::type_t utf(const T* src, U* des)  
    {  
        return transform::utf((const src_t*)(src), (des_t*)(des));  
    }  
   
    template <typename T>  
    static typename enable_if<check<X, T>::value,  
    size_t>::type_t utf(const T* src)  
    {  
        return transform::utf((const src_t*)(src), (des_t*)(0));  
    }  
};

最后的外覆類（wrapper）收尾就可以相當簡單：

template <typename T, typename U>  
struct converter  
    : detail<sizeof(T), sizeof(U)>  
{};

通過上面的detail，我們也可以很輕松的寫出一個通過指定8、16這些數字，來控制選擇哪些轉換算法的外覆模板。
有了converter，同類型的需求（指UTF-8轉wchar_t）就可以變得輕松愉快很多：

const char* c = "こんにちわ、世界";  
wstring s;  
size_t n; wchar_t w;  
while (!!(n = converter<char, wchar_t>::utf(c, w))) // 這里的!!是為了屏蔽gcc的警告  
{  
    s.push_back(w);  
    c += n;  
}  
FILE* fp = fopen("test_converter.txt", "wb");  
fwrite(s.c_str(), sizeof(wchar_t), s.length(), fp);  
fclose(fp);

上面這一小段代碼是將一段UTF-8的文字逐字符轉換為wchar_t，并一個個push_back到wstring里，最后把轉換完畢的字符串輸出到test_converter.txt里。

其實上面的泛型還是顯得累贅了。為什么不直接在transform::utf上使用泛型參數呢？
一開始只想到上面那個方法，自然是由于慣性的想要手動指定如何轉換編碼的緣故，比如最開始的想法，是想做成類似這樣的模板：utf<8, 32>(s1, s2)，指定兩個數字，來決定輸入和輸出的格式。

后來發現，直接指定字符串/字符的類型或許更加直接些。
現在回頭再看看，其實轉換所需要的字長（8、16、32）已經在參數的類型中指定了：8bits的char或byte類型肯定不會是用來存放UTF-32的嘛。
所以只需要把上面核心算法的參數泛型化就可以了。這時代碼就會寫成下面這個樣子：

namespace transform  
{  
    namespace private_  
    {  
        template <size_t X> struct utf_type;  
        template <>         struct utf_type<1> { typedef uint8  type_t; };  
        template <>         struct utf_type<2> { typedef uint16 type_t; };  
        template <>         struct utf_type<4> { typedef uint32 type_t; };  
   
        template <typename T, size_t X>  
        struct check  
        {  
            static const bool value =  
                ((sizeof(T) == sizeof(typename utf_type<X>::type_t)) && !is_pointer<T>::value);  
        }  
    }  
   
    using namespace transform::private_;  
   
    /* 
        UTF-32 to UTF-8 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 4>::value && check<U, 1>::value,  
    size_t>::type_t utf(T src, U* des)  
    {  
        if (src == 0) return 0;  
   
        static const byte PREFIX[] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };  
        static const uint32 CODE_UP[] =  
        {  
            0x80,           // U+00000000 - U+0000007F  
            0x800,          // U+00000080 - U+000007FF  
            0x10000,        // U+00000800 - U+0000FFFF  
            0x200000,       // U+00010000 - U+001FFFFF  
            0x4000000,      // U+00200000 - U+03FFFFFF  
            0x80000000      // U+04000000 - U+7FFFFFFF  
        };  
   
        size_t i, len = sizeof(CODE_UP) / sizeof(uint32);  
        for(i = 0; i < len; ++i)  
            if (src < CODE_UP[i]) break;  
   
        if (i == len) return 0; // the src is invalid  
   
        len = i + 1;  
        if (des)  
        {  
            for(; i > 0; --i)  
            {  
                des[i] = static_cast<U>((src & 0x3F) | 0x80);  
                src >>= 6;  
            }  
            des[0] = static_cast<U>(src | PREFIX[len - 1]);  
        }  
        return len;  
    }  
   
    /* 
        UTF-8 to UTF-32 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 1>::value && check<U, 4>::value,  
    size_t>::type_t utf(const T* src, U& des)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        uint8 b = *(src++);  
   
        if (b < 0x80)  
        {  
            des = b;  
            return 1;  
        }  
   
        if (b < 0xC0 || b > 0xFD) return 0; // the src is invalid  
   
        size_t len;  
   
        if (b < 0xE0)  
        {  
            des = b & 0x1F;  
            len = 2;  
        }  
        else  
        if (b < 0xF0)  
        {  
            des = b & 0x0F;  
            len = 3;  
        }  
        else  
        if (b < 0xF8)  
        {  
            des = b & 0x07;  
            len = 4;  
        }  
        else  
        if (b < 0xFC)  
        {  
            des = b & 0x03;  
            len = 5;  
        }  
        else  
        {  
            des = b & 0x01;  
            len = 6;  
        }  
   
        size_t i = 1;  
        for (; i < len; ++i)  
        {  
            b = *(src++);  
            if (b < 0x80 || b > 0xBF) return 0; // the src is invalid  
            des = (des << 6) + (b & 0x3F);  
        }  
        return len;  
    }  
   
    /* 
        UTF-32 to UTF-16 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 4>::value && check<U, 2>::value,  
    size_t>::type_t utf(T src, U* des)  
    {  
        if (src == 0) return 0;  
   
        if (src <= 0xFFFF)  
        {  
            if (des) (*des) = static_cast<U>(src);  
            return 1;  
        }  
        else  
        if (src <= 0xEFFFF)  
        {  
            if (des)  
            {  
                des[0] = static_cast<U>(0xD800 + (src >> 10) - 0x40);  // high  
                des[1] = static_cast<U>(0xDC00 + (src & 0x03FF));      // low  
            }  
            return 2;  
        }  
        return 0;  
    }  
   
    /* 
        UTF-16 to UTF-32 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 2>::value && check<U, 4>::value,  
    size_t>::type_t utf(const T* src, U& des)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        uint16 w1 = src[0];  
        if (w1 >= 0xD800 && w1 <= 0xDFFF)  
        {  
            if (w1 < 0xDC00)  
            {  
                uint16 w2 = src[1];  
                if (w2 >= 0xDC00 && w2 <= 0xDFFF)  
                {  
                    des = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10);  
                    return 2;  
                }  
            }  
            return 0; // the src is invalid  
        }  
        else  
        {  
            des = w1;  
            return 1;  
        }  
    }  
   
    /* 
        UTF-16 to UTF-8 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 2>::value && check<U, 1>::value,  
    size_t>::type_t utf(T src, U* des)  
    {  
        // make utf-16 to utf-32  
        uint32 tmp;  
        if (utf(&src, tmp) != 1) return 0;  
        // make utf-32 to utf-8  
        return utf(tmp, des);  
    }  
   
    /* 
        UTF-8 to UTF-16 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 1>::value && check<U, 2>::value,  
    size_t>::type_t utf(const T* src, U& des)  
    {  
        // make utf-8 to utf-32  
        uint32 tmp;  
        size_t len = utf(src, tmp);  
        if (len == 0) return 0;  
        // make utf-32 to utf-16  
        if (utf(tmp, &des) != 1) return 0;  
        return len;  
    }  
   
    /* 
        UTF-X: string to string 
    */  
   
    template <typename T, typename U>  
    typename enable_if<check<T, 4>::value && (check<U, 1>::value || check<U, 2>::value),  
    size_t>::type_t utf(const T* src, U* des)   // UTF-32 to UTF-X(8/16)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        for(; *src; ++src)  
        {  
            size_t len = utf(*src, des);  
            if (len == 0) break;  
            if (des) des += len;  
            num += len;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
   
    template <typename T, typename U>  
    typename enable_if<(check<T, 1>::value || check<T, 2>::value) && check<U, 4>::value,  
    size_t>::type_t utf(const T* src, U* des)   // UTF-X(8/16) to UTF-32  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        while(*src)  
        {  
            uint32 tmp;  
            size_t len = utf(src, tmp);  
            if (len == 0) break;  
            if (des)  
            {  
                (*des) = tmp;  
                ++des;  
            }  
            src += len;  
            num += 1;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
   
    template <typename T, typename U>  
    typename enable_if<(check<T, 1>::value && check<U, 2>::value) ||  
                       (check<T, 2>::value && check<U, 1>::value),  
    size_t>::type_t utf(const T* src, U* des)    // UTF-X(8/16) to UTF-Y(16/8)  
    {  
        if (!src || (*src) == 0) return 0;  
   
        size_t num = 0;  
        while(*src)  
        {  
            // make utf-x to utf-32  
            uint32 tmp;  
            size_t len = utf(src, tmp);  
            if (len == 0) break;  
            src += len;  
            // make utf-32 to utf-y  
            len = utf(tmp, des);  
            if (len == 0) break;  
            if (des) des += len;  
            num += len;  
        }  
        if (des) (*des) = 0;  
        return num;  
    }  
}

這樣用起來就更加簡單了：

const char* c = "你好世界";  
size_t n = nx::transform::utf(c, (wchar_t*)0);

完整代碼請參考：
https://code.google.com/p/nixy/source/browse/trunk/nixycore/string/transform.h

到此，關于“UTF-8、UTF-16、UTF-32編碼相互轉換的方法”的學習就結束了，希望能夠解決大家的疑惑。理論與實踐的搭配能更好的幫助大家學習，快去試試吧！若想繼續學習更多相關知識，請繼續關注億速云網站，小編會繼續努力為大家帶來更多實用的文章！

向AI問一下細節

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

UTF-8、UTF-16、UTF-32編碼相互轉換的方法

猜你喜歡

中文字幕av专区_日韩电影在线播放_精品国产精品久久一区免费式_av在线免费观看网站

UTF-8、UTF-16、UTF-32編碼相互轉換的方法

猜你喜歡

最新資訊

相關推薦

相關標簽