ICU 编解码数据:
// Excerpt from ICU: extern declarations of the UConverterSharedData objects.
// Judging by the identifiers there is presumably one object per built-in
// converter implementation (MBCS, Latin-1, the UTF-8/16/32 family, ISO-2022,
// the LMBCS variants, HZ, ISCII, SCSU, ASCII, UTF-7, BOCU-1, CESU-8, IMAP,
// Compound Text) -- confirm against the ICU sources.
// NOTE(review): the U_CDECL_BEGIN that pairs with the trailing U_CDECL_END is
// not part of this excerpt.
extern const UConverterSharedData
_MBCSData, _Latin1Data,
_UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData,
_ISO2022Data,
_LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
_LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
_HZData,_ISCIIData, _SCSUData, _ASCIIData,
_UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData;
U_CDECL_END
Code point 代码点 Byte 1 Byte 2 Byte 3 Byte 4
U+0000 .. U+007F 0xxxxxxx
U+0080 .. U+07FF 110xxxxx 10xxxxxx
U+0800 .. U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
U+10000 .. U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
可变长度编码
🚵🏻♀️ is U+1F6B5 + U+1F3FB + U+200D + U+2640 + U+FE0F
🤦🏼‍♂️ 由 5 个代码点(U+1F926 U+1F3FC U+200D U+2642 U+FE0F)
组成,但这一事实仅仅是实现细节。它不应该被拆开,不应该被计为多个字符,文本光标不应该停在它的中间,也不应该被部分选中。
关键代码
/// Returns the total number of bytes in the UTF-8 sequence introduced by
/// `lead_byte`, decided purely from its high-order bit pattern:
///
///   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4
///
/// Any other pattern (a continuation byte 10xxxxxx, or 11111xxx) yields 0.
/// Only the prefix bits are inspected: lead bytes that can never start a
/// well-formed sequence (0xC0, 0xC1, 0xF5..0xF7) are still reported as 2 or 4,
/// so full validation is left to the caller.
std::size_t sequence_length(char8_t lead_byte)
{
    const unsigned int lead = static_cast<unsigned char>(lead_byte);

    if ((lead & 0x80u) == 0x00u) {
        return 1;
    }
    if ((lead & 0xE0u) == 0xC0u) {
        return 2;
    }
    if ((lead & 0xF0u) == 0xE0u) {
        return 3;
    }
    if ((lead & 0xF8u) == 0xF0u) {
        return 4;
    }
    return 0;
}
/// Appends the UTF-8 encoding of `code_point` to `utf8str`.
///
/// @param code_point  Code point to encode.
/// @param utf8str     Destination string; one to four bytes are appended.
/// @return UTF_ERROR::INVALID_CODE_POINT (with `utf8str` untouched) when
///         is_code_point_valid() rejects the value, otherwise UTF_ERROR::OK.
///
/// For the multi-byte forms add_capacity_if_needed() is called with the byte
/// count before anything is appended (presumably to grow the buffer up front --
/// the helper is defined elsewhere).
UTF_ERROR encode_next_utf8(const char32_t code_point, std::u8string &utf8str)
{
    if (!is_code_point_valid(code_point)) {
        return UTF_ERROR::INVALID_CODE_POINT;
    }

    // Low six bits of `bits`, tagged as a continuation byte (10xxxxxx).
    const auto continuation = [](const char32_t bits) {
        return static_cast<char8_t>((bits & 0x3f) | 0x80);
    };

    if (code_point < 0x80) {
        // 0xxxxxxx
        utf8str.push_back(static_cast<char8_t>(code_point));
        return UTF_ERROR::OK;
    }

    if (code_point < 0x800) {
        // 110xxxxx 10xxxxxx
        add_capacity_if_needed(utf8str, 2);
        utf8str.push_back(static_cast<char8_t>((code_point >> 6) | 0xc0));
        utf8str.push_back(continuation(code_point));
        return UTF_ERROR::OK;
    }

    if (code_point < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        add_capacity_if_needed(utf8str, 3);
        utf8str.push_back(static_cast<char8_t>((code_point >> 12) | 0xe0));
        utf8str.push_back(continuation(code_point >> 6));
        utf8str.push_back(continuation(code_point));
        return UTF_ERROR::OK;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    add_capacity_if_needed(utf8str, 4);
    utf8str.push_back(static_cast<char8_t>((code_point >> 18) | 0xf0));
    utf8str.push_back(continuation(code_point >> 12));
    utf8str.push_back(continuation(code_point >> 6));
    utf8str.push_back(continuation(code_point));
    return UTF_ERROR::OK;
}
C++ 标准库
#include <codecvt>
#include <fstream>
#include <locale>
#include <string>
#include <type_traits>
/// Converts a UTF-8 encoded byte string to a wide string.
///
/// The conversion is done entirely in memory. The previous implementation
/// round-tripped through a scratch file named " cvt_buf " and read it back with
/// `operator>>`, which returned only the first whitespace-delimited token (so
/// "hello world" became L"hello"), left a file behind in the working directory,
/// and required a locale literally named "C.UTF-8" to be installed.
///
/// The wide encoding follows the platform's wchar_t: UTF-16 where wchar_t is
/// 16 bits wide, UCS-4/UTF-32 otherwise.
///
/// @param input  UTF-8 text; may be empty and may contain whitespace.
/// @return The same text as a std::wstring.
/// @throws std::range_error (a std::runtime_error) if `input` is not valid UTF-8.
std::wstring to_wide_string(const std::string &input)
{
    // codecvt_utf8<wchar_t> yields UCS-2 when wchar_t is 16 bits and cannot
    // represent code points above U+FFFF, so use the UTF-16 facet there.
    using utf8_facet = std::conditional_t<sizeof(wchar_t) == 2,
                                          std::codecvt_utf8_utf16<wchar_t>,
                                          std::codecvt_utf8<wchar_t>>;

    std::wstring_convert<utf8_facet, wchar_t> converter;
    return converter.from_bytes(input);
}
/// Converts a wide string to a UTF-8 encoded byte string.
///
/// The conversion is done entirely in memory. The previous implementation
/// wrote the text to a scratch file named " cvt_buf " and read it back with
/// `operator>>`, which returned only the first whitespace-delimited token (so
/// L"hello world" became "hello"), left a file behind in the working directory,
/// and required a locale literally named "C.UTF-8" to be installed.
///
/// The wide input is interpreted according to the platform's wchar_t: UTF-16
/// where wchar_t is 16 bits wide, UCS-4/UTF-32 otherwise.
///
/// @param input  Wide text; may be empty and may contain whitespace.
/// @return The same text encoded as UTF-8.
/// @throws std::range_error (a std::runtime_error) if `input` cannot be encoded.
std::string to_byte_string(const std::wstring &input)
{
    // codecvt_utf8<wchar_t> treats 16-bit wchar_t as UCS-2 and would mangle
    // surrogate pairs, so use the UTF-16 facet there.
    using utf8_facet = std::conditional_t<sizeof(wchar_t) == 2,
                                          std::codecvt_utf8_utf16<wchar_t>,
                                          std::codecvt_utf8<wchar_t>>;

    std::wstring_convert<utf8_facet, wchar_t> converter;
    return converter.to_bytes(input);
}
/// Decodes the UTF-8 byte string `str` into a UTF-32 string using
/// std::wstring_convert with the std::codecvt_utf8<char32_t> facet.
/// The converter is default-constructed, so a conversion failure is reported
/// by the std::range_error that from_bytes() throws.
std::u32string to_utf32(std::string str)
{
    using utf8_utf32_converter =
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    utf8_utf32_converter converter;
    return converter.from_bytes(str);
}
/// Encodes the UTF-32 string `str32` as a UTF-8 byte string using
/// std::wstring_convert with the std::codecvt_utf8<char32_t> facet.
/// The converter is default-constructed, so a conversion failure is reported
/// by the std::range_error that to_bytes() throws.
std::string to_utf8(std::u32string str32)
{
    using utf8_utf32_converter =
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    utf8_utf32_converter converter;
    return converter.to_bytes(str32);
}
参考
The Absolute Minimum Every Software Developer Must Know About Unicode in 2023 (Still No Excuses!) @ tonsky.me
GitHub - soasis/text: A spicy text library for C++ that has the explicit goal of enabling the entire ecosystem to share in proper forward progress towards a bright Unicode future.
utfcpp-3.2.1.tar.gz · src-openEuler/utf8cpp - Gitee.com
GitHub - nemtrif/utfcpp: UTF-8 with C++ in a Portable Way
GitHub - nemtrif/utfcpp20: Unicode encodings with C++20