Unicode code unit types:
UTF-8 – char – std::string
UTF-16 – char16_t – std::u16string
UTF-32 – char32_t – std::u32string
Unicode string literals:
char const a[] { u8"Hello, \u2603!"};
char const b[] { u8"Hello, ☃!"};
char16_t const c[] { u"Hello, \u2603!"};
char16_t const d[] { u"Hello, ☃!"};
char32_t const e[] { U"Hello, \u2603!"};
char32_t const f[] { U"Hello, ☃!"};
sizeof(a) == sizeof(b) == 12
sizeof(c) == sizeof(d) == 20;
sizeof(e) == sizeof(f) == 40;
char a{'x'}; char a{'\u0078'}; but char b{'☃'}; // Wrong
char16_t e{u'☃'}; char16_t e{u'\u2603'}; but char16_t f{u'🍸'}; // Wrong
char32_t i{U'\U0001F378'};
std::string a{u8"Hello, \u2603!"}; std::string b{u8"Hello, ☃!"};
std::u16string c{u"Hello, \u2603!"}; std::u16string d{u"Hello, ☃!"};
std::u32string e{U"Hello, \u2603!"}; std::u32string f{U"Hello, ☃!"};
std::basic_string
– sequence of code units(!)
– NOT code points, characters, text elements
std::string a{"Hello, "};
a.push_back('\u2603'); // WRONG
char const snowman[]{u8"\u2603"};
a.insert(a.end(), begin(snowman), end(snowman) - 1); // - 1 skips the terminating NUL
std::u16string a{u"Hello, "};
a.push_back(u'\U0001F378'); // WRONG
char16_t const glass[]{u"\U0001F378"};
a.insert(a.end(), begin(glass), end(glass) - 1); // - 1 skips the terminating NUL
std::u32string a{U"A glass: "};
a.push_back(U'\U0001F378');
String length…
# of bytes:
“1Ä🍸” -> 1 U+0031
A U+0041
¨ U+0308
🍸 U+1F378
UTF-8 = 8
UTF-16 = 10
UTF-32 = 16
# of Code Units
UTF-8 = 8
UTF-16 = 5
UTF-32 = 4
# of Code Points
UTF-8 = 4
UTF-16 = 4
UTF-32 = 4
# of text elements
UTF-8 = 3
UTF-16 = 3
UTF-32 = 3
std::string s{u8"1Ä🍸"};
std::size_t number_of_code_units{s.length()};
Text manipulations
std::locale loc{"en-US"};
char lowercase_a{'a'};
char uppercase_a{std::toupper(lowercase_a, loc)}; //A
– all functions are code unit based (isspace, isalpha, isalnum…)
Conversion using <codecvt>
std::string utf8 { u8"1Ä🍸" };
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf32_converter;
std::u32string utf32 { utf32_converter.from_bytes(utf8) };
Boost.Locale
boost::locale::generator gen{};
std::locale loc { gen.generate("en-US.UTF-8") };
// Latin Capital Letter A With Diaeresis
std::string a { u8"1Ä🍸" };
std::u16string b { boost::locale::conv::utf_to_utf<char16_t>(a) };
std::u32string c { boost::locale::conv::utf_to_utf<char32_t>(a) };
std::u32string d { boost::locale::conv::utf_to_utf<char32_t>(b) };
std::string e { boost::locale::conv::utf_to_utf<char>(b) };
// Алло
std::string s{"\xB0\xDB\xDB\xDE"};
std::string utf8_s{boost::locale::conv::to_utf<char>(s,"ISO-8859-5")};
– Provides string-based case manipulation (to_upper, to_lower, to_title, fold_case)
Boundary Analysis
namespace ba = boost::locale::boundary;
std::string subject { u8"1Ä🍸" };
ba::segment_index<std::string::const_iterator> map(ba::character,subject.begin(),subject.end(),loc);
size_t const byte_length { subject.size() }; // 8
size_t const text_length { std::distance(map.begin(),map.end()) }; // 3
mbrtoc32
setlocale(LC_CTYPE,"en_US.UTF-8");
char const utf8_c[5] = "\U0001f378";
char32_t utf32_c = 0;
mbstate_t state = { 0 };
mbrtoc32 (&utf32_c, utf8_c, 4, &state);
printf ("0x%8x\n", utf32_c);
setlocale(LC_CTYPE,"en_US.UTF-8");
char const utf8_c[5] = "\U0001f378";
char16_t utf16_c[2] = { 0 };
mbstate_t state = { 0 };
mbrtoc16 (&utf16_c[0],utf8_c, 4, &state);
mbrtoc16 (&utf16_c[1],utf8_c, 4, &state);
printf ("0x%4x 0x%4x\n",utf16_c[0],utf16_c[1]);