Unicode | Programming Tips

Unicode code unit types:

UTF-8 – char – std::string
UTF-16 – char16_t – std::u16string
UTF-32 – char32_t – std::u32string

Unicode string literals:

char const a[] { u8"Hello, \u2603!" };
char const b[] { u8"Hello, ☃!" };
char16_t const c[] { u"Hello, \u2603!" };
char16_t const d[] { u"Hello, ☃!" };
char32_t const e[] { U"Hello, \u2603!" };
char32_t const f[] { U"Hello, ☃!" };

sizeof(a) == sizeof(b) == 12;
sizeof(c) == sizeof(d) == 20;
sizeof(e) == sizeof(f) == 40;

char a{'x'}; char a{'\u0078'}; but char b{'☃'}; // Wrong
char16_t e{u'☃'}; char16_t e{u'\u2603'}; but char16_t f{u'🍸'}; // Wrong
char32_t i{U'\U0001F378'};

std::string a{u8"Hello, \u2603!"}; std::string b{u8"Hello, ☃!"};
std::u16string c{u"Hello, \u2603!"}; std::u16string d{u"Hello, ☃!"};
std::u32string e{U"Hello, \u2603!"}; std::u32string f{U"Hello, ☃!"};

std::basic_string

– sequence of code units(!)
– NOT code points, characters, text elements

std::string a{"Hello, "};
a.push_back('\u2603'); // WRONG
char const snowman[]{u8"\u2603"};
a.insert(a.end(), begin(snowman), end(snowman) - 1); // - 1: skip the terminating '\0'

std::u16string a{u"Hello, "};
a.push_back(u'\U0001F378'); // WRONG
char16_t const glass[]{u"\U0001F378"};
a.insert(a.end(), begin(glass), end(glass) - 1); // - 1: skip the terminating u'\0'

std::u32string a{U"A glass: "};
a.push_back(U'\U0001F378');

String length…

# of bytes:
"1Ä🍸" -> 1 U+0031
A U+0041
¨ U+0308
🍸 U+1F378

UTF-8 = 8
UTF-16 = 10
UTF-32 = 16

# of Code Units

UTF-8 = 8
UTF-16 = 5
UTF-32 = 4

# of Code Points

UTF-8 = 4
UTF-16 = 4
UTF-32 = 4

# of text elements

UTF-8 = 3
UTF-16 = 3
UTF-32 = 3

std::string s{u8"1Ä🍸"};
std::size_t number_of_code_units{s.length()};

Text manipulations

std::locale loc{"en-US"};

char lowercase_a{'a'};
char uppercase_a{std::toupper(lowercase_a, loc)}; // A

– all functions are code unit based (isspace, isalpha, isalnum…)

Conversion using <codecvt>

std::string utf8 { u8"1Ä🍸" };
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf32_converter;
std::u32string utf32 { utf32_converter.from_bytes(utf8) };

Boost.Locale

boost::locale::generator gen{};
std::locale loc { gen.generate("en_US.UTF-8") };

// Latin Capital Letter A With Diaeresis
std::string a { u8"1Ä🍸" };
std::u16string b { boost::locale::conv::utf_to_utf<char16_t>(a) };
std::u32string c { boost::locale::conv::utf_to_utf<char32_t>(a) };
std::u32string d { boost::locale::conv::utf_to_utf<char32_t>(b) };
std::string e { boost::locale::conv::utf_to_utf<char>(b) };

// Алло
std::string s{"\xB0\xDB\xDB\xDE"};
std::string utf8_s{boost::locale::conv::to_utf<char>(s, "ISO-8859-5")};

– Provides string-based case manipulation (to_upper, to_lower, to_title, fold_case)

Boundary Analysis

namespace ba = boost::locale::boundary;
std::string subject { u8"1Ä🍸" };
ba::segment_index<std::string::const_iterator> map(ba::character,subject.begin(),subject.end(),loc);
size_t const byte_length { subject.size() }; // 8
size_t const text_length { static_cast<size_t>(std::distance(map.begin(),map.end())) }; // 3

mbrtoc32

setlocale(LC_CTYPE, "en_US.UTF-8");
char const utf8_c[5] = "\U0001f378";
char32_t utf32_c = 0;
mbstate_t state = { 0 };
mbrtoc32 (&utf32_c, utf8_c, 4, &state);
printf ("0x%8x\n", utf32_c);

setlocale(LC_CTYPE, "en_US.UTF-8");
char const utf8_c[5] = "\U0001f378";
char16_t utf16_c[2] = { 0 };
mbstate_t state = { 0 };
mbrtoc16 (&utf16_c[0], utf8_c, 4, &state);
mbrtoc16 (&utf16_c[1], utf8_c, 4, &state);
printf ("0x%4x 0x%4x\n", utf16_c[0], utf16_c[1]);

some useful tips (mostly for myself)