1#ifndef MSTL_CORE_STRING_CHARACTER_HPP__
2#define MSTL_CORE_STRING_CHARACTER_HPP__
3#include "../interface/icharacter.hpp"
// Helper that appends the UTF-8 bytes of U+FFFD (EF BF BD) to `result`.
// The version taking an unnamed T& has an empty body and appends nothing.
9MSTL_ALWAYS_INLINE
constexpr void __append_utf8_char_aux(T&) {}
// `string` version: appends the three replacement-character bytes.
11MSTL_CONSTEXPR20
void __append_utf8_char_aux<string>(
string& result) {
12 result.append(
"\xEF\xBF\xBD", 3);
// The u8string version below is only compiled when MSTL_STANDARD_20__ is defined.
14#ifdef MSTL_STANDARD_20__
// `u8string` version: the same three bytes, written from a u8 literal.
16MSTL_CONSTEXPR20
void __append_utf8_char_aux<u8string>(u8string& result) {
17 result.append(u8
"\xEF\xBF\xBD", 3);
// Appends the code point `cp` to `result` as UTF-8 code units of type T.
// The visible branches emit two units for cp <= 0x7FF, three units for
// cp <= 0xFFFF, and a four-unit sequence with a 0xF0-based lead otherwise.
// NOTE(review): the conditions guarding the helper call and the single-unit
// store are not visible here - presumably "cp out of range" and "cp is
// ASCII" respectively; confirm.
22MSTL_CONSTEXPR20
void append_utf8_char(basic_string<T>& result,
uint32_t cp) {
// Replacement path: delegates to the helper that appends U+FFFD.
24 _INNER __append_utf8_char_aux(result);
// One code unit: cp is stored unchanged.
29 result.push_back(
static_cast<T
>(cp));
30 }
else if (cp <= 0x7FF) {
// Two units: 110xxxxx 10xxxxxx (top 5 bits, then low 6 bits).
31 result.push_back(
static_cast<T
>(0xC0 | (cp >> 6)));
32 result.push_back(
static_cast<T
>(0x80 | (cp & 0x3F)));
33 }
else if (cp <= 0xFFFF) {
// Three units: 1110xxxx 10xxxxxx 10xxxxxx.
34 result.push_back(
static_cast<T
>(0xE0 | (cp >> 12)));
35 result.push_back(
static_cast<T
>(0x80 | ((cp >> 6) & 0x3F)));
36 result.push_back(
static_cast<T
>(0x80 | (cp & 0x3F)));
// Four units: 11110xxx followed by three 10xxxxxx continuation units.
38 result.push_back(
static_cast<T
>(0xF0 | (cp >> 18)));
39 result.push_back(
static_cast<T
>(0x80 | ((cp >> 12) & 0x3F)));
40 result.push_back(
static_cast<T
>(0x80 | ((cp >> 6) & 0x3F)));
41 result.push_back(
static_cast<T
>(0x80 | (cp & 0x3F)));
// Decodes one UTF-8 sequence from `data` (of length `len`) into `cp`.
// `i` is taken by reference. Returns false when the remaining input is too
// short or a continuation byte is not of the form 10xxxxxx. The three-byte
// form also rejects overlong encodings and surrogate code points; the
// four-byte form rejects anything outside 0x10000..0x10FFFF.
// NOTE(review): the reads of b1..b4 and the advancement of `i` are not
// visible here. The bounds checks suggest `i` already points past the lead
// byte when they run - confirm, and confirm how far `i` has moved when the
// function returns false.
45constexpr bool decode_utf8_char(
const byte_t*
data,
size_t& i,
const size_t len,
uint32_t& cp)
noexcept {
// High bit clear: single-byte lead.
52 if ((b1 & 0x80) == 0) {
55 }
else if ((b1 & 0xE0) == 0xC0) {
// 110xxxxx lead: one continuation byte must still be available.
56 if (i >= len)
return false;
// The continuation byte must match 10xxxxxx.
58 if ((b2 & 0xC0) != 0x80)
return false;
// 5 payload bits from the lead, 6 from the continuation byte.
59 cp = ((b1 & 0x1F) << 6) | (b2 & 0x3F);
61 }
else if ((b1 & 0xF0) == 0xE0) {
// 1110xxxx lead: two continuation bytes must still be available.
62 if (i + 1 >= len)
return false;
65 if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80)
return false;
// 4 payload bits from the lead, 6 from each continuation byte.
66 cp = ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
// Reject overlong forms (below 0x800) and the surrogate range D800..DFFF.
67 return cp >= 0x800 && !(cp >= 0xD800 && cp <= 0xDFFF);
68 }
else if ((b1 & 0xF8) == 0xF0) {
// 11110xxx lead: three continuation bytes must still be available.
69 if (i + 2 >= len)
return false;
73 if ((b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80 || (b4 & 0xC0) != 0x80)
return false;
// 3 payload bits from the lead, 6 from each continuation byte.
74 cp = ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
// Accept only 0x10000..0x10FFFF (rejects overlong and too-large values).
75 return cp >= 0x10000 && cp <= 0x10FFFF;
// Reads a code point from the UTF-16 units in `obj`, starting at index `i`.
// Results are delivered through `cp` and `chars_consumed`; `i` is passed by
// value, so the caller's index is not modified by this function.
// NOTE(review): the surrogate tests and the check of `i + 1` against `len`
// are not visible here - confirm obj[i + 1] is only read when i + 1 < len.
83constexpr bool get_utf16_codepoint(
const T* obj,
size_t i,
const size_t len,
uint32_t& cp,
size_t& chars_consumed) {
// First code unit, widened to 32 bits.
84 const auto c1 =
static_cast<uint32_t>(obj[i]);
// Following code unit, widened to 32 bits.
89 const auto c2 =
static_cast<uint32_t>(obj[i + 1]);
// Surrogate-pair handling over the UTF-16 units in `obj`. Unlike
// get_utf16_codepoint, the index `i` is taken by reference here, and the
// result is delivered through `cp`.
// NOTE(review): the pairing logic and the check of `i + 1` against `len` are
// not visible here - confirm obj[i + 1] is only read when i + 1 < len.
108constexpr bool handle_utf16_surrogate_pair(
109 const T* obj,
size_t& i,
const size_t len,
uint32_t& cp)
noexcept {
// First code unit, widened to 32 bits.
110 const auto c1 =
static_cast<uint32_t>(obj[i]);
// Following code unit, widened to 32 bits.
113 const auto c2 =
static_cast<uint32_t>(obj[i + 1]);
// Predicate over a 32-bit code point; constexpr and noexcept.
// codepoint_to_utf16 and codepoint_to_wchar emit U+FFFD when it returns false.
// NOTE(review): the body is not visible here - confirm the exact accepted range.
133constexpr bool is_valid_unicode_codepoint(
const uint32_t cp)
noexcept {
// Appends the code point `cp` to `result` as UTF-16 code units of type T.
// U+FFFD is appended when is_valid_unicode_codepoint rejects `cp` or when
// `cp` lies in the surrogate range. Otherwise `cp` is stored either as one
// unit or as a high/low surrogate pair.
// NOTE(review): the condition separating the single-unit store from the
// surrogate-pair path is not visible here - presumably cp < 0x10000; confirm.
138MSTL_CONSTEXPR20
void codepoint_to_utf16(basic_string<T>& result,
uint32_t cp) {
139 if (!
_INNER is_valid_unicode_codepoint(cp)) {
140 result.push_back(0xFFFD);
// Surrogate code points are not encodable on their own; substitute U+FFFD.
145 if (cp >= 0xD800 && cp <= 0xDFFF) {
146 result.push_back(0xFFFD);
// Single-unit store of the code point.
148 result.push_back(
static_cast<T
>(cp));
// Pair encoding: subtract 0x10000, then split into two 10-bit halves.
151 const uint32_t adjusted = cp - 0x10000;
// Upper 10 bits go into the D800-based unit.
152 const auto high_surrogate =
static_cast<T
>((adjusted >> 10) + 0xD800);
// Lower 10 bits go into the DC00-based unit.
153 const auto low_surrogate =
static_cast<T
>((adjusted & 0x3FF) + 0xDC00);
154 result.push_back(high_surrogate);
155 result.push_back(low_surrogate);
// Appends the code point `cp` to `result` as wchar_t units.
// U+FFFD is appended when is_valid_unicode_codepoint rejects `cp`.
// Under MSTL_PLATFORM_WINDOWS__ the code point is stored as one wchar_t or as
// a surrogate pair; under MSTL_PLATFORM_LINUX__ it is always one wchar_t.
// NOTE(review): no branch for other platforms is visible here - confirm what
// happens when neither macro is defined.
160MSTL_CONSTEXPR20
void codepoint_to_wchar(basic_string<T>& result,
uint32_t cp) {
161 if (!
_INNER is_valid_unicode_codepoint(cp)) {
162 result.push_back(0xFFFD);
// Windows branch: UTF-16 style output.
166#ifdef MSTL_PLATFORM_WINDOWS__
// Single-unit store of the code point.
168 result.push_back(
static_cast<wchar_t>(cp));
// Pair encoding: subtract 0x10000, then split into two 10-bit halves.
170 const uint32_t adjusted = cp - 0x10000;
171 const wchar_t high_surrogate =
static_cast<wchar_t>((adjusted >> 10) + 0xD800);
172 const wchar_t low_surrogate =
static_cast<wchar_t>((adjusted & 0x3FF) + 0xDC00);
173 result.push_back(high_surrogate);
174 result.push_back(low_surrogate);
// Linux branch: the code point is stored directly in one wchar_t.
176#elif defined(MSTL_PLATFORM_LINUX__)
177 result.push_back(
static_cast<wchar_t>(cp));
// Appends `len` elements of `str` to `result`, one output unit per input
// element. Capacity for the extra elements is reserved before the loop.
// Each element is converted to byte_t first and then to T, so no decoding or
// validation takes place.
181template <
typename T,
typename U>
182MSTL_CONSTEXPR20
void append_ascii_chars(basic_string<T>& result,
const U* str,
size_t len) {
// Grow once instead of on every push_back.
183 result.reserve(result.size() + len);
184 for (
size_t i = 0; i < len; ++i) {
// Going through byte_t keeps the value non-negative before widening to T.
185 result.push_back(
static_cast<T
>(
static_cast<byte_t>(str[i])));
// Generates the special member functions of a character wrapper type T:
//   - defaulted default, copy and move constructors and destructor;
//   - a constructor from value_type that forwards the value to `base`;
//   - copy assignment, which copies other.value_;
//   - move assignment, which copies other.value_ and then resets the source
//     to initialize<package_type>();
//   - assignment from a plain value_type.
// NOTE(review): move assignment has no visible self-assignment guard, so
// moving an object into itself would leave value_ reset - confirm intended.
191#define MSTL_BUILD_PACKAGE_CONSTRUCTOR(T) \
192constexpr T() noexcept = default; \
193constexpr T(const T&) noexcept = default; \
194constexpr T(T&&) noexcept = default; \
195constexpr T(value_type val) noexcept : base(val) {} \
196MSTL_CONSTEXPR20 ~T() = default; \
197constexpr T& operator =(const T& other) noexcept { \
198 value_ = other.value_; \
201constexpr T& operator =(T&& other) noexcept { \
202 value_ = other.value_; \
203 other.value_ = initialize<package_type>(); \
206constexpr T& operator =(value_type other) noexcept { \
// Wrapper type for `char`, derived from icharacter<character, char>.
// Its static to_* functions convert a char string view to the other string
// types. The wide/UTF-16/UTF-32 conversions decode the input as UTF-8 and
// substitute U+FFFD for sequences that decode_utf8_char rejects.
212struct character : icharacter<character, char> {
213 using value_type = char;
214 using base = icharacter<character, char>;
// Constructors, destructor and assignment operators come from the macro.
216 MSTL_BUILD_PACKAGE_CONSTRUCTOR(character)
// char view -> `string`.
218 static MSTL_CONSTEXPR20
string to_string(
const basic_string_view<value_type>& obj) {
// char view (read as UTF-8) -> `wstring`; an empty view gives an empty result.
// NOTE(review): reinterpret_cast is not permitted in constant evaluation, so
// this cannot run at compile time if MSTL_CONSTEXPR20 expands to constexpr - confirm.
222 static MSTL_CONSTEXPR20 wstring to_wstring(
const basic_string_view<value_type>& obj) {
223 if (obj.empty())
return {};
// View the chars as unsigned bytes for the decoder.
226 const auto*
data =
reinterpret_cast<const byte_t*
>(obj.data());
228 const size_t len = obj.size();
// Decoded code points are appended as wchar_t; failures become U+FFFD.
233 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
234 _INNER codepoint_to_wchar(result, cp);
236 _INNER codepoint_to_wchar(result, 0xFFFD);
// to_u8string is only available when MSTL_STANDARD_20__ is defined.
242#ifdef MSTL_STANDARD_20__
// char view -> `u8string`: each char is copied through byte_t to char8_t,
// with no validation of the byte sequence.
243 static MSTL_CONSTEXPR20 u8string to_u8string(
const basic_string_view<value_type>& obj) {
244 if (obj.empty())
return {};
246 result.reserve(obj.size());
247 for (
const char c : obj) {
248 result.push_back(
static_cast<char8_t>(
static_cast<byte_t>(c)));
// char view (read as UTF-8) -> `u16string`; an empty view gives an empty result.
254 static MSTL_CONSTEXPR20 u16string to_u16string(
const basic_string_view<value_type>& obj) {
255 if (obj.empty())
return {};
258 const auto*
data =
reinterpret_cast<const byte_t*
>(obj.data());
260 const size_t len = obj.size();
// NOTE(review): looks like an over-reservation - a UTF-8 sequence should
// never yield more UTF-16 units than it has bytes; confirm `len` suffices.
261 result.reserve(len * 2);
// Decoded code points are appended as UTF-16; failures become U+FFFD.
265 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
266 _INNER codepoint_to_utf16(result, cp);
268 result.push_back(0xFFFD);
// char view (read as UTF-8) -> `u32string`; an empty view gives an empty result.
274 static MSTL_CONSTEXPR20 u32string to_u32string(
const basic_string_view<value_type>& obj) {
275 if (obj.empty())
return {};
278 const auto*
data =
reinterpret_cast<const byte_t*
>(obj.data());
280 const size_t len = obj.size();
// Each decoded code point is one char32_t; failures become U+FFFD.
285 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
286 result.push_back(
static_cast<char32_t>(cp));
288 result.push_back(0xFFFD);
297 using type = character;
// Wrapper type for `wchar_t`, derived from icharacter<wcharacter, wchar_t>.
// Its static to_* functions convert a wchar_t string view to the other
// string types. Under MSTL_PLATFORM_WINDOWS__ the input is read as UTF-16;
// under MSTL_PLATFORM_LINUX__ each wchar_t is used as one code point.
// NOTE(review): no branch for other platforms is visible here - confirm.
305struct wcharacter : icharacter<wcharacter, wchar_t> {
306 using value_type = wchar_t;
307 using base = icharacter<wcharacter, wchar_t>;
// Constructors, destructor and assignment operators come from the macro.
309 MSTL_BUILD_PACKAGE_CONSTRUCTOR(wcharacter)
// wchar_t view -> UTF-8 `string`; an empty view gives an empty result.
311 static MSTL_CONSTEXPR20
string to_string(
const basic_string_view<value_type>& obj) {
312 if (obj.empty())
return {};
315#ifdef MSTL_PLATFORM_WINDOWS__
// The loop header has no increment.
// NOTE(review): `i` is presumably advanced by chars_consumed in the body - confirm.
316 for (
size_t i = 0; i < obj.size(); ) {
318 size_t chars_consumed;
// Valid code points are re-encoded as UTF-8; failures become U+FFFD.
319 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
320 _INNER append_utf8_char(result, cp);
323 _INNER append_utf8_char(result, 0xFFFD);
327#elif defined(MSTL_PLATFORM_LINUX__)
// Each wchar_t is passed on as a code point.
328 for (
const value_type i : obj) {
329 _INNER append_utf8_char(result, i);
// wchar_t view -> `wstring`.
335 static MSTL_CONSTEXPR20 wstring to_wstring(
const basic_string_view<value_type>& obj) {
// to_u8string is only available when MSTL_STANDARD_20__ is defined.
339#ifdef MSTL_STANDARD_20__
// wchar_t view -> `u8string`; same structure as to_string.
340 static MSTL_CONSTEXPR20 u8string to_u8string(
const basic_string_view<value_type>& obj) {
341 if (obj.empty())
return {};
344#ifdef MSTL_PLATFORM_WINDOWS__
345 for (
size_t i = 0; i < obj.size(); ) {
347 size_t chars_consumed;
// Valid code points are re-encoded as UTF-8; failures become U+FFFD.
348 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
349 _INNER append_utf8_char(result, cp);
352 _INNER append_utf8_char(result, 0xFFFD);
356#elif defined(MSTL_PLATFORM_LINUX__)
// Each wchar_t is passed on as a code point.
357 for (
const value_type i : obj) {
358 _INNER append_utf8_char(result, i);
// wchar_t view -> `u16string`; an empty view gives an empty result.
365 static MSTL_CONSTEXPR20 u16string to_u16string(
const basic_string_view<value_type>& obj) {
366 if (obj.empty())
return {};
369#ifdef MSTL_PLATFORM_WINDOWS__
// Units are copied one-to-one through uint16_t, without validation.
370 result.reserve(obj.size());
371 for (
size_t i = 0; i < obj.size(); ++i) {
372 result.push_back(
static_cast<char16_t>(
static_cast<uint16_t>(obj[i])));
374#elif defined(MSTL_PLATFORM_LINUX__)
// Reserves two units per input element, then encodes each as UTF-16.
375 result.reserve(obj.size() * 2);
376 for (
const value_type i : obj) {
377 _INNER codepoint_to_utf16(result, i);
// wchar_t view -> `u32string`; an empty view gives an empty result.
383 static MSTL_CONSTEXPR20 u32string to_u32string(
const basic_string_view<value_type>& obj) {
384 if (obj.empty())
return {};
386 result.reserve(obj.size());
388#ifdef MSTL_PLATFORM_WINDOWS__
389 for (
size_t i = 0; i < obj.size(); ) {
391 size_t chars_consumed;
// Each valid code point is one char32_t; failures become U+FFFD.
392 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
393 result.push_back(
static_cast<char32_t>(cp));
396 result.push_back(0xFFFD);
400#elif defined(MSTL_PLATFORM_LINUX__)
// Each wchar_t is cast directly to char32_t, without validation.
401 for (
const value_type i : obj) {
402 result.push_back(
static_cast<char32_t>(i));
411 using type = wcharacter;
415 using type = wchar_t;
419#ifdef MSTL_STANDARD_20__
// Wrapper type for `char8_t`, derived from icharacter<u8character, char8_t>.
// Its static to_* functions convert a char8_t string view to the other
// string types. The wide/UTF-16/UTF-32 conversions decode the input as UTF-8
// and substitute U+FFFD for sequences that decode_utf8_char rejects.
421struct u8character : icharacter<u8character, char8_t> {
422 using value_type = char8_t;
423 using base = icharacter<u8character, char8_t>;
// Constructors, destructor and assignment operators come from the macro.
425 MSTL_BUILD_PACKAGE_CONSTRUCTOR(u8character)
// char8_t view -> `string`: the units are copied one-to-one, no transcoding.
427 static MSTL_CONSTEXPR20
string to_string(
const basic_string_view<value_type>& obj) {
428 if (obj.empty())
return {};
430 _INNER append_ascii_chars(result, obj.data(), obj.size());
// char8_t view -> `wstring`; an empty view gives an empty result.
434 static MSTL_CONSTEXPR20 wstring to_wstring(
const basic_string_view<value_type>& obj) {
435 if (obj.empty())
return {};
437 const size_t len = obj.size();
// View the char8_t units as unsigned bytes for the decoder.
442 const auto data =
reinterpret_cast<const byte_t*
>(obj.data());
// Decoded code points are appended as wchar_t; failures become U+FFFD.
444 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
445 _INNER codepoint_to_wchar(result, cp);
447 result.push_back(0xFFFD);
// char8_t view -> `u8string`: a plain copy of the view.
453 static MSTL_CONSTEXPR20 u8string to_u8string(
const basic_string_view<value_type>& obj) {
454 return u8string{obj};
// char8_t view -> `u16string`; an empty view gives an empty result.
457 static MSTL_CONSTEXPR20 u16string to_u16string(
const basic_string_view<value_type>& obj) {
458 if (obj.empty())
return {};
460 const size_t len = obj.size();
465 const auto data =
reinterpret_cast<const byte_t*
>(obj.data());
// Decoded code points are appended as UTF-16; failures become U+FFFD.
467 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
468 _INNER codepoint_to_utf16(result, cp);
470 result.push_back(0xFFFD);
// char8_t view -> `u32string`; an empty view gives an empty result.
476 static MSTL_CONSTEXPR20 u32string to_u32string(
const basic_string_view<value_type>& obj) {
477 if (obj.empty())
return {};
479 const size_t len = obj.size();
484 const auto data =
reinterpret_cast<const byte_t*
>(obj.data());
// Each decoded code point is one char32_t; failures become U+FFFD.
486 if (
_INNER decode_utf8_char(
data, i, len, cp)) {
487 result.push_back(
static_cast<char32_t>(cp));
489 result.push_back(0xFFFD);
498 using type = u8character;
502 using type = char8_t;
// Wrapper type for `char16_t`, derived from icharacter<u16character, char16_t>.
// Its static to_* functions convert a char16_t string view to the other
// string types. Input is read as UTF-16 through get_utf16_codepoint, with
// U+FFFD substituted on failure. A leading 0xFEFF unit gets special handling.
// NOTE(review): the body of each 0xFEFF branch is not visible here -
// presumably it skips the byte-order mark; confirm.
508struct u16character : icharacter<u16character, char16_t> {
509 using value_type = char16_t;
510 using base = icharacter<u16character, char16_t>;
// Constructors, destructor and assignment operators come from the macro.
512 MSTL_BUILD_PACKAGE_CONSTRUCTOR(u16character)
// char16_t view -> UTF-8 `string`; an empty view gives an empty result.
514 static MSTL_CONSTEXPR20
string to_string(
const basic_string_view<value_type>& obj) {
515 if (obj.empty())
return {};
// Index at which conversion starts.
518 size_t start_pos = 0;
// NOTE(review): `!obj.empty()` is always true here because of the early
// return above.
519 if (!obj.empty() && obj[0] == 0xFEFF) {
// The loop header has no increment.
// NOTE(review): `i` is presumably advanced by chars_consumed in the body - confirm.
523 for (
size_t i = start_pos; i < obj.size(); ) {
525 size_t chars_consumed;
527 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
529 _INNER append_utf8_char(result, cp);
// Two separate fallback paths both emit U+FFFD.
// NOTE(review): the conditions selecting each path are not visible here.
531 _INNER append_utf8_char(result, 0xFFFD);
535 _INNER append_utf8_char(result, 0xFFFD);
// char16_t view -> `wstring`; an empty view gives an empty result.
542 static MSTL_CONSTEXPR20 wstring to_wstring(
const basic_string_view<value_type>& obj) {
543 if (obj.empty())
return {};
545 result.reserve(obj.size());
547 for (
size_t i = 0; i < obj.size(); ) {
// Special case for 0xFEFF in the first position only.
548 if (i == 0 && obj[i] == 0xFEFF) {
554 size_t chars_consumed;
// Valid code points are appended as wchar_t; failures become U+FFFD.
555 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
556 _INNER codepoint_to_wchar(result, cp);
559 result.push_back(0xFFFD);
// to_u8string is only available when MSTL_STANDARD_20__ is defined.
566#ifdef MSTL_STANDARD_20__
// char16_t view -> `u8string`; an empty view gives an empty result.
567 static MSTL_CONSTEXPR20 u8string to_u8string(
const basic_string_view<value_type>& obj) {
568 if (obj.empty())
return {};
// Reserves three output units per input unit.
570 result.reserve(obj.size() * 3);
572 for (
size_t i = 0; i < obj.size(); ) {
// Special case for 0xFEFF in the first position only.
573 if (i == 0 && obj[i] == 0xFEFF) {
579 size_t chars_consumed;
// Valid code points are re-encoded as UTF-8; failures become U+FFFD.
580 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
581 _INNER append_utf8_char(result, cp);
584 _INNER append_utf8_char(result, 0xFFFD);
// char16_t view -> `u16string`: a plain copy of the view.
592 static MSTL_CONSTEXPR20 u16string to_u16string(
const basic_string_view<value_type>& obj) {
593 return u16string{obj};
// char16_t view -> `u32string`; an empty view gives an empty result.
596 static MSTL_CONSTEXPR20 u32string to_u32string(
const basic_string_view<value_type>& obj) {
597 if (obj.empty())
return {};
599 result.reserve(obj.size());
601 for (
size_t i = 0; i < obj.size(); ) {
// Special case for 0xFEFF in the first position only.
602 if (i == 0 && obj[i] == 0xFEFF) {
608 size_t chars_consumed;
// Each valid code point is one char32_t; failures become U+FFFD.
609 if (
_INNER get_utf16_codepoint(obj.data(), i, obj.size(), cp, chars_consumed)) {
610 result.push_back(
static_cast<char32_t>(cp));
613 result.push_back(0xFFFD);
623 using type = u16character;
627 using type = char16_t;
// Wrapper type for `char32_t`, derived from icharacter<u32character, char32_t>.
// Its static to_* functions convert a char32_t string view to the other
// string types. Each input element is handed to the matching encoder as one
// code point.
631struct u32character : icharacter<u32character, char32_t> {
632 using value_type = char32_t;
633 using base = icharacter<u32character, char32_t>;
// Constructors, destructor and assignment operators come from the macro.
635 MSTL_BUILD_PACKAGE_CONSTRUCTOR(u32character)
// char32_t view -> UTF-8 `string`; an empty view gives an empty result.
637 static MSTL_CONSTEXPR20
string to_string(
const basic_string_view<value_type>& obj) {
638 if (obj.empty())
return {};
640 for (
const value_type i : obj) {
641 _INNER append_utf8_char(result, i);
// char32_t view -> `wstring`; an empty view gives an empty result.
646 static MSTL_CONSTEXPR20 wstring to_wstring(
const basic_string_view<value_type>& obj) {
647 if (obj.empty())
return {};
// Reserves one output unit per input element.
649 result.reserve(obj.size());
650 for (
const value_type i : obj) {
651 _INNER codepoint_to_wchar(result, i);
// to_u8string is only available when MSTL_STANDARD_20__ is defined.
656#ifdef MSTL_STANDARD_20__
// char32_t view -> `u8string`; an empty view gives an empty result.
657 static MSTL_CONSTEXPR20 u8string to_u8string(
const basic_string_view<value_type>& obj) {
658 if (obj.empty())
return {};
// Reserves four output units per input element.
660 result.reserve(obj.size() * 4);
661 for (
const value_type i : obj) {
662 _INNER append_utf8_char(result, i);
// char32_t view -> `u16string`; an empty view gives an empty result.
668 static MSTL_CONSTEXPR20 u16string to_u16string(
const basic_string_view<value_type>& obj) {
669 if (obj.empty())
return {};
// Reserves two output units per input element.
671 result.reserve(obj.size() * 2);
672 for (
const value_type i : obj) {
673 _INNER codepoint_to_utf16(result, i);
// char32_t view -> `u32string`: a plain copy of the view.
678 static MSTL_CONSTEXPR20 u32string to_u32string(
const basic_string_view<value_type>& obj) {
679 return u32string{obj};
685 using type = u32character;
689 using type = char32_t;
unsigned char byte_t
字节类型,定义为无符号字符
unsigned int uint32_t
32位无符号整数类型
unsigned short uint16_t
16位无符号整数类型
#define _MSTL
全局命名空间MSTL前缀
#define MSTL_END_INNER__
结束inner命名空间
#define _INNER
inner命名空间前缀
#define MSTL_END_NAMESPACE__
结束全局命名空间MSTL
#define MSTL_BEGIN_NAMESPACE__
开始全局命名空间MSTL
#define MSTL_BEGIN_INNER__
开始inner命名空间
MSTL_NODISCARD MSTL_ALWAYS_INLINE constexpr decltype(auto) data(Container &cont) noexcept(noexcept(cont.data()))
获取容器的底层数据指针
MSTL_CONST_FUNCTION constexpr bool is_high_surrogate(const char16_t c) noexcept
检查字符是否为高代理项
MSTL_CONST_FUNCTION constexpr bool is_low_surrogate(const char16_t c) noexcept
检查字符是否为低代理项
MSTL_CONST_FUNCTION constexpr uint32_t combine_surrogates(const char16_t high, const char16_t low) noexcept
组合高代理项和低代理项为完整的Unicode码点