// Locale support (codecvt) -*- C++ -*-
// Copyright (C) 2015-2020 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
#include
#include // std::memcpy, std::memcmp
#include // std::min
namespace std _GLIBCXX_VISIBILITY(default)
{
_GLIBCXX_BEGIN_NAMESPACE_VERSION
// The standard doesn't define these operators, which is annoying.
// Convert a codecvt_mode value to an integer so that the bitwise operators
// defined below can combine and mask mode flags.
// NOTE(review): the template arguments of underlying_type and static_cast
// appear to have been stripped from this listing (presumably
// underlying_type<codecvt_mode>::type) -- confirm against the upstream file.
static underlying_type::type
to_integer(codecvt_mode m)
{ return static_cast::type>(m); }
// Keep in m only the flags that are also set in n; yields m.
static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
{
  const auto masked = to_integer(m) & to_integer(n);
  m = codecvt_mode(masked);
  return m;
}
// Add to m every flag that is set in n; yields m.
static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
{
  const auto merged = to_integer(m) | to_integer(n);
  m = codecvt_mode(merged);
  return m;
}
// Bitwise complement of a mode value (used to build "clear this flag" masks).
static codecvt_mode operator~(codecvt_mode m)
{
  const auto inverted = ~to_integer(m);
  return codecvt_mode(inverted);
}
namespace
{
// Largest code point that fits in a single UTF-16 code unit.
const char32_t max_single_utf16_unit = 0xFFFF;

// Largest code point the conversions accept by default.
const char32_t max_code_point = 0x10FFFF;

// Sentinel results of the read_*_code_point functions.
// The functions below rely on maxcode < incomplete_mb_character
// (which is enforced by the codecvt_utf* classes on construction), so
// that both sentinels compare greater than any permitted code point.
const char32_t incomplete_mb_character = static_cast<char32_t>(-2);
const char32_t invalid_mb_sequence = static_cast<char32_t>(-1);
// A [next, end) pair of pointers to code units of type Elem, with small
// helpers for reading, writing and advancing through the sequence.
// The member order matters: objects are brace-initialised as { begin, end }.
template
struct range
{
  Elem* next;
  Elem* end;

  // Store one code unit at the current position and step past it.
  range& operator=(Elem e)
  {
    *next = e;
    ++next;
    return *this;
  }

  // The code unit at the current position.
  Elem operator*() const { return next[0]; }

  // The code unit n positions after the current one.
  Elem operator[](size_t n) const { return *(next + n); }

  // Step forward by one code unit.
  range& operator++()
  {
    next += 1;
    return *this;
  }

  // Step forward by n code units.
  range& operator+=(size_t n)
  {
    next = next + n;
    return *this;
  }

  // How many code units are left before end.
  size_t size() const { return static_cast<size_t>(end - next); }

  // How many bytes are left before end.
  size_t nbytes() const
  {
    return reinterpret_cast<const char*>(end)
      - reinterpret_cast<const char*>(next);
  }
};
// This specialization is used when accessing char16_t values through
// pointers to char, which might not be correctly aligned for char16_t.
// The positions are kept as byte pointers and every code unit is moved
// with memcpy, so no Elem-typed pointer is ever dereferenced.
template
struct range
{
  using value_type = typename remove_const::type;
  using char_pointer = typename
    conditional::value, const char*, char*>::type;

  char_pointer next;
  char_pointer end;

  // Store one code unit at the current position and step past it.
  range& operator=(Elem e)
  {
    memcpy(next, &e, sizeof(Elem));
    return ++*this;
  }

  // The code unit at the current position.
  Elem operator*() const { return (*this)[0]; }

  // The code unit n positions after the current one.
  Elem operator[](size_t n) const
  {
    value_type unit;
    memcpy(&unit, next + n * sizeof(Elem), sizeof(Elem));
    return unit;
  }

  // Step forward by one code unit (sizeof(Elem) bytes).
  range& operator++() { return *this += 1; }

  // Step forward by n code units.
  range& operator+=(size_t n)
  {
    next += n * sizeof(Elem);
    return *this;
  }

  // How many whole code units are left before end.
  size_t size() const { return nbytes() / sizeof(Elem); }

  // How many bytes are left before end.
  size_t nbytes() const { return end - next; }
};
// Byte Order Marks that may appear as a "header" at the start of a
// multibyte sequence: UTF-8, UTF-16 big-endian and UTF-16 little-endian.
const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF };
const unsigned char utf16_bom[] = { 0xFE, 0xFF };
const unsigned char utf16le_bom[] = { 0xFF, 0xFE };
// Copy the N-byte BOM to the output and advance past it.
// Returns false, writing nothing, when fewer than N bytes remain.
template
bool
write_bom(range& to, const unsigned char (&bom)[N])
{
  // The BOM must cover a whole, non-zero number of code units.
  static_assert( (N / sizeof(C)) != 0, "" );
  static_assert( (N % sizeof(C)) == 0, "" );
  const bool fits = !(to.nbytes() < N);
  if (fits)
    {
      memcpy(to.next, bom, N);
      to += (N / sizeof(C));
    }
  return fits;
}
// If the input starts with the given N-byte BOM, skip it and return true;
// otherwise leave the input untouched and return false.
template
bool
read_bom(range& from, const unsigned char (&bom)[N])
{
  // The BOM must cover a whole, non-zero number of code units.
  static_assert( (N / sizeof(C)) != 0, "" );
  static_assert( (N % sizeof(C)) == 0, "" );
  if (from.nbytes() < N || memcmp(from.next, bom, N) != 0)
    return false;
  from += (N / sizeof(C));
  return true;
}
// Emit the UTF-8 BOM when generate_header is set in mode.
// Returns false only if the BOM was requested but did not fit.
template
bool
write_utf8_bom(range& to, codecvt_mode mode)
{
  if (!(mode & generate_header))
    return true;
  return write_bom(to, utf8_bom);
}
// Emit a UTF-16 BOM when generate_header is set in mode; little_endian in
// mode selects the little-endian BOM, otherwise the big-endian one is used.
// Returns false only if the BOM was requested but did not fit.
template
bool
write_utf16_bom(range& to, codecvt_mode mode)
{
  if (!(mode & generate_header))
    return true;
  if (mode & little_endian)
    return write_bom(to, utf16le_bom);
  return write_bom(to, utf16_bom);
}
// When consume_header is set in mode, skip a leading UTF-8 BOM if present.
template
void
read_utf8_bom(range& from, codecvt_mode mode)
{
  if (!(mode & consume_header))
    return;
  read_bom(from, utf8_bom);
}
// Does nothing unless consume_header is set in mode.
// Otherwise a leading UTF-16 BOM, if present, is skipped and mode is
// updated to match it:
// - a big-endian BOM clears little_endian in mode;
// - a little-endian BOM sets little_endian in mode.
// With no BOM present both from and mode are left unchanged.
template
void
read_utf16_bom(range& from, codecvt_mode& mode)
{
  if (!(mode & consume_header))
    return;
  if (read_bom(from, utf16_bom))
    mode &= ~little_endian;
  else if (read_bom(from, utf16le_bom))
    mode |= little_endian;
}
// Read a codepoint from a UTF-8 multibyte sequence.
// Updates from.next if the codepoint is not greater than maxcode.
// Returns invalid_mb_sequence, incomplete_mb_character or the code point.
//
// Each continuation byte is validated as soon as it is available, so a
// truncated sequence whose visible bytes are already ill-formed is reported
// as invalid_mb_sequence, not incomplete_mb_character.
// incomplete_mb_character is returned only while the bytes seen so far could
// still be the prefix of a well-formed sequence.
// Three-byte sequences that would decode to a surrogate (U+D800..U+DFFF)
// are rejected, as they are not well-formed UTF-8.
template
char32_t
read_utf8_code_point(range& from, unsigned long maxcode)
{
  const size_t avail = from.size();
  if (avail == 0)
    return incomplete_mb_character;
  unsigned char c1 = from[0];
  // https://en.wikipedia.org/wiki/UTF-8#Sample_code
  if (c1 < 0x80)
    {
      ++from;
      return c1;
    }
  else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
    return invalid_mb_sequence;
  else if (c1 < 0xE0) // 2-byte sequence
    {
      if (avail < 2)
        return incomplete_mb_character;
      unsigned char c2 = from[1];
      if ((c2 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      char32_t c = (c1 << 6) + c2 - 0x3080;
      if (c <= maxcode)
        from += 2;
      return c;
    }
  else if (c1 < 0xF0) // 3-byte sequence
    {
      if (avail < 2)
        return incomplete_mb_character;
      unsigned char c2 = from[1];
      if ((c2 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      if (c1 == 0xE0 && c2 < 0xA0) // overlong
        return invalid_mb_sequence;
      if (c1 == 0xED && c2 >= 0xA0) // would decode to a surrogate
        return invalid_mb_sequence;
      // Only now is a missing third byte a genuine "need more input".
      if (avail < 3)
        return incomplete_mb_character;
      unsigned char c3 = from[2];
      if ((c3 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
      if (c <= maxcode)
        from += 3;
      return c;
    }
  else if (c1 < 0xF5) // 4-byte sequence
    {
      if (avail < 2)
        return incomplete_mb_character;
      unsigned char c2 = from[1];
      if ((c2 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      if (c1 == 0xF0 && c2 < 0x90) // overlong
        return invalid_mb_sequence;
      if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
        return invalid_mb_sequence;
      if (avail < 3)
        return incomplete_mb_character;
      unsigned char c3 = from[2];
      if ((c3 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      if (avail < 4)
        return incomplete_mb_character;
      unsigned char c4 = from[3];
      if ((c4 & 0xC0) != 0x80)
        return invalid_mb_sequence;
      char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
      if (c <= maxcode)
        from += 4;
      return c;
    }
  else // > U+10FFFF
    return invalid_mb_sequence;
}
// Encode code_point as UTF-8 and append it to the output.
// Returns false, writing nothing, if the encoded form does not fit in the
// remaining space or if code_point is greater than 0x10FFFF.
template
bool
write_utf8_code_point(range& to, char32_t code_point)
{
  if (code_point < 0x80)
    {
      // One byte.
      if (to.size() < 1)
        return false;
      to = code_point;
      return true;
    }

  if (code_point <= 0x7FF)
    {
      // Two bytes.
      if (to.size() < 2)
        return false;
      to = (code_point >> 6) + 0xC0;
      to = (code_point & 0x3F) + 0x80;
      return true;
    }

  if (code_point <= 0xFFFF)
    {
      // Three bytes.
      if (to.size() < 3)
        return false;
      to = (code_point >> 12) + 0xE0;
      to = ((code_point >> 6) & 0x3F) + 0x80;
      to = (code_point & 0x3F) + 0x80;
      return true;
    }

  if (code_point <= 0x10FFFF)
    {
      // Four bytes.
      if (to.size() < 4)
        return false;
      to = (code_point >> 18) + 0xF0;
      to = ((code_point >> 12) & 0x3F) + 0x80;
      to = ((code_point >> 6) & 0x3F) + 0x80;
      to = (code_point & 0x3F) + 0x80;
      return true;
    }

  return false;
}
// Convert a UTF-16 code unit between the host's byte order and the byte
// order selected by (mode & little_endian).  The bytes are swapped exactly
// when the two orders differ, so the same call serves reading and writing.
inline char16_t
adjust_byte_order(char16_t c, codecvt_mode mode)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const bool swap_needed = (mode & little_endian) != 0;
#else
  const bool swap_needed = (mode & little_endian) == 0;
#endif
  if (swap_needed)
    return __builtin_bswap16(c);
  return c;
}
// True when c is a high (leading) surrogate, i.e. in [0xD800, 0xDBFF].
inline bool
is_high_surrogate(char32_t c)
{
  // All 0x400 values of the block share the same bits above the low ten.
  return (c >> 10) == (0xD800 >> 10);
}
// True when c is a low (trailing) surrogate, i.e. in [0xDC00, 0xDFFF].
inline bool
is_low_surrogate(char32_t c)
{
  // All 0x400 values of the block share the same bits above the low ten.
  return (c >> 10) == (0xDC00 >> 10);
}
// Combine a high and a low surrogate into the code point they encode.
// No validation is done here; callers check both halves first.
inline char32_t
surrogate_pair_to_code_point(char32_t high, char32_t low)
{
  // Ten payload bits come from each half, offset by the first
  // supplementary code point U+10000.
  const char32_t upper = (high - 0xD800) << 10;
  const char32_t lower = low - 0xDC00;
  return upper + lower + 0x10000;
}
// Decode one code point from a UTF-16 sequence whose byte order is given
// by (mode & little_endian).
// from.next is advanced only when the result is not greater than maxcode.
// Returns the code point, or incomplete_mb_character when the input ends
// before or inside a surrogate pair, or invalid_mb_sequence for an unpaired
// surrogate.
template
char32_t
read_utf16_code_point(range& from,
                      unsigned long maxcode, codecvt_mode mode)
{
  const size_t avail = from.size();
  if (avail == 0)
    return incomplete_mb_character;

  char32_t cp = adjust_byte_order(from[0], mode);

  // A trailing surrogate cannot start a character.
  if (is_low_surrogate(cp))
    return invalid_mb_sequence;

  int units = 1;
  if (is_high_surrogate(cp))
    {
      if (avail < 2)
        return incomplete_mb_character;
      const char16_t trail = adjust_byte_order(from[1], mode);
      if (!is_low_surrogate(trail))
        return invalid_mb_sequence;
      cp = surrogate_pair_to_code_point(cp, trail);
      units = 2;
    }

  if (cp <= maxcode)
    from += units;
  return cp;
}
// Encode codepoint as UTF-16 in the byte order selected by mode and append
// it to the output: one code unit up to max_single_utf16_unit, otherwise a
// surrogate pair.  Returns false, writing nothing, if there is not enough
// room.
template
bool
write_utf16_code_point(range& to, char32_t codepoint, codecvt_mode mode)
{
  static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");

  if (codepoint <= max_single_utf16_unit)
    {
      if (to.size() == 0)
        return false;
      to = adjust_byte_order(codepoint, mode);
      return true;
    }

  if (to.size() <= 1)
    return false;

  // Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
  const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
  const char16_t lead = LEAD_OFFSET + (codepoint >> 10);
  const char16_t trail = 0xDC00 + (codepoint & 0x3FF);
  to = adjust_byte_order(lead, mode);
  to = adjust_byte_order(trail, mode);
  return true;
}
// utf8 -> ucs4
// Decodes until the input or the output space runs out.  Returns partial
// for a truncated trailing character or when input is left over, error for
// an ill-formed sequence or a code point above maxcode, and ok otherwise.
template
codecvt_base::result
ucs4_in(range& from, range& to,
        unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
  read_utf8_bom(from, mode);
  while (from.size() != 0 && to.size() != 0)
    {
      const char32_t cp = read_utf8_code_point(from, maxcode);
      if (cp == incomplete_mb_character)
        return codecvt_base::partial;
      // Also catches invalid_mb_sequence, which is greater than maxcode.
      if (cp > maxcode)
        return codecvt_base::error;
      to = cp;
    }
  if (from.size())
    return codecvt_base::partial;
  return codecvt_base::ok;
}
// ucs4 -> utf8
// Writes an optional BOM, then encodes each input code point.  Returns
// partial when the output space is exhausted, error for a code point above
// maxcode, and ok once all input has been consumed.
template
codecvt_base::result
ucs4_out(range& from, range& to,
         unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
  if (!write_utf8_bom(to, mode))
    return codecvt_base::partial;
  // The input only advances after a character was written successfully.
  for (; from.size(); ++from)
    {
      const char32_t cp = from[0];
      if (cp > maxcode)
        return codecvt_base::error;
      if (!write_utf8_code_point(to, cp))
        return codecvt_base::partial;
    }
  return codecvt_base::ok;
}
// utf16 -> ucs4
// A BOM, when consumed, may change the byte order used for the rest of the
// input.  Returns partial for a truncated surrogate pair or when input is
// left over, error for an ill-formed sequence or a code point above
// maxcode, and ok otherwise.
codecvt_base::result
ucs4_in(range& from, range& to,
        unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
  read_utf16_bom(from, mode);
  while (from.size() != 0 && to.size() != 0)
    {
      const char32_t cp = read_utf16_code_point(from, maxcode, mode);
      if (cp == incomplete_mb_character)
        return codecvt_base::partial;
      // Also catches invalid_mb_sequence, which is greater than maxcode.
      if (cp > maxcode)
        return codecvt_base::error;
      to = cp;
    }
  if (from.size())
    return codecvt_base::partial;
  return codecvt_base::ok;
}
// ucs4 -> utf16
// Writes an optional BOM, then encodes each input code point in the byte
// order selected by mode.  Returns partial when the output space is
// exhausted, error for a code point above maxcode, and ok once all input
// has been consumed.
codecvt_base::result
ucs4_out(range& from, range& to,
         unsigned long maxcode = max_code_point, codecvt_mode mode = {})
{
  if (!write_utf16_bom(to, mode))
    return codecvt_base::partial;
  // The input only advances after a character was written successfully.
  for (; from.size(); ++from)
    {
      const char32_t cp = from[0];
      if (cp > maxcode)
        return codecvt_base::error;
      if (!write_utf16_code_point(to, cp, mode))
        return codecvt_base::partial;
    }
  return codecvt_base::ok;
}
// Selects between UTF-16 (surrogate pairs allowed) and UCS-2 (surrogate
// pairs disallowed) processing in utf16_in and utf16_out.
enum class surrogates
{
  allowed,
  disallowed
};
// utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
//
// Results:
// - partial: the input ends inside a multibyte character, the next
//   character does not fit in the output, or the output filled up while
//   input remained;
// - error: an ill-formed sequence, or a code point greater than maxcode;
// - ok: all input was converted.
//
// A truncated UTF-8 sequence is "need more input" whatever the target
// encoding, so it yields partial in UCS-2 mode too.  Characters that UCS-2
// cannot hold are rejected by the maxcode test, which the UCS-2 callers
// clamp to max_single_utf16_unit.
template
codecvt_base::result
utf16_in(range& from, range& to,
         unsigned long maxcode = max_code_point, codecvt_mode mode = {},
         surrogates s = surrogates::allowed)
{
  // Retained for interface compatibility; see the note above.
  (void) s;

  read_utf8_bom(from, mode);
  while (from.size() && to.size())
    {
      auto orig = from;
      const char32_t codepoint = read_utf8_code_point(from, maxcode);
      if (codepoint == incomplete_mb_character)
        return codecvt_base::partial;
      // Also catches invalid_mb_sequence, which is greater than maxcode.
      if (codepoint > maxcode)
        return codecvt_base::error;
      if (!write_utf16_code_point(to, codepoint, mode))
        {
          from = orig; // rewind to previous position
          return codecvt_base::partial;
        }
    }
  // Input left over here means the output filled up first.
  return from.size() ? codecvt_base::partial : codecvt_base::ok;
}
// utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
//
// Results:
// - partial: the BOM or the next character does not fit in the output, or
//   the input ends with a high surrogate whose partner has not arrived;
// - error: an unpaired surrogate, any high surrogate in UCS-2 mode, or a
//   code point greater than maxcode;
// - ok: all input was converted.
//
// from.next is left at the first unconverted code unit in every case.
template
codecvt_base::result
utf16_out(range& from, range& to,
          unsigned long maxcode = max_code_point, codecvt_mode mode = {},
          surrogates s = surrogates::allowed)
{
  if (!write_utf8_bom(to, mode))
    return codecvt_base::partial;
  while (from.size())
    {
      char32_t c = from[0];
      int inc = 1;
      if (is_high_surrogate(c))
        {
          if (s == surrogates::disallowed)
            return codecvt_base::error; // No surrogates in UCS-2
          // The high surrogate stays unconsumed, so this is "not all
          // source characters converted", not a completed conversion.
          if (from.size() < 2)
            return codecvt_base::partial;
          const char32_t c2 = from[1];
          if (is_low_surrogate(c2))
            {
              c = surrogate_pair_to_code_point(c, c2);
              inc = 2;
            }
          else
            return codecvt_base::error;
        }
      else if (is_low_surrogate(c))
        return codecvt_base::error;
      if (c > maxcode)
        return codecvt_base::error;
      if (!write_utf8_code_point(to, c))
        return codecvt_base::partial;
      from += inc;
    }
  return codecvt_base::ok;
}
// Return pos such that [begin,pos) is UTF-8 input that converts to a valid
// UTF-16 string of no more than max code units.
template
const C*
utf16_span(const C* begin, const C* end, size_t max,
           char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  range from{ begin, end };
  read_utf8_bom(from, mode);

  // Number of UTF-16 code units the characters accepted so far would need.
  size_t units = 0;
  while (units + 1 < max)
    {
      const char32_t cp = read_utf8_code_point(from, maxcode);
      // Stops on invalid or incomplete input as well as on large values.
      if (cp > maxcode)
        return from.next;
      // Characters beyond a single unit need a surrogate pair.
      units += (cp > max_single_utf16_unit) ? 2 : 1;
    }

  // take one more character if it fits in a single unit
  if (units + 1 == max)
    read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
  return from.next;
}
// utf8 -> ucs2
// Delegates to utf16_in with surrogates disallowed and maxcode clamped.
template
codecvt_base::result
ucs2_in(range& from, range& to,
        char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
  const char32_t bmp_limit = std::min(max_single_utf16_unit, maxcode);
  return utf16_in(from, to, bmp_limit, mode, surrogates::disallowed);
}
// ucs2 -> utf8
// Delegates to utf16_out with surrogates disallowed and maxcode clamped.
template
codecvt_base::result
ucs2_out(range& from, range& to,
         char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
  const char32_t bmp_limit = std::min(max_single_utf16_unit, maxcode);
  return utf16_out(from, to, bmp_limit, mode, surrogates::disallowed);
}
// ucs2 -> utf16
// Writes an optional BOM, then copies each UCS-2 character to the output
// in the byte order selected by mode.
//
// Results:
// - partial: the BOM does not fit, or the output filled up while input
//   remained;
// - error: a surrogate code unit (high or low) or a value above maxcode;
// - ok: all input was converted.
//
// UCS-2 has no surrogate pairs, so either kind of surrogate is rejected
// rather than written out unpaired.  This matches the UTF-8 output path,
// which also rejects both in UCS-2 mode.
codecvt_base::result
ucs2_out(range& from, range& to,
         char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  if (!write_utf16_bom(to, mode))
    return codecvt_base::partial;
  while (from.size() && to.size())
    {
      char16_t c = from[0];
      if (is_high_surrogate(c) || is_low_surrogate(c))
        return codecvt_base::error;
      if (c > maxcode)
        return codecvt_base::error;
      to = adjust_byte_order(c, mode);
      ++from;
    }
  return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
}
// utf16 -> ucs2
// Returns error for anything that is not a single-unit character no
// greater than maxcode, partial when the output fills up while input
// remains, and ok otherwise.
codecvt_base::result
ucs2_in(range& from, range& to,
        char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  read_utf16_bom(from, mode);
  // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
  maxcode = std::min(max_single_utf16_unit, maxcode);
  while (from.size() != 0 && to.size() != 0)
    {
      const char32_t cp = read_utf16_code_point(from, maxcode, mode);
      // UCS-2 only supports single units.
      if (cp == incomplete_mb_character)
        return codecvt_base::error;
      if (cp > maxcode)
        return codecvt_base::error;
      to = cp;
    }
  if (from.size() == 0)
    return codecvt_base::ok;
  return codecvt_base::partial;
}
// Advance through UTF-16 input for at most max characters, stopping early
// at the first read that does not yield a value within maxcode, and return
// the position reached.
const char16_t*
ucs2_span(range& from, size_t max,
          char32_t maxcode, codecvt_mode mode)
{
  read_utf16_bom(from, mode);
  // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
  maxcode = std::min(max_single_utf16_unit, maxcode);
  for (char32_t last = 0; max-- && last <= maxcode; )
    last = read_utf16_code_point(from, maxcode, mode);
  return reinterpret_cast(from.next);
}
// Advance through UTF-8 input for at most max characters, stopping early
// at the first read that does not yield a value within maxcode, and return
// the position reached.
template
const C*
ucs2_span(const C* begin, const C* end, size_t max,
          char32_t maxcode, codecvt_mode mode)
{
  range from{ begin, end };
  read_utf8_bom(from, mode);
  // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
  maxcode = std::min(max_single_utf16_unit, maxcode);
  for (char32_t last = 0; max-- && last <= maxcode; )
    last = read_utf8_code_point(from, maxcode);
  return from.next;
}
// Return pos such that [begin,pos) is UTF-8 input that converts to a valid
// UCS-4 string of no more than max characters.
template
const C*
ucs4_span(const C* begin, const C* end, size_t max,
          char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  range from{ begin, end };
  read_utf8_bom(from, mode);
  // Stop after max characters or at the first read that fails.
  for (char32_t last = 0; max-- && last <= maxcode; )
    last = read_utf8_code_point(from, maxcode);
  return from.next;
}
// Return pos such that the UTF-16 input up to pos converts to a valid
// UCS-4 string of no more than max characters.
const char16_t*
ucs4_span(range& from, size_t max,
          char32_t maxcode = max_code_point, codecvt_mode mode = {})
{
  read_utf16_bom(from, mode);
  // Stop after max characters or at the first read that fails.
  for (char32_t last = 0; max-- && last <= maxcode; )
    last = read_utf16_code_point(from, maxcode, mode);
  return reinterpret_cast(from.next);
}
}
// Define members of codecvt specialization.
// Converts from UTF-8 to UTF-16.
// NOTE(review): the template argument list after `codecvt` appears to have
// been stripped from this listing -- confirm which specialization this is
// against the upstream file.
// Definition of the facet's static locale::id member.
locale::id codecvt::id;
// Out-of-line definition of the (empty) destructor.
codecvt::~codecvt() { }
// Convert internal UTF-16 characters to external UTF-8 via utf16_out,
// reporting through __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_out(state_type&,
       const intern_type* __from,
       const intern_type* __from_end, const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = utf16_out(src, dst);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
codecvt::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to;
  // we don't use mbstate_t for the unicode facets
  return noconv;
}
// Convert external UTF-8 to internal UTF-16 via utf16_in, producing code
// units in the host's byte order, and report through __from_next and
// __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const codecvt_mode native_order = {};
#else
  const codecvt_mode native_order = little_endian;
#endif
  const auto status = utf16_in(src, dst, max_code_point, native_order);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
int
codecvt::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
codecvt::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// utf16_span accepts for a limit of __max.
int
codecvt::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop = utf16_span(__from, __end, __max);
  return stop - __from;
}
int
codecvt::do_max_length() const throw()
{
  // A single character (one or two UTF-16 code units) requires
  // up to four UTF-8 code units.
  const int utf8_units_per_char = 4;
  return utf8_units_per_char;
}
// Define members of codecvt specialization.
// Converts from UTF-8 to UTF-32 (aka UCS-4).
// NOTE(review): the template argument list after `codecvt` appears to have
// been stripped from this listing -- confirm which specialization this is
// against the upstream file.
// Definition of the facet's static locale::id member.
locale::id codecvt::id;
// Out-of-line definition of the (empty) destructor.
codecvt::~codecvt() { }
// Convert internal UCS-4 characters to external UTF-8 via ucs4_out,
// reporting through __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
       const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs4_out(src, dst);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
codecvt::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-8 to internal UCS-4 via ucs4_in, reporting through
// __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs4_in(src, dst);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
int
codecvt::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
codecvt::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// ucs4_span accepts for a limit of __max characters.
int
codecvt::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop = ucs4_span(__from, __end, __max);
  return stop - __from;
}
int
codecvt::do_max_length() const throw()
{
  // A single character (one UTF-32 code unit) requires
  // up to 4 UTF-8 code units.
  const int utf8_units_per_char = 4;
  return utf8_units_per_char;
}
#if defined(_GLIBCXX_USE_CHAR8_T)
// Define members of codecvt specialization.
// Converts from UTF-8 to UTF-16.
// This definition is only compiled when _GLIBCXX_USE_CHAR8_T is defined.
// NOTE(review): the template argument list after `codecvt` appears to have
// been stripped from this listing -- confirm which specialization this is
// against the upstream file.
// Definition of the facet's static locale::id member.
locale::id codecvt::id;
// Out-of-line definition of the (empty) destructor.
codecvt::~codecvt() { }
// Convert internal UTF-16 characters to external UTF-8 via utf16_out,
// reporting through __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_out(state_type&,
       const intern_type* __from,
       const intern_type* __from_end, const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range input{ __from, __from_end };
  range output{ __to, __to_end };
  const auto outcome = utf16_out(input, output);
  __from_next = input.next;
  __to_next = output.next;
  return outcome;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
codecvt::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to;
  // we don't use mbstate_t for the unicode facets
  return noconv;
}
// Convert external UTF-8 to internal UTF-16 via utf16_in, producing code
// units in the host's byte order, and report through __from_next and
// __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range input{ __from, __from_end };
  range output{ __to, __to_end };
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  const codecvt_mode native_order = {};
#else
  const codecvt_mode native_order = little_endian;
#endif
  const auto outcome = utf16_in(input, output, max_code_point, native_order);
  __from_next = input.next;
  __to_next = output.next;
  return outcome;
}
int
codecvt::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
codecvt::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// utf16_span accepts for a limit of __max.
int
codecvt::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop = utf16_span(__from, __end, __max);
  return stop - __from;
}
int
codecvt::do_max_length() const throw()
{
  // A single character (one or two UTF-16 code units) requires
  // up to four UTF-8 code units.
  const int utf8_units_per_char = 4;
  return utf8_units_per_char;
}
// Define members of codecvt specialization.
// Converts from UTF-8 to UTF-32 (aka UCS-4).
// This definition is only compiled when _GLIBCXX_USE_CHAR8_T is defined.
// NOTE(review): the template argument list after `codecvt` appears to have
// been stripped from this listing -- confirm which specialization this is
// against the upstream file.
// Definition of the facet's static locale::id member.
locale::id codecvt::id;
// Out-of-line definition of the (empty) destructor.
codecvt::~codecvt() { }
// Convert internal UCS-4 characters to external UTF-8 via ucs4_out,
// reporting through __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
       const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range input{ __from, __from_end };
  range output{ __to, __to_end };
  const auto outcome = ucs4_out(input, output);
  __from_next = input.next;
  __to_next = output.next;
  return outcome;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
codecvt::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-8 to internal UCS-4 via ucs4_in, reporting through
// __from_next and __to_next how far each range advanced.
codecvt_base::result
codecvt::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range input{ __from, __from_end };
  range output{ __to, __to_end };
  const auto outcome = ucs4_in(input, output);
  __from_next = input.next;
  __to_next = output.next;
  return outcome;
}
int
codecvt::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
codecvt::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// ucs4_span accepts for a limit of __max characters.
int
codecvt::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop = ucs4_span(__from, __end, __max);
  return stop - __from;
}
int
codecvt::do_max_length() const throw()
{
  // A single character (one UTF-32 code unit) requires
  // up to 4 UTF-8 code units.
  const int utf8_units_per_char = 4;
  return utf8_units_per_char;
}
#endif // _GLIBCXX_USE_CHAR8_T
// Define members of codecvt_utf8 base class implementation.
// Converts from UTF-8 to UCS-2.
// NOTE(review): the template argument list after `__codecvt_utf8_base`
// appears to have been stripped from this listing -- confirm which
// specialization this is against the upstream file.
// Out-of-line definition of the (empty) destructor.
__codecvt_utf8_base::~__codecvt_utf8_base() { }
// Convert internal UCS-2 characters to external UTF-8 via ucs2_out,
// honouring the facet's _M_maxcode and _M_mode, and report through
// __from_next and __to_next how far each range advanced.
codecvt_base::result
__codecvt_utf8_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
       const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs2_out(src, dst, _M_maxcode, _M_mode);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
__codecvt_utf8_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-8 to internal UCS-2 via ucs2_in.
// Only the header flags of _M_mode are kept; the byte order of the result
// always follows the host rather than _M_mode.
codecvt_base::result
__codecvt_utf8_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
  const codecvt_mode mode
    = codecvt_mode((_M_mode & (consume_header|generate_header))
                   | little_endian);
#else
  const codecvt_mode mode
    = codecvt_mode(_M_mode & (consume_header|generate_header));
#endif
  const auto status = ucs2_in(src, dst, _M_maxcode, mode);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
int
__codecvt_utf8_base::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
__codecvt_utf8_base::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// ucs2_span accepts for a limit of __max characters under the facet's
// _M_maxcode and _M_mode.
int
__codecvt_utf8_base::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop
    = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
  return stop - __from;
}
int
__codecvt_utf8_base::do_max_length() const throw()
{
  // A single UCS-2 character requires up to three UTF-8 code units.
  // (UCS-2 cannot represent characters that use four UTF-8 code units).
  const int per_char = 3;
  // With consume_header set, a BOM may precede the character.
  if (_M_mode & consume_header)
    return per_char + static_cast<int>(sizeof(utf8_bom));
  return per_char;
}
// Define members of codecvt_utf8 base class implementation.
// Converts from UTF-8 to UTF-32 (aka UCS-4).
// NOTE(review): the template argument list after `__codecvt_utf8_base`
// appears to have been stripped from this listing -- confirm which
// specialization this is against the upstream file.
// Out-of-line definition of the (empty) destructor.
__codecvt_utf8_base::~__codecvt_utf8_base() { }
// Convert internal UCS-4 characters to external UTF-8 via ucs4_out,
// honouring the facet's _M_maxcode and _M_mode, and report through
// __from_next and __to_next how far each range advanced.
codecvt_base::result
__codecvt_utf8_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
       const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs4_out(src, dst, _M_maxcode, _M_mode);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
__codecvt_utf8_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-8 to internal UCS-4 via ucs4_in, honouring the
// facet's _M_maxcode and _M_mode, and report through __from_next and
// __to_next how far each range advanced.
codecvt_base::result
__codecvt_utf8_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs4_in(src, dst, _M_maxcode, _M_mode);
  __from_next = src.next;
  __to_next = dst.next;
  return status;
}
int
__codecvt_utf8_base::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
__codecvt_utf8_base::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, that
// ucs4_span accepts for a limit of __max characters under the facet's
// _M_maxcode and _M_mode.
int
__codecvt_utf8_base::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop
    = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
  return stop - __from;
}
int
__codecvt_utf8_base::do_max_length() const throw()
{
  // A single UCS-4 character requires up to four UTF-8 code units.
  const int per_char = 4;
  // With consume_header set, a BOM may precede the character.
  if (_M_mode & consume_header)
    return per_char + static_cast<int>(sizeof(utf8_bom));
  return per_char;
}
#ifdef _GLIBCXX_USE_WCHAR_T
// Check that sizeof(wchar_t) agrees with the __SIZEOF_WCHAR_T__ macro used
// to choose between the char16_t and char32_t code paths below.
#if __SIZEOF_WCHAR_T__ == 2
static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
#elif __SIZEOF_WCHAR_T__ == 4
static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
#endif
// Define members of codecvt_utf8 base class implementation.
// Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// NOTE(review): the template argument list after `__codecvt_utf8_base`
// appears to have been stripped from this listing -- confirm which
// specialization this is against the upstream file.
// Out-of-line definition of the (empty) destructor.
__codecvt_utf8_base::~__codecvt_utf8_base() { }
// Convert internal wide characters to external UTF-8.
// The input pointers are reinterpreted so that the UCS-2 (2-byte wchar_t)
// or UCS-4 (4-byte wchar_t) helper can be reused; any other wchar_t size
// reports an error.
// NOTE(review): the target types of the reinterpret_casts and the template
// arguments of `range` appear to have been stripped from this listing.
codecvt_base::result
__codecvt_utf8_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
const intern_type*& __from_next,
extern_type* __to, extern_type* __to_end,
extern_type*& __to_next) const
{
range to{ __to, __to_end };
#if __SIZEOF_WCHAR_T__ == 2
// 2-byte wchar_t: treat the input as UCS-2.
range from{
reinterpret_cast(__from),
reinterpret_cast(__from_end)
};
auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
#elif __SIZEOF_WCHAR_T__ == 4
// 4-byte wchar_t: treat the input as UCS-4.
range from{
reinterpret_cast(__from),
reinterpret_cast(__from_end)
};
auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
#else
return codecvt_base::error;
#endif
// Report how far the input was consumed and the output was filled.
__from_next = reinterpret_cast(from.next);
__to_next = to.next;
return res;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
__codecvt_utf8_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-8 to internal wide characters.
// The output pointers are reinterpreted so that the UCS-2 (2-byte wchar_t)
// or UCS-4 (4-byte wchar_t) helper can be reused; any other wchar_t size
// reports an error.
//
// In the 2-byte case the header flags of _M_mode are passed on, so that a
// BOM is skipped when consume_header is set.  do_length and do_max_length
// for this facet already take consume_header into account.  The byte order
// of the result always follows the host rather than _M_mode.
// NOTE(review): the target types of the reinterpret_casts and the template
// arguments of `range` appear to have been stripped from this listing.
codecvt_base::result
__codecvt_utf8_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range from{ __from, __from_end };
#if __SIZEOF_WCHAR_T__ == 2
  range to{
    reinterpret_cast(__to),
    reinterpret_cast(__to_end)
  };
  // Keep the header flags but force native byte order for the output.
  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
  mode = codecvt_mode(mode | little_endian);
#endif
  auto res = ucs2_in(from, to, _M_maxcode, mode);
#elif __SIZEOF_WCHAR_T__ == 4
  range to{
    reinterpret_cast(__to),
    reinterpret_cast(__to_end)
  };
  auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
#else
  return codecvt_base::error;
#endif
  // Report how far the input was consumed and the output was filled.
  __from_next = from.next;
  __to_next = reinterpret_cast(to.next);
  return res;
}
int
__codecvt_utf8_base::do_encoding() const throw()
{
  // UTF-8 is not a fixed-width encoding
  return 0;
}
bool
__codecvt_utf8_base::do_always_noconv() const throw()
{
  // This facet never reports that conversion can be skipped.
  return false;
}
// Number of external (UTF-8) code units, starting at __from, accepted for
// a limit of __max characters; uses the UCS-2 or UCS-4 span helper to
// match the size of wchar_t, and yields 0 for any other wchar_t size.
int
__codecvt_utf8_base::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
#if __SIZEOF_WCHAR_T__ == 2
  const extern_type* const stop
    = ucs2_span(__from, __end, __max, _M_maxcode, _M_mode);
#elif __SIZEOF_WCHAR_T__ == 4
  const extern_type* const stop
    = ucs4_span(__from, __end, __max, _M_maxcode, _M_mode);
#else
  const extern_type* const stop = __from;
#endif
  return stop - __from;
}
int
__codecvt_utf8_base::do_max_length() const throw()
{
  // Three UTF-8 code units per character when wchar_t is two bytes,
  // otherwise four (see the other __codecvt_utf8_base::do_max_length()
  // definitions).
#if __SIZEOF_WCHAR_T__ == 2
  const int per_char = 3;
#else
  const int per_char = 4;
#endif
  // With consume_header set, a BOM may precede the character.
  if (_M_mode & consume_header)
    return per_char + static_cast<int>(sizeof(utf8_bom));
  return per_char;
}
#endif
// Define members of codecvt_utf16 base class implementation.
// Converts from UTF-16 to UCS-2.
// NOTE(review): the template argument list after `__codecvt_utf16_base`
// appears to have been stripped from this listing -- confirm which
// specialization this is against the upstream file.
// Out-of-line definition of the (empty) destructor.
__codecvt_utf16_base::~__codecvt_utf16_base() { }
// Convert internal UCS-2 characters to external UTF-16 via ucs2_out,
// honouring the facet's _M_maxcode and _M_mode, and report through
// __from_next and __to_next how far each range advanced.
codecvt_base::result
__codecvt_utf16_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
       const intern_type*& __from_next,
       extern_type* __to, extern_type* __to_end,
       extern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs2_out(src, dst, _M_maxcode, _M_mode);
  __from_next = src.next;
  __to_next = reinterpret_cast(dst.next);
  return status;
}
// Nothing is ever written here: the output position is handed back
// unchanged and noconv is returned.
codecvt_base::result
__codecvt_utf16_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  extern_type* const unchanged = __to;
  __to_next = unchanged;
  return noconv;
}
// Convert external UTF-16 to internal UCS-2 via ucs2_in, honouring the
// facet's _M_maxcode and _M_mode.
// A result of ok is downgraded to error when the helper stopped short of
// __from_end (presumably a trailing partial code unit -- TODO confirm).
codecvt_base::result
__codecvt_utf16_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
      const extern_type*& __from_next,
      intern_type* __to, intern_type* __to_end,
      intern_type*& __to_next) const
{
  range src{ __from, __from_end };
  range dst{ __to, __to_end };
  const auto status = ucs2_in(src, dst, _M_maxcode, _M_mode);
  __from_next = reinterpret_cast(src.next);
  __to_next = dst.next;
  if (status == codecvt_base::ok && __from_next != __from_end)
    return codecvt_base::error;
  return status;
}
// Returns 0 because UTF-16 is a variable-width encoding.
int
__codecvt_utf16_base::do_encoding() const throw()
{
  return 0;
}
// This facet never claims that conversion can be skipped.
bool
__codecvt_utf16_base::do_always_noconv() const throw()
{
  return false;
}
// Returns the distance in external chars from __from to the position that
// ucs2_span reports for the input [__from, __end), the limit __max and this
// facet's _M_maxcode and _M_mode.
int
__codecvt_utf16_base::
do_length(state_type&, const extern_type* __from,
const extern_type* __end, size_t __max) const
{
range from{ __from, __end };
const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
// Convert the char16_t position back so it can be subtracted from __from.
return reinterpret_cast(next) - __from;
}
// Upper bound on the external chars needed for one internal character.
int
__codecvt_utf16_base::do_max_length() const throw()
{
  // UCS-2 cannot hold characters that need more than one UTF-16 code unit,
  // so every character occupies exactly one unit, i.e. two chars.
  const int unit_chars = 2;
  // When the consume_header flag is set, allow for a byte order mark too.
  if (_M_mode & consume_header)
    return unit_chars + static_cast<int>(sizeof(utf16_bom));
  return unit_chars;
}
// Define members of codecvt_utf16 base class implementation.
// Converts from UTF-16 to UTF-32 (aka UCS-4).
// Out-of-line destructor definition; the body is empty.
__codecvt_utf16_base::~__codecvt_utf16_base()
{
}
// Encode the internal (UTF-32 / UCS-4) characters in [__from, __from_end)
// into the external buffer [__to, __to_end) by delegating to ucs4_out with
// this facet's _M_maxcode and _M_mode.  On return __from_next and __to_next
// hold the positions reached in the input and output, and the result of
// ucs4_out is returned unchanged.
codecvt_base::result
__codecvt_utf16_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
const intern_type*& __from_next,
extern_type* __to, extern_type* __to_end,
extern_type*& __to_next) const
{
// Wrap both buffers; ucs4_out advances the `next` members as it works.
range from{ __from, __from_end };
range to{ __to, __to_end };
auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
__from_next = from.next;
// Hand the output position back through the caller's extern_type pointer.
__to_next = reinterpret_cast(to.next);
return res;
}
// The state argument is ignored and nothing is ever written: the output
// position is left at __to and noconv is reported.
codecvt_base::result
__codecvt_utf16_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to;  // no output produced
  return codecvt_base::noconv;
}
// Decode the external bytes in [__from, __from_end) into internal
// (UTF-32 / UCS-4) characters stored in [__to, __to_end) by delegating to
// ucs4_in with this facet's _M_maxcode and _M_mode.  On return __from_next
// and __to_next hold the positions reached in the input and output.
codecvt_base::result
__codecvt_utf16_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
const extern_type*& __from_next,
intern_type* __to, intern_type* __to_end,
intern_type*& __to_next) const
{
range from{ __from, __from_end };
range to{ __to, __to_end };
auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
// Report the input position through the caller's extern_type pointer.
__from_next = reinterpret_cast(from.next);
__to_next = to.next;
// An "ok" result that leaves input unconsumed is downgraded to an error.
// NOTE(review): presumably this catches a trailing partial UTF-16 code
// unit -- confirm against ucs4_in.
if (res == codecvt_base::ok && __from_next != __from_end)
res = codecvt_base::error;
return res;
}
// Returns 0 because UTF-16 is a variable-width encoding.
int
__codecvt_utf16_base::do_encoding() const throw()
{
  return 0;
}
// This facet never claims that conversion can be skipped.
bool
__codecvt_utf16_base::do_always_noconv() const throw()
{
  return false;
}
// Returns the distance in external chars from __from to the position that
// ucs4_span reports for the input [__from, __end), the limit __max and this
// facet's _M_maxcode and _M_mode.
int
__codecvt_utf16_base::
do_length(state_type&, const extern_type* __from,
const extern_type* __end, size_t __max) const
{
range from{ __from, __end };
const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
// Convert the char16_t position back so it can be subtracted from __from.
return reinterpret_cast(next) - __from;
}
// Upper bound on the external chars needed for one internal character.
int
__codecvt_utf16_base::do_max_length() const throw()
{
  // One UCS-4 character takes one or two UTF-16 code units, so at most
  // four chars.
  const int pair_chars = 4;
  // When the consume_header flag is set, allow for a byte order mark too.
  if (_M_mode & consume_header)
    return pair_chars + static_cast<int>(sizeof(utf16_bom));
  return pair_chars;
}
#ifdef _GLIBCXX_USE_WCHAR_T
// Define members of codecvt_utf16 base class implementation.
// Converts from UTF-16 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
// Out-of-line destructor definition; the body is empty.
__codecvt_utf16_base::~__codecvt_utf16_base()
{
}
// Encode the wide characters in [__from, __from_end) into the external
// buffer [__to, __to_end).  The work is delegated to ucs2_out when wchar_t
// is 2 bytes wide and to ucs4_out when it is 4 bytes wide, using this
// facet's _M_maxcode and _M_mode.  On return __from_next and __to_next hold
// the positions reached in the input and output.
codecvt_base::result
__codecvt_utf16_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
const intern_type*& __from_next,
extern_type* __to, extern_type* __to_end,
extern_type*& __to_next) const
{
range to{ __to, __to_end };
#if __SIZEOF_WCHAR_T__ == 2
// 2-byte wchar_t: the input pointers are reinterpreted for ucs2_out.
range from{
reinterpret_cast(__from),
reinterpret_cast(__from_end),
};
auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
#elif __SIZEOF_WCHAR_T__ == 4
// 4-byte wchar_t: the input pointers are reinterpreted for ucs4_out.
range from{
reinterpret_cast(__from),
reinterpret_cast(__from_end),
};
auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
#else
// Any other wchar_t size is not handled.
// NOTE(review): `from` and `res` are not declared in this branch, so the
// statements after #endif would not compile here -- confirm that no
// supported target selects it.
return codecvt_base::error;
#endif
// Convert the positions back to the caller's pointer types.
__from_next = reinterpret_cast(from.next);
__to_next = reinterpret_cast(to.next);
return res;
}
// The state argument is ignored and nothing is ever written: the output
// position is left at __to and noconv is reported.
codecvt_base::result
__codecvt_utf16_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to;  // no output produced
  return codecvt_base::noconv;
}
// Decode the external bytes in [__from, __from_end) into wide characters
// stored in [__to, __to_end).  The work is delegated to ucs2_in when
// wchar_t is 2 bytes wide and to ucs4_in when it is 4 bytes wide, using
// this facet's _M_maxcode and _M_mode.  On return __from_next and __to_next
// hold the positions reached in the input and output.
codecvt_base::result
__codecvt_utf16_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
const extern_type*& __from_next,
intern_type* __to, intern_type* __to_end,
intern_type*& __to_next) const
{
range from{ __from, __from_end };
#if __SIZEOF_WCHAR_T__ == 2
// 2-byte wchar_t: the output pointers are reinterpreted for ucs2_in.
range to{
reinterpret_cast(__to),
reinterpret_cast(__to_end),
};
auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
#elif __SIZEOF_WCHAR_T__ == 4
// 4-byte wchar_t: the output pointers are reinterpreted for ucs4_in.
range to{
reinterpret_cast(__to),
reinterpret_cast(__to_end),
};
auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
#else
// Any other wchar_t size is not handled.
// NOTE(review): `to` and `res` are not declared in this branch, so the
// statements after #endif would not compile here -- confirm that no
// supported target selects it.
return codecvt_base::error;
#endif
// Convert the positions back to the caller's pointer types.
__from_next = reinterpret_cast(from.next);
__to_next = reinterpret_cast(to.next);
// An "ok" result that leaves input unconsumed is downgraded to an error.
// NOTE(review): presumably this catches a trailing partial UTF-16 code
// unit -- confirm against ucs2_in / ucs4_in.
if (res == codecvt_base::ok && __from_next != __from_end)
res = codecvt_base::error;
return res;
}
// Returns 0 because UTF-16 is a variable-width encoding.
int
__codecvt_utf16_base::do_encoding() const throw()
{
  return 0;
}
// This facet never claims that conversion can be skipped.
bool
__codecvt_utf16_base::do_always_noconv() const throw()
{
  return false;
}
// Returns the distance in external chars from __from to the position
// reported by ucs2_span (2-byte wchar_t) or ucs4_span (4-byte wchar_t) for
// the input [__from, __end), the limit __max and this facet's _M_maxcode
// and _M_mode.
int
__codecvt_utf16_base::
do_length(state_type&, const extern_type* __from,
const extern_type* __end, size_t __max) const
{
range from{ __from, __end };
#if __SIZEOF_WCHAR_T__ == 2
const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
#elif __SIZEOF_WCHAR_T__ == 4
const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
#endif
// NOTE(review): there is no #else branch, so `next` is undeclared when
// wchar_t is neither 2 nor 4 bytes wide -- confirm that no supported target
// has such a wchar_t.
// Convert the char16_t position back so it can be subtracted from __from.
return reinterpret_cast(next) - __from;
}
// Upper bound on the external chars needed for one wide character:
// 2 (a single UTF-16 code unit) when wchar_t is 2 bytes wide, otherwise
// 4 (up to two code units), plus the size of utf16_bom when the
// consume_header flag is set in _M_mode.
int
__codecvt_utf16_base::do_max_length() const throw()
{
#if __SIZEOF_WCHAR_T__ == 2
  const int per_char = 2;
#else
  const int per_char = 4;
#endif
  if (_M_mode & consume_header)
    return per_char + static_cast<int>(sizeof(utf16_bom));
  return per_char;
}
#endif
// Define members of codecvt_utf8_utf16 base class implementation.
// Converts from UTF-8 to UTF-16.
// Out-of-line destructor definition; the body is empty.
__codecvt_utf8_utf16_base::~__codecvt_utf8_utf16_base()
{
}
// Encode the internal UTF-16 code units in [__from, __from_end) into the
// external buffer [__to, __to_end) by delegating to utf16_out with this
// facet's _M_maxcode and _M_mode.  On return __from_next and __to_next hold
// the positions reached in the input and output, and the result of
// utf16_out is returned unchanged.
codecvt_base::result
__codecvt_utf8_utf16_base::
do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
const intern_type*& __from_next,
extern_type* __to, extern_type* __to_end,
extern_type*& __to_next) const
{
// Wrap both buffers; utf16_out advances the `next` members as it works.
range from{ __from, __from_end };
range to{ __to, __to_end };
auto res = utf16_out(from, to, _M_maxcode, _M_mode);
__from_next = from.next;
__to_next = to.next;
return res;
}
// The state argument is ignored and nothing is ever written: the output
// position is left at __to and noconv is reported.
codecvt_base::result
__codecvt_utf8_utf16_base::
do_unshift(state_type&, extern_type* __to, extern_type*,
           extern_type*& __to_next) const
{
  __to_next = __to;  // no output produced
  return codecvt_base::noconv;
}
// Decode the external bytes in [__from, __from_end) into internal UTF-16
// code units stored in [__to, __to_end) by delegating to utf16_in.  On
// return __from_next and __to_next hold the positions reached in the input
// and output, and the result of utf16_in is returned unchanged.
codecvt_base::result
__codecvt_utf8_utf16_base::
do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
const extern_type*& __from_next,
intern_type* __to, intern_type* __to_end,
intern_type*& __to_next) const
{
range from{ __from, __from_end };
range to{ __to, __to_end };
// Keep only the header-related flags of _M_mode; the caller's little_endian
// flag is dropped and replaced below by one chosen from the host byte order.
// NOTE(review): presumably because the UTF-16 output is held in memory in
// native byte order -- confirm against utf16_in.
codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
mode = codecvt_mode(mode | little_endian);
#endif
auto res = utf16_in(from, to, _M_maxcode, mode);
__from_next = from.next;
__to_next = to.next;
return res;
}
// Returns 0 because UTF-8 is a variable-width encoding.
int
__codecvt_utf8_utf16_base::do_encoding() const throw()
{
  return 0;
}
// This facet never claims that conversion can be skipped.
bool
__codecvt_utf8_utf16_base::do_always_noconv() const throw()
{
  return false;
}
// Returns the distance in external chars from __from to the position that
// utf16_span reports for the input [__from, __end), the limit __max and
// this facet's _M_maxcode and _M_mode.
int
__codecvt_utf8_utf16_base::
do_length(state_type&, const extern_type* __from,
          const extern_type* __end, size_t __max) const
{
  const extern_type* const stop
    = utf16_span(__from, __end, __max, _M_maxcode, _M_mode);
  return stop - __from;
}
// Upper bound on the external chars needed for one character.
int
__codecvt_utf8_utf16_base::do_max_length() const throw()
{
  // One character is one or two UTF-16 code units and needs at most
  // four UTF-8 code units.
  const int utf8_units = 4;
  // When the consume_header flag is set, allow for a byte order mark too.
  if (_M_mode & consume_header)
    return utf8_units + static_cast<int>(sizeof(utf8_bom));
  return utf8_units;
}
// Define members of codecvt_utf8_utf16 base class implementation.
// Converts from UTF-8 to UTF-16.
__codecvt_utf8_utf16_base