diff options
Diffstat (limited to 'src/utf8/unchecked.h')
-rw-r--r-- | src/utf8/unchecked.h | 93 |
1 files changed, 61 insertions, 32 deletions
diff --git a/src/utf8/unchecked.h b/src/utf8/unchecked.h index cb24271..8fe83c9 100644 --- a/src/utf8/unchecked.h +++ b/src/utf8/unchecked.h @@ -32,29 +32,52 @@ DEALINGS IN THE SOFTWARE. namespace utf8 { - namespace unchecked + namespace unchecked { template <typename octet_iterator> octet_iterator append(uint32_t cp, octet_iterator result) { - if (cp < 0x80) // one octet - *(result++) = static_cast<uint8_t>(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); - *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + return internal::append(cp, result); + } + + template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } } - return result; + return out; + } + + template <typename octet_iterator, typename output_iterator> + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); } template <typename octet_iterator> @@ -85,13 +108,13 @@ namespace utf8 break; } ++it; - return cp; + return cp; } template <typename octet_iterator> uint32_t peek_next(octet_iterator it) { - return utf8::unchecked::next(it); + return utf8::unchecked::next(it); } template <typename octet_iterator> @@ -102,18 +125,19 @@ namespace utf8 return utf8::unchecked::next(temp); } - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template <typename octet_iterator> - inline uint32_t previous(octet_iterator& it) - { - return utf8::unchecked::prior(it); - } - template <typename octet_iterator, typename distance_type> void advance (octet_iterator& it, distance_type n) { - for (distance_type i = 0; i < n; ++i) - utf8::unchecked::next(it); + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } } template <typename octet_iterator> @@ -128,7 +152,7 @@ namespace utf8 template <typename u16bit_iterator, typename octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { + { while (start != end) { uint32_t cp = utf8::internal::mask16(*start++); // Take care of surrogate pairs first @@ -138,7 +162,7 @@ namespace utf8 } result = utf8::unchecked::append(cp, result); } - return result; + return result; } template <typename u16bit_iterator, typename octet_iterator> @@ -176,9 +200,14 @@ namespace utf8 // The iterator class template <typename octet_iterator> - class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { + class iterator { octet_iterator it; public: + typedef uint32_t value_type; + typedef uint32_t* pointer; + typedef uint32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; iterator () {} explicit iterator (const octet_iterator& octet_it): it(octet_it) {} // the default "big three" are OK |