diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/utf8.h | 68 | ||||
-rw-r--r-- | src/utf8/checked.h | 86 | ||||
-rw-r--r-- | src/utf8/core.h | 92 | ||||
-rw-r--r-- | src/utf8/cpp11.h | 103 | ||||
-rw-r--r-- | src/utf8/cpp17.h | 103 | ||||
-rw-r--r-- | src/utf8/unchecked.h | 93 |
6 files changed, 415 insertions, 130 deletions
@@ -1,34 +1,34 @@ -// Copyright 2006 Nemanja Trifunovic
-
-/*
-Permission is hereby granted, free of charge, to any person or organization
-obtaining a copy of the software and accompanying documentation covered by
-this license (the "Software") to use, reproduce, display, distribute,
-execute, and transmit the Software, and to prepare derivative works of the
-Software, and to permit third-parties to whom the Software is furnished to
-do so, all subject to the following:
-
-The copyright notices in the Software and this entire statement, including
-the above license grant, this restriction and the following disclaimer,
-must be included in all copies of the Software, in whole or in part, and
-all derivative works of the Software, unless such copies or derivative
-works are solely in the form of machine-executable object code generated by
-a source language processor.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
-*/
-
-
-#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
-
-#include "utf8/checked.h"
-#include "utf8/unchecked.h"
-
-#endif // header guard
+// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/src/utf8/checked.h b/src/utf8/checked.h index 1331155..512dcc2 100644 --- a/src/utf8/checked.h +++ b/src/utf8/checked.h @@ -1,4 +1,4 @@ -// Copyright 2006 Nemanja Trifunovic +// Copyright 2006-2016 Nemanja Trifunovic /* Permission is hereby granted, free of charge, to any person or organization @@ -41,8 +41,8 @@ namespace utf8 class invalid_code_point : public exception { uint32_t cp; public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } + invalid_code_point(uint32_t codepoint) : cp(codepoint) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid code point"; } uint32_t code_point() const {return cp;} }; @@ -50,7 +50,8 @@ namespace utf8 uint8_t u8; public: invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } + invalid_utf8 (char c) : u8(static_cast<uint8_t>(c)) {} + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-8"; } uint8_t utf8_octet() const {return u8;} }; @@ -58,13 +59,13 @@ namespace utf8 uint16_t u16; public: invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Invalid UTF-16"; } uint16_t utf16_word() const {return u16;} }; class not_enough_room : public exception { public: - virtual const char* what() const throw() { return "Not enough space"; } + virtual const char* what() const UTF_CPP_NOEXCEPT UTF_CPP_OVERRIDE { return "Not enough space"; } }; /// The library API - functions intended to be called by the users @@ -75,24 +76,7 @@ namespace utf8 if (!utf8::internal::is_code_point_valid(cp)) throw invalid_code_point(cp); - if (cp < 0x80) // one octet - *(result++) = static_cast<uint8_t>(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); - *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - return result; + return internal::append(cp, result); } template <typename octet_iterator, typename output_iterator> @@ -107,7 +91,9 @@ namespace utf8 *out++ = *it; break; case internal::NOT_ENOUGH_ROOM: - throw not_enough_room(); + out = utf8::append (replacement, out); + start = end; + break; case internal::INVALID_LEAD: out = utf8::append (replacement, out); ++start; @@ -146,7 +132,7 @@ namespace utf8 case internal::INVALID_LEAD : case internal::INCOMPLETE_SEQUENCE : case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); + throw invalid_utf8(static_cast<uint8_t>(*it)); case internal::INVALID_CODE_POINT : throw invalid_code_point(cp); } @@ -174,23 +160,19 @@ namespace utf8 return utf8::peek_next(it, end); } - /// Deprecated in versions that include "prior" - template <typename octet_iterator> - uint32_t previous(octet_iterator& it, octet_iterator pass_start) - { - octet_iterator end = it; - while (utf8::internal::is_trail(*(--it))) - if (it == pass_start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - octet_iterator temp = it; - return utf8::next(temp, end); - } - template <typename octet_iterator, typename distance_type> void advance (octet_iterator& it, distance_type n, octet_iterator end) { - for (distance_type i = 0; i < n; ++i) - utf8::next(it, end); + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::prior(it, end); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::next(it, end); + } } template <typename octet_iterator> @@ -233,7 +215,7 @@ namespace utf8 template <typename u16bit_iterator, typename octet_iterator> u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) { - while (start != end) { + while (start < end) { uint32_t cp = utf8::next(start, end); if (cp > 0xffff) { //make a surrogate pair *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); @@ -257,7 +239,7 @@ namespace utf8 template <typename octet_iterator, typename u32bit_iterator> u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) { - while (start != end) + while (start < end) (*result++) = utf8::next(start, end); return result; @@ -265,16 +247,21 @@ namespace utf8 // The iterator class template <typename octet_iterator> - class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { + class iterator { octet_iterator it; octet_iterator range_start; octet_iterator range_end; public: + typedef uint32_t value_type; + typedef uint32_t* pointer; + typedef uint32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; iterator () {} explicit iterator (const octet_iterator& octet_it, - const octet_iterator& range_start, - const octet_iterator& range_end) : - it(octet_it), range_start(range_start), range_end(range_end) + const octet_iterator& rangestart, + const octet_iterator& rangeend) : + it(octet_it), range_start(rangestart), range_end(rangeend) { if (it < range_start || it > range_end) throw std::out_of_range("Invalid utf-8 iterator position"); @@ -322,6 +309,11 @@ namespace utf8 } // namespace utf8 -#endif //header guard +#if UTF_CPP_CPLUSPLUS >= 201703L // C++ 17 or later +#include "cpp17.h" +#elif UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later +#include "cpp11.h" +#endif // C++ 11 or later +#endif //header guard diff --git a/src/utf8/core.h b/src/utf8/core.h index 693d388..34371ee 100644 --- a/src/utf8/core.h +++ b/src/utf8/core.h @@ -30,6 +30,23 @@ DEALINGS IN THE SOFTWARE. #include <iterator> +// Determine the C++ standard version. +// If the user defines UTF_CPP_CPLUSPLUS, use that. +// Otherwise, trust the unreliable predefined macro __cplusplus + +#if !defined UTF_CPP_CPLUSPLUS + #define UTF_CPP_CPLUSPLUS __cplusplus +#endif + +#if UTF_CPP_CPLUSPLUS >= 201103L // C++ 11 or later + #define UTF_CPP_OVERRIDE override + #define UTF_CPP_NOEXCEPT noexcept +#else // C++ 98/03 + #define UTF_CPP_OVERRIDE + #define UTF_CPP_NOEXCEPT throw() +#endif // C++ 11 or later + + namespace utf8 { // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers @@ -49,8 +66,8 @@ namespace internal const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + const uint16_t LEAD_OFFSET = 0xd7c0u; // LEAD_SURROGATE_MIN - (0x10000 >> 10) + const uint32_t SURROGATE_OFFSET = 0xfca02400u; // 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN // Maximum valid value for a Unicode code point const uint32_t CODE_POINT_MAX = 0x0010ffffu; @@ -142,7 +159,7 @@ namespace internal if (!utf8::internal::is_trail(*it)) return INCOMPLETE_SEQUENCE; - + return UTF8_OK; } @@ -165,7 +182,7 @@ namespace internal { if (it == end) return NOT_ENOUGH_ROOM; - + code_point = utf8::internal::mask8(*it); UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) @@ -222,6 +239,9 @@ namespace internal template <typename octet_iterator> utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) { + if (it == end) + return NOT_ENOUGH_ROOM; + // Save the original value of it so we can go back in case of failure // Of course, it does not make much sense with i.e. stream iterators octet_iterator original_it = it; @@ -234,7 +254,7 @@ namespace internal // Get trail octets and calculate the code point utf_error err = UTF8_OK; switch (length) { - case 0: + case 0: return INVALID_LEAD; case 1: err = utf8::internal::get_sequence_1(it, end, cp); @@ -277,6 +297,55 @@ namespace internal return utf8::internal::validate_next(it, end, ignored); } + // Internal implementation of both checked and unchecked append() function + // This function will be invoked by the overloads below, as they will know + // the octet_type. + template <typename octet_iterator, typename octet_type> + octet_iterator append(uint32_t cp, octet_iterator result) { + if (cp < 0x80) // one octet + *(result++) = static_cast<octet_type>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<octet_type>((cp >> 6) | 0xc0); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<octet_type>((cp >> 12) | 0xe0); + *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<octet_type>((cp >> 18) | 0xf0); + *(result++) = static_cast<octet_type>(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast<octet_type>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<octet_type>((cp & 0x3f) | 0x80); + } + return result; + } + + // One of the following overloads will be invoked from the API calls + + // A simple (but dangerous) case: the caller appends byte(s) to a char array + inline char* append(uint32_t cp, char* result) { + return append<char*, char>(cp, result); + } + + // Hopefully, most common case: the caller uses back_inserter + // i.e. append(cp, std::back_inserter(str)); + template<typename container_type> + std::back_insert_iterator<container_type> append + (uint32_t cp, std::back_insert_iterator<container_type> result) { + return append<std::back_insert_iterator<container_type>, + typename container_type::value_type>(cp, result); + } + + // The caller uses some other kind of output operator - not covered above + // Note that in this case we are not able to determine octet_type + // so we assume it's uint_8; that can cause a conversion warning if we are wrong. + template <typename octet_iterator> + octet_iterator append(uint32_t cp, octet_iterator result) { + return append<octet_iterator, uint8_t>(cp, result); + } + } // namespace internal /// The library API - functions intended to be called by the users @@ -310,18 +379,7 @@ namespace internal ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) ); - } - - //Deprecated in release 2.3 - template <typename octet_iterator> - inline bool is_bom (octet_iterator it) - { - return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] - ); - } + } } // namespace utf8 #endif // header guard diff --git a/src/utf8/cpp11.h b/src/utf8/cpp11.h new file mode 100644 index 0000000..2366f12 --- /dev/null +++ b/src/utf8/cpp11.h @@ -0,0 +1,103 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 +#define UTF8_FOR_CPP_a184c22c_d012_11e8_a8d5_f2801f1b9fd1 + +#include "checked.h" +#include <string> + +namespace utf8 +{ + + inline void append(char32_t cp, std::string& s) + { + append(uint32_t(cp), std::back_inserter(s)); + } + + inline std::string utf16to8(const std::u16string& s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(const std::string& s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(const std::u32string& s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(const std::string& s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(const std::string& s) + { + std::string::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string::npos : static_cast<std::size_t>(invalid - s.begin()); + } + + inline bool is_valid(const std::string& s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(const std::string& s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(const std::string& s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(const std::string& s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/src/utf8/cpp17.h b/src/utf8/cpp17.h new file mode 100644 index 0000000..32a77ce --- /dev/null +++ b/src/utf8/cpp17.h @@ -0,0 +1,103 @@ +// Copyright 2018 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 +#define UTF8_FOR_CPP_7e906c01_03a3_4daf_b420_ea7ea952b3c9 + +#include "checked.h" +#include <string> + +namespace utf8 +{ + + inline void append(char32_t cp, std::string& s) + { + append(uint32_t(cp), std::back_inserter(s)); + } + + inline std::string utf16to8(std::u16string_view s) + { + std::string result; + utf16to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u16string utf8to16(std::string_view s) + { + std::u16string result; + utf8to16(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::string utf32to8(std::u32string_view s) + { + std::string result; + utf32to8(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::u32string utf8to32(std::string_view s) + { + std::u32string result; + utf8to32(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline std::size_t find_invalid(std::string_view s) + { + std::string_view::const_iterator invalid = find_invalid(s.begin(), s.end()); + return (invalid == s.end()) ? std::string_view::npos : static_cast<std::size_t>(invalid - s.begin()); + } + + inline bool is_valid(std::string_view s) + { + return is_valid(s.begin(), s.end()); + } + + inline std::string replace_invalid(std::string_view s, char32_t replacement) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result), replacement); + return result; + } + + inline std::string replace_invalid(std::string_view s) + { + std::string result; + replace_invalid(s.begin(), s.end(), std::back_inserter(result)); + return result; + } + + inline bool starts_with_bom(std::string_view s) + { + return starts_with_bom(s.begin(), s.end()); + } + +} // namespace utf8 + +#endif // header guard + diff --git a/src/utf8/unchecked.h b/src/utf8/unchecked.h index cb24271..8fe83c9 100644 --- a/src/utf8/unchecked.h +++ b/src/utf8/unchecked.h @@ -32,29 +32,52 @@ DEALINGS IN THE SOFTWARE. namespace utf8 { - namespace unchecked + namespace unchecked { template <typename octet_iterator> octet_iterator append(uint32_t cp, octet_iterator result) { - if (cp < 0x80) // one octet - *(result++) = static_cast<uint8_t>(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); - *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + return internal::append(cp, result); + } + + template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + out = utf8::unchecked::append (replacement, out); + start = end; + break; + case internal::INVALID_LEAD: + out = utf8::unchecked::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::unchecked::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } } - return result; + return out; + } + + template <typename octet_iterator, typename output_iterator> + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::unchecked::replace_invalid(start, end, out, replacement_marker); } template <typename octet_iterator> @@ -85,13 +108,13 @@ namespace utf8 break; } ++it; - return cp; + return cp; } template <typename octet_iterator> uint32_t peek_next(octet_iterator it) { - return utf8::unchecked::next(it); + return utf8::unchecked::next(it); } template <typename octet_iterator> @@ -102,18 +125,19 @@ namespace utf8 return utf8::unchecked::next(temp); } - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template <typename octet_iterator> - inline uint32_t previous(octet_iterator& it) - { - return utf8::unchecked::prior(it); - } - template <typename octet_iterator, typename distance_type> void advance (octet_iterator& it, distance_type n) { - for (distance_type i = 0; i < n; ++i) - utf8::unchecked::next(it); + const distance_type zero(0); + if (n < zero) { + // backward + for (distance_type i = n; i < zero; ++i) + utf8::unchecked::prior(it); + } else { + // forward + for (distance_type i = zero; i < n; ++i) + utf8::unchecked::next(it); + } } template <typename octet_iterator> @@ -128,7 +152,7 @@ namespace utf8 template <typename u16bit_iterator, typename octet_iterator> octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { + { while (start != end) { uint32_t cp = utf8::internal::mask16(*start++); // Take care of surrogate pairs first @@ -138,7 +162,7 @@ namespace utf8 } result = utf8::unchecked::append(cp, result); } - return result; + return result; } template <typename u16bit_iterator, typename octet_iterator> @@ -176,9 +200,14 @@ namespace utf8 // The iterator class template <typename octet_iterator> - class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { + class iterator { octet_iterator it; public: + typedef uint32_t value_type; + typedef uint32_t* pointer; + typedef uint32_t& reference; + typedef std::ptrdiff_t difference_type; + typedef std::bidirectional_iterator_tag iterator_category; iterator () {} explicit iterator (const octet_iterator& octet_it): it(octet_it) {} // the default "big three" are OK |