aboutsummaryrefslogtreecommitdiff
path: root/src/utf8/unchecked.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/utf8/unchecked.h')
-rw-r--r--src/utf8/unchecked.h93
1 files changed, 61 insertions, 32 deletions
diff --git a/src/utf8/unchecked.h b/src/utf8/unchecked.h
index cb24271..8fe83c9 100644
--- a/src/utf8/unchecked.h
+++ b/src/utf8/unchecked.h
@@ -32,29 +32,52 @@ DEALINGS IN THE SOFTWARE.
namespace utf8
{
- namespace unchecked
+ namespace unchecked
{
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
- if (cp < 0x80) // one octet
- *(result++) = static_cast<uint8_t>(cp);
- else if (cp < 0x800) { // two octets
- *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
- *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
- }
- else if (cp < 0x10000) { // three octets
- *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
- *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
- *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
- }
- else { // four octets
- *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
- *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
- *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
- *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+ return internal::append(cp, result);
+ }
+
+ template <typename octet_iterator, typename output_iterator>
+ output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+ {
+ while (start != end) {
+ octet_iterator sequence_start = start;
+ internal::utf_error err_code = utf8::internal::validate_next(start, end);
+ switch (err_code) {
+ case internal::UTF8_OK :
+ for (octet_iterator it = sequence_start; it != start; ++it)
+ *out++ = *it;
+ break;
+ case internal::NOT_ENOUGH_ROOM:
+ out = utf8::unchecked::append (replacement, out);
+ start = end;
+ break;
+ case internal::INVALID_LEAD:
+ out = utf8::unchecked::append (replacement, out);
+ ++start;
+ break;
+ case internal::INCOMPLETE_SEQUENCE:
+ case internal::OVERLONG_SEQUENCE:
+ case internal::INVALID_CODE_POINT:
+ out = utf8::unchecked::append (replacement, out);
+ ++start;
+ // just one replacement mark for the sequence
+ while (start != end && utf8::internal::is_trail(*start))
+ ++start;
+ break;
+ }
}
- return result;
+ return out;
+ }
+
+ template <typename octet_iterator, typename output_iterator>
+ inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+ {
+ static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
+ return utf8::unchecked::replace_invalid(start, end, out, replacement_marker);
}
template <typename octet_iterator>
@@ -85,13 +108,13 @@ namespace utf8
break;
}
++it;
- return cp;
+ return cp;
}
template <typename octet_iterator>
uint32_t peek_next(octet_iterator it)
{
- return utf8::unchecked::next(it);
+ return utf8::unchecked::next(it);
}
template <typename octet_iterator>
@@ -102,18 +125,19 @@ namespace utf8
return utf8::unchecked::next(temp);
}
- // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
- template <typename octet_iterator>
- inline uint32_t previous(octet_iterator& it)
- {
- return utf8::unchecked::prior(it);
- }
-
template <typename octet_iterator, typename distance_type>
void advance (octet_iterator& it, distance_type n)
{
- for (distance_type i = 0; i < n; ++i)
- utf8::unchecked::next(it);
+ const distance_type zero(0);
+ if (n < zero) {
+ // backward
+ for (distance_type i = n; i < zero; ++i)
+ utf8::unchecked::prior(it);
+ } else {
+ // forward
+ for (distance_type i = zero; i < n; ++i)
+ utf8::unchecked::next(it);
+ }
}
template <typename octet_iterator>
@@ -128,7 +152,7 @@ namespace utf8
template <typename u16bit_iterator, typename octet_iterator>
octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
- {
+ {
while (start != end) {
uint32_t cp = utf8::internal::mask16(*start++);
// Take care of surrogate pairs first
@@ -138,7 +162,7 @@ namespace utf8
}
result = utf8::unchecked::append(cp, result);
}
- return result;
+ return result;
}
template <typename u16bit_iterator, typename octet_iterator>
@@ -176,9 +200,14 @@ namespace utf8
// The iterator class
template <typename octet_iterator>
- class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
+ class iterator {
octet_iterator it;
public:
+ typedef uint32_t value_type;
+ typedef uint32_t* pointer;
+ typedef uint32_t& reference;
+ typedef std::ptrdiff_t difference_type;
+ typedef std::bidirectional_iterator_tag iterator_category;
iterator () {}
explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
// the default "big three" are OK