1 /*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE unicode_iterator.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17 */
18
19 /****************************************************************************
20
21 Contents:
22 ~~~~~~~~~
23
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26
27 template <class BaseIterator, class U8Type = ::boost::uint8_t>
28 class u32_to_u8_iterator;
29
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31
32 template <class BaseIterator, class U32Type = ::boost::uint32_t>
33 class u8_to_u32_iterator;
34
Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32.
36
37 template <class BaseIterator, class U16Type = ::boost::uint16_t>
38 class u32_to_u16_iterator;
39
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41
42 template <class BaseIterator, class U32Type = ::boost::uint32_t>
43 class u16_to_u32_iterator;
44
Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32.
46
47 2) Single pass output iterator adapters:
48
49 template <class BaseIterator>
50 class utf8_output_iterator;
51
Accepts UTF-32 code points and forwards them on as UTF-8 code units.
53
54 template <class BaseIterator>
55 class utf16_output_iterator;
56
Accepts UTF-32 code points and forwards them on as UTF-16 code units.
58
59 ****************************************************************************/
60
61 #ifndef BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
63 #include <boost/cstdint.hpp>
64 #include <boost/regex/config.hpp>
65 #include <boost/iterator/iterator_facade.hpp>
66 #include <boost/static_assert.hpp>
67 #include <boost/throw_exception.hpp>
68 #include <stdexcept>
69 #ifndef BOOST_NO_STD_LOCALE
70 #include <sstream>
71 #include <ios>
72 #endif
73 #include <limits.h> // CHAR_BIT
74
75 #ifdef BOOST_REGEX_CXX03
76
77 #else
78 #endif
79
80 namespace boost{
81
82 namespace detail{
83
84 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
85 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
86 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
87
is_high_surrogate(::boost::uint16_t v)88 inline bool is_high_surrogate(::boost::uint16_t v)
89 {
90 return (v & 0xFFFFFC00u) == 0xd800u;
91 }
is_low_surrogate(::boost::uint16_t v)92 inline bool is_low_surrogate(::boost::uint16_t v)
93 {
94 return (v & 0xFFFFFC00u) == 0xdc00u;
95 }
// Returns true when v lies anywhere in the surrogate block 0xD800..0xDFFF,
// whether it is a high or a low surrogate.
template <class T>
inline bool is_surrogate(T v)
{
   // Dropping the low 11 bits maps every member of the block onto 0xD800.
   const bool in_surrogate_block = ((v & 0xFFFFF800u) == 0xD800u);
   return in_surrogate_block;
}
101
utf8_byte_count(boost::uint8_t c)102 inline unsigned utf8_byte_count(boost::uint8_t c)
103 {
104 // if the most significant bit with a zero in it is in position
105 // 8-N then there are N bytes in this UTF-8 sequence:
106 boost::uint8_t mask = 0x80u;
107 unsigned result = 0;
108 while(c & mask)
109 {
110 ++result;
111 mask >>= 1;
112 }
113 return (result == 0) ? 1 : ((result > 4) ? 4 : result);
114 }
115
utf8_trailing_byte_count(boost::uint8_t c)116 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
117 {
118 return utf8_byte_count(c) - 1;
119 }
120
121 #ifdef BOOST_MSVC
122 #pragma warning(push)
123 #pragma warning(disable:4100)
124 #endif
125 #ifndef BOOST_NO_EXCEPTIONS
126 BOOST_NORETURN
127 #endif
invalid_utf32_code_point(::boost::uint32_t val)128 inline void invalid_utf32_code_point(::boost::uint32_t val)
129 {
130 #ifndef BOOST_NO_STD_LOCALE
131 std::stringstream ss;
132 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
133 std::out_of_range e(ss.str());
134 #else
135 std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
136 #endif
137 boost::throw_exception(e);
138 }
139 #ifdef BOOST_MSVC
140 #pragma warning(pop)
141 #endif
142
143
144 } // namespace detail
145
146 template <class BaseIterator, class U16Type = ::boost::uint16_t>
147 class u32_to_u16_iterator
148 : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
149 {
150 typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
151
152 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
153 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
154
155 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
156 BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
157 #endif
158
159 public:
160 typename base_type::reference
dereference() const161 dereference()const
162 {
163 if(m_current == 2)
164 extract_current();
165 return m_values[m_current];
166 }
equal(const u32_to_u16_iterator & that) const167 bool equal(const u32_to_u16_iterator& that)const
168 {
169 if(m_position == that.m_position)
170 {
171 // Both m_currents must be equal, or both even
172 // this is the same as saying their sum must be even:
173 return (m_current + that.m_current) & 1u ? false : true;
174 }
175 return false;
176 }
increment()177 void increment()
178 {
179 // if we have a pending read then read now, so that we know whether
180 // to skip a position, or move to a low-surrogate:
181 if(m_current == 2)
182 {
183 // pending read:
184 extract_current();
185 }
186 // move to the next surrogate position:
187 ++m_current;
188 // if we've reached the end skip a position:
189 if(m_values[m_current] == 0)
190 {
191 m_current = 2;
192 ++m_position;
193 }
194 }
decrement()195 void decrement()
196 {
197 if(m_current != 1)
198 {
199 // decrementing an iterator always leads to a valid position:
200 --m_position;
201 extract_current();
202 m_current = m_values[1] ? 1 : 0;
203 }
204 else
205 {
206 m_current = 0;
207 }
208 }
base() const209 BaseIterator base()const
210 {
211 return m_position;
212 }
213 // construct:
u32_to_u16_iterator()214 u32_to_u16_iterator() : m_position(), m_current(0)
215 {
216 m_values[0] = 0;
217 m_values[1] = 0;
218 m_values[2] = 0;
219 }
u32_to_u16_iterator(BaseIterator b)220 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
221 {
222 m_values[0] = 0;
223 m_values[1] = 0;
224 m_values[2] = 0;
225 }
226 private:
227
extract_current() const228 void extract_current()const
229 {
230 // begin by checking for a code point out of range:
231 ::boost::uint32_t v = *m_position;
232 if(v >= 0x10000u)
233 {
234 if(v > 0x10FFFFu)
235 detail::invalid_utf32_code_point(*m_position);
236 // split into two surrogates:
237 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
238 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
239 m_current = 0;
240 BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
241 BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
242 }
243 else
244 {
245 // 16-bit code point:
246 m_values[0] = static_cast<U16Type>(*m_position);
247 m_values[1] = 0;
248 m_current = 0;
249 // value must not be a surrogate:
250 if(detail::is_surrogate(m_values[0]))
251 detail::invalid_utf32_code_point(*m_position);
252 }
253 }
254 BaseIterator m_position;
255 mutable U16Type m_values[3];
256 mutable unsigned m_current;
257 };
258
259 template <class BaseIterator, class U32Type = ::boost::uint32_t>
260 class u16_to_u32_iterator
261 : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
262 {
263 typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
264 // special values for pending iterator reads:
265 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
266
267 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
268 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
269
270 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
271 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
272 #endif
273
274 public:
275 typename base_type::reference
dereference() const276 dereference()const
277 {
278 if(m_value == pending_read)
279 extract_current();
280 return m_value;
281 }
equal(const u16_to_u32_iterator & that) const282 bool equal(const u16_to_u32_iterator& that)const
283 {
284 return m_position == that.m_position;
285 }
increment()286 void increment()
287 {
288 // skip high surrogate first if there is one:
289 if(detail::is_high_surrogate(*m_position)) ++m_position;
290 ++m_position;
291 m_value = pending_read;
292 }
decrement()293 void decrement()
294 {
295 --m_position;
296 // if we have a low surrogate then go back one more:
297 if(detail::is_low_surrogate(*m_position))
298 --m_position;
299 m_value = pending_read;
300 }
base() const301 BaseIterator base()const
302 {
303 return m_position;
304 }
305 // construct:
u16_to_u32_iterator()306 u16_to_u32_iterator() : m_position()
307 {
308 m_value = pending_read;
309 }
u16_to_u32_iterator(BaseIterator b)310 u16_to_u32_iterator(BaseIterator b) : m_position(b)
311 {
312 m_value = pending_read;
313 }
314 //
315 // Range checked version:
316 //
u16_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)317 u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
318 {
319 m_value = pending_read;
320 //
321 // The range must not start with a low surrogate, or end in a high surrogate,
322 // otherwise we run the risk of running outside the underlying input range.
323 // Likewise b must not be located at a low surrogate.
324 //
325 boost::uint16_t val;
326 if(start != end)
327 {
328 if((b != start) && (b != end))
329 {
330 val = *b;
331 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
332 invalid_code_point(val);
333 }
334 val = *start;
335 if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
336 invalid_code_point(val);
337 val = *--end;
338 if(detail::is_high_surrogate(val))
339 invalid_code_point(val);
340 }
341 }
342 private:
invalid_code_point(::boost::uint16_t val)343 static void invalid_code_point(::boost::uint16_t val)
344 {
345 #ifndef BOOST_NO_STD_LOCALE
346 std::stringstream ss;
347 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
348 std::out_of_range e(ss.str());
349 #else
350 std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
351 #endif
352 boost::throw_exception(e);
353 }
extract_current() const354 void extract_current()const
355 {
356 m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
357 // if the last value is a high surrogate then adjust m_position and m_value as needed:
358 if(detail::is_high_surrogate(*m_position))
359 {
360 // precondition; next value must have be a low-surrogate:
361 BaseIterator next(m_position);
362 ::boost::uint16_t t = *++next;
363 if((t & 0xFC00u) != 0xDC00u)
364 invalid_code_point(t);
365 m_value = (m_value - detail::high_surrogate_base) << 10;
366 m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
367 }
368 // postcondition; result must not be a surrogate:
369 if(detail::is_surrogate(m_value))
370 invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
371 }
372 BaseIterator m_position;
373 mutable U32Type m_value;
374 };
375
376 template <class BaseIterator, class U8Type = ::boost::uint8_t>
377 class u32_to_u8_iterator
378 : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
379 {
380 typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
381
382 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
383 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
384
385 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
386 BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
387 #endif
388
389 public:
390 typename base_type::reference
dereference() const391 dereference()const
392 {
393 if(m_current == 4)
394 extract_current();
395 return m_values[m_current];
396 }
equal(const u32_to_u8_iterator & that) const397 bool equal(const u32_to_u8_iterator& that)const
398 {
399 if(m_position == that.m_position)
400 {
401 // either the m_current's must be equal, or one must be 0 and
402 // the other 4: which means neither must have bits 1 or 2 set:
403 return (m_current == that.m_current)
404 || (((m_current | that.m_current) & 3) == 0);
405 }
406 return false;
407 }
increment()408 void increment()
409 {
410 // if we have a pending read then read now, so that we know whether
411 // to skip a position, or move to a low-surrogate:
412 if(m_current == 4)
413 {
414 // pending read:
415 extract_current();
416 }
417 // move to the next surrogate position:
418 ++m_current;
419 // if we've reached the end skip a position:
420 if(m_values[m_current] == 0)
421 {
422 m_current = 4;
423 ++m_position;
424 }
425 }
decrement()426 void decrement()
427 {
428 if((m_current & 3) == 0)
429 {
430 --m_position;
431 extract_current();
432 m_current = 3;
433 while(m_current && (m_values[m_current] == 0))
434 --m_current;
435 }
436 else
437 --m_current;
438 }
base() const439 BaseIterator base()const
440 {
441 return m_position;
442 }
443 // construct:
u32_to_u8_iterator()444 u32_to_u8_iterator() : m_position(), m_current(0)
445 {
446 m_values[0] = 0;
447 m_values[1] = 0;
448 m_values[2] = 0;
449 m_values[3] = 0;
450 m_values[4] = 0;
451 }
u32_to_u8_iterator(BaseIterator b)452 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
453 {
454 m_values[0] = 0;
455 m_values[1] = 0;
456 m_values[2] = 0;
457 m_values[3] = 0;
458 m_values[4] = 0;
459 }
460 private:
461
extract_current() const462 void extract_current()const
463 {
464 boost::uint32_t c = *m_position;
465 if(c > 0x10FFFFu)
466 detail::invalid_utf32_code_point(c);
467 if(c < 0x80u)
468 {
469 m_values[0] = static_cast<unsigned char>(c);
470 m_values[1] = static_cast<unsigned char>(0u);
471 m_values[2] = static_cast<unsigned char>(0u);
472 m_values[3] = static_cast<unsigned char>(0u);
473 }
474 else if(c < 0x800u)
475 {
476 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
477 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
478 m_values[2] = static_cast<unsigned char>(0u);
479 m_values[3] = static_cast<unsigned char>(0u);
480 }
481 else if(c < 0x10000u)
482 {
483 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
484 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
485 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
486 m_values[3] = static_cast<unsigned char>(0u);
487 }
488 else
489 {
490 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
491 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
492 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
493 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
494 }
495 m_current= 0;
496 }
497 BaseIterator m_position;
498 mutable U8Type m_values[5];
499 mutable unsigned m_current;
500 };
501
502 template <class BaseIterator, class U32Type = ::boost::uint32_t>
503 class u8_to_u32_iterator
504 : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
505 {
506 typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
507 // special values for pending iterator reads:
508 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
509
510 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS)
511 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
512
513 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
514 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
515 #endif
516
517 public:
518 typename base_type::reference
dereference() const519 dereference()const
520 {
521 if(m_value == pending_read)
522 extract_current();
523 return m_value;
524 }
equal(const u8_to_u32_iterator & that) const525 bool equal(const u8_to_u32_iterator& that)const
526 {
527 return m_position == that.m_position;
528 }
increment()529 void increment()
530 {
531 // We must not start with a continuation character:
532 if((static_cast<boost::uint8_t>(*m_position) & 0xC0) == 0x80)
533 invalid_sequence();
534 // skip high surrogate first if there is one:
535 unsigned c = detail::utf8_byte_count(*m_position);
536 if(m_value == pending_read)
537 {
538 // Since we haven't read in a value, we need to validate the code points:
539 for(unsigned i = 0; i < c; ++i)
540 {
541 ++m_position;
542 // We must have a continuation byte:
543 if((i != c - 1) && ((static_cast<boost::uint8_t>(*m_position) & 0xC0) != 0x80))
544 invalid_sequence();
545 }
546 }
547 else
548 {
549 std::advance(m_position, c);
550 }
551 m_value = pending_read;
552 }
decrement()553 void decrement()
554 {
555 // Keep backtracking until we don't have a trailing character:
556 unsigned count = 0;
557 while((*--m_position & 0xC0u) == 0x80u) ++count;
558 // now check that the sequence was valid:
559 if(count != detail::utf8_trailing_byte_count(*m_position))
560 invalid_sequence();
561 m_value = pending_read;
562 }
base() const563 BaseIterator base()const
564 {
565 return m_position;
566 }
567 // construct:
u8_to_u32_iterator()568 u8_to_u32_iterator() : m_position()
569 {
570 m_value = pending_read;
571 }
u8_to_u32_iterator(BaseIterator b)572 u8_to_u32_iterator(BaseIterator b) : m_position(b)
573 {
574 m_value = pending_read;
575 }
576 //
577 // Checked constructor:
578 //
u8_to_u32_iterator(BaseIterator b,BaseIterator start,BaseIterator end)579 u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
580 {
581 m_value = pending_read;
582 //
583 // We must not start with a continuation character, or end with a
584 // truncated UTF-8 sequence otherwise we run the risk of going past
585 // the start/end of the underlying sequence:
586 //
587 if(start != end)
588 {
589 unsigned char v = *start;
590 if((v & 0xC0u) == 0x80u)
591 invalid_sequence();
592 if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
593 invalid_sequence();
594 BaseIterator pos = end;
595 do
596 {
597 v = *--pos;
598 }
599 while((start != pos) && ((v & 0xC0u) == 0x80u));
600 std::ptrdiff_t extra = detail::utf8_byte_count(v);
601 if(std::distance(pos, end) < extra)
602 invalid_sequence();
603 }
604 }
605 private:
invalid_sequence()606 static void invalid_sequence()
607 {
608 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
609 boost::throw_exception(e);
610 }
extract_current() const611 void extract_current()const
612 {
613 m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
614 // we must not have a continuation character:
615 if((m_value & 0xC0u) == 0x80u)
616 invalid_sequence();
617 // see how many extra bytes we have:
618 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
619 // extract the extra bits, 6 from each extra byte:
620 BaseIterator next(m_position);
621 for(unsigned c = 0; c < extra; ++c)
622 {
623 ++next;
624 m_value <<= 6;
625 // We must have a continuation byte:
626 if((static_cast<boost::uint8_t>(*next) & 0xC0) != 0x80)
627 invalid_sequence();
628 m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
629 }
630 // we now need to remove a few of the leftmost bits, but how many depends
631 // upon how many extra bytes we've extracted:
632 static const boost::uint32_t masks[4] =
633 {
634 0x7Fu,
635 0x7FFu,
636 0xFFFFu,
637 0x1FFFFFu,
638 };
639 m_value &= masks[extra];
640 // check the result is in range:
641 if(m_value > static_cast<U32Type>(0x10FFFFu))
642 invalid_sequence();
643 // The result must not be a surrogate:
644 if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
645 invalid_sequence();
646 // We should not have had an invalidly encoded UTF8 sequence:
647 if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
648 invalid_sequence();
649 }
650 BaseIterator m_position;
651 mutable U32Type m_value;
652 };
653
654 template <class BaseIterator>
655 class utf16_output_iterator
656 {
657 public:
658 typedef void difference_type;
659 typedef void value_type;
660 typedef boost::uint32_t* pointer;
661 typedef boost::uint32_t& reference;
662 typedef std::output_iterator_tag iterator_category;
663
utf16_output_iterator(const BaseIterator & b)664 utf16_output_iterator(const BaseIterator& b)
665 : m_position(b){}
utf16_output_iterator(const utf16_output_iterator & that)666 utf16_output_iterator(const utf16_output_iterator& that)
667 : m_position(that.m_position){}
operator =(const utf16_output_iterator & that)668 utf16_output_iterator& operator=(const utf16_output_iterator& that)
669 {
670 m_position = that.m_position;
671 return *this;
672 }
operator *() const673 const utf16_output_iterator& operator*()const
674 {
675 return *this;
676 }
operator =(boost::uint32_t val) const677 void operator=(boost::uint32_t val)const
678 {
679 push(val);
680 }
operator ++()681 utf16_output_iterator& operator++()
682 {
683 return *this;
684 }
operator ++(int)685 utf16_output_iterator& operator++(int)
686 {
687 return *this;
688 }
base() const689 BaseIterator base()const
690 {
691 return m_position;
692 }
693 private:
push(boost::uint32_t v) const694 void push(boost::uint32_t v)const
695 {
696 if(v >= 0x10000u)
697 {
698 // begin by checking for a code point out of range:
699 if(v > 0x10FFFFu)
700 detail::invalid_utf32_code_point(v);
701 // split into two surrogates:
702 *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
703 *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
704 }
705 else
706 {
707 // 16-bit code point:
708 // value must not be a surrogate:
709 if(detail::is_surrogate(v))
710 detail::invalid_utf32_code_point(v);
711 *m_position++ = static_cast<boost::uint16_t>(v);
712 }
713 }
714 mutable BaseIterator m_position;
715 };
716
717 template <class BaseIterator>
718 class utf8_output_iterator
719 {
720 public:
721 typedef void difference_type;
722 typedef void value_type;
723 typedef boost::uint32_t* pointer;
724 typedef boost::uint32_t& reference;
725 typedef std::output_iterator_tag iterator_category;
726
utf8_output_iterator(const BaseIterator & b)727 utf8_output_iterator(const BaseIterator& b)
728 : m_position(b){}
utf8_output_iterator(const utf8_output_iterator & that)729 utf8_output_iterator(const utf8_output_iterator& that)
730 : m_position(that.m_position){}
operator =(const utf8_output_iterator & that)731 utf8_output_iterator& operator=(const utf8_output_iterator& that)
732 {
733 m_position = that.m_position;
734 return *this;
735 }
operator *() const736 const utf8_output_iterator& operator*()const
737 {
738 return *this;
739 }
operator =(boost::uint32_t val) const740 void operator=(boost::uint32_t val)const
741 {
742 push(val);
743 }
operator ++()744 utf8_output_iterator& operator++()
745 {
746 return *this;
747 }
operator ++(int)748 utf8_output_iterator& operator++(int)
749 {
750 return *this;
751 }
base() const752 BaseIterator base()const
753 {
754 return m_position;
755 }
756 private:
push(boost::uint32_t c) const757 void push(boost::uint32_t c)const
758 {
759 if(c > 0x10FFFFu)
760 detail::invalid_utf32_code_point(c);
761 if(c < 0x80u)
762 {
763 *m_position++ = static_cast<unsigned char>(c);
764 }
765 else if(c < 0x800u)
766 {
767 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
768 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
769 }
770 else if(c < 0x10000u)
771 {
772 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
773 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
774 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
775 }
776 else
777 {
778 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
779 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
780 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
781 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
782 }
783 }
784 mutable BaseIterator m_position;
785 };
786
787 } // namespace boost
788
#endif // BOOST_REGEX_V4_UNICODE_ITERATOR_HPP
790
791