fastcgi++
utf8_codecvt_facet.cpp
Go to the documentation of this file.
1 
2 // utf8_codecvt_facet.cpp
3 
4 // Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
5 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
6 // Use, modification and distribution is subject to the Boost Software
7 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
8 // http://www.boost.org/LICENSE_1_0.txt)
9 
10 // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to
11 // learn how this file should be used.
12 
13 #include "utf8_codecvt.hpp"
14 
15 #include <boost/detail/utf8_codecvt_facet.hpp>
16 
17 #include <cstdlib> // for multi-byte conversion routines
18 #include <cassert>
19 
20 #include <boost/limits.hpp>
21 #include <boost/config.hpp>
22 
23 // If we don't have wstring, then Unicode support
24 // is not available anyway, so we don't even need to
25 // compile this file. This also fixes the problem
26 // with mingw, which can compile this file, but will
27 // generate a link error when building a DLL.
28 #ifndef BOOST_NO_STD_WSTRING
29 
30 BOOST_UTF8_BEGIN_NAMESPACE
31 
33 // implementation for wchar_t
34 
35 // Translate incoming UTF-8 into UCS-4
36 std::codecvt_base::result utf8_codecvt_facet::do_in(
37  std::mbstate_t& /*state*/,
38  const char * from,
39  const char * from_end,
40  const char * & from_next,
41  wchar_t * to,
42  wchar_t * to_end,
43  wchar_t * & to_next
44 ) const {
45  // Basic algorithm: The first octet determines how many
46  // octets total make up the UCS-4 character. The remaining
47  // "continuing octets" all begin with "10". To convert, subtract
48  // the amount that specifies the number of octets from the first
49  // octet. Subtract 0x80 (1000 0000) from each continuing octet,
50  // then mash the whole lot together. Note that each continuing
51  // octet only uses 6 bits as unique values, so only shift by
52  // multiples of 6 to combine.
53  while (from != from_end && to != to_end) {
54 
55  // Error checking on the first octet
56  if (invalid_leading_octet(*from)){
57  from_next = from;
58  to_next = to;
59  return std::codecvt_base::error;
60  }
61 
62  // The first octet is adjusted by a value dependent upon
63  // the number of "continuing octets" encoding the character
64  const int cont_octet_count = get_cont_octet_count(*from);
65  const wchar_t octet1_modifier_table[] = {
66  0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
67  };
68 
69  // The unsigned char conversion is necessary in case char is
70  // signed (I learned this the hard way)
71  wchar_t ucs_result =
72  (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
73 
74  // Invariants :
75  // 1) At the start of the loop, 'i' continuing characters have been
76  // processed
77  // 2) *from points to the next continuing character to be processed.
78  int i = 0;
79  while(i != cont_octet_count && from != from_end) {
80 
81  // Error checking on continuing characters
82  if (invalid_continuing_octet(*from)) {
83  from_next = from;
84  to_next = to;
85  return std::codecvt_base::error;
86  }
87 
88  ucs_result *= (1 << 6);
89 
90  // each continuing character has an extra (10xxxxxx)b attached to
91  // it that must be removed.
92  ucs_result += (unsigned char)(*from++) - 0x80;
93  ++i;
94  }
95 
96  // If the buffer ends with an incomplete unicode character...
97  if (from == from_end && i != cont_octet_count) {
98  // rewind "from" to before the current character translation
99  from_next = from - (i+1);
100  to_next = to;
101  return std::codecvt_base::partial;
102  }
103  *to++ = ucs_result;
104  }
105  from_next = from;
106  to_next = to;
107 
108  // Were we done converting or did we run out of destination space?
109  if(from == from_end) return std::codecvt_base::ok;
110  else return std::codecvt_base::partial;
111 }
112 
113 std::codecvt_base::result utf8_codecvt_facet::do_out(
114  std::mbstate_t& /*state*/,
115  const wchar_t * from,
116  const wchar_t * from_end,
117  const wchar_t * & from_next,
118  char * to,
119  char * to_end,
120  char * & to_next
121 ) const
122 {
123  // RG - consider merging this table with the other one
124  const wchar_t octet1_modifier_table[] = {
125  0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
126  };
127 
128  wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
129  while (from != from_end && to != to_end) {
130 
131  // Check for invalid UCS-4 character
132  if (*from > max_wchar) {
133  from_next = from;
134  to_next = to;
135  return std::codecvt_base::error;
136  }
137 
138  int cont_octet_count = get_cont_octet_out_count(*from);
139 
140  // RG - comment this formula better
141  int shift_exponent = (cont_octet_count) * 6;
142 
143  // Process the first character
144  *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
145  (unsigned char)(*from / (1 << shift_exponent)));
146 
147  // Process the continuation characters
148  // Invariants: At the start of the loop:
149  // 1) 'i' continuing octets have been generated
150  // 2) '*to' points to the next location to place an octet
151  // 3) shift_exponent is 6 more than needed for the next octet
152  int i = 0;
153  while (i != cont_octet_count && to != to_end) {
154  shift_exponent -= 6;
155  *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
156  ++i;
157  }
158  // If we filled up the out buffer before encoding the character
159  if(to == to_end && i != cont_octet_count) {
160  from_next = from;
161  to_next = to - (i+1);
162  return std::codecvt_base::partial;
163  }
164  *from++;
165  }
166  from_next = from;
167  to_next = to;
168  // Were we done or did we run out of destination space
169  if(from == from_end) return std::codecvt_base::ok;
170  else return std::codecvt_base::partial;
171 }
172 
173 // How many char objects can I process to get <= max_limit
174 // wchar_t objects?
175 int utf8_codecvt_facet::do_length(
176  BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
177  const char * from,
178  const char * from_end,
179  std::size_t max_limit
180 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
181 ) const throw()
182 #else
183 ) const
184 #endif
185 {
186  // RG - this code is confusing! I need a better way to express it.
187  // and test cases.
188 
189  // Invariants:
190  // 1) last_octet_count has the size of the last measured character
191  // 2) char_count holds the number of characters shown to fit
192  // within the bounds so far (no greater than max_limit)
193  // 3) from_next points to the octet 'last_octet_count' before the
194  // last measured character.
195  int last_octet_count=0;
196  std::size_t char_count = 0;
197  const char* from_next = from;
198  // Use "<" because the buffer may represent incomplete characters
199  while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
200  from_next += last_octet_count;
201  last_octet_count = (get_octet_count(*from_next));
202  ++char_count;
203  }
204  return static_cast<int>(from_next-from_end);
205 }
206 
207 unsigned int utf8_codecvt_facet::get_octet_count(
208  unsigned char lead_octet
209 ){
210  // if the 0-bit (MSB) is 0, then 1 character
211  if (lead_octet <= 0x7f) return 1;
212 
213  // Otherwise the count number of consecutive 1 bits starting at MSB
214 // assert(0xc0 <= lead_octet && lead_octet <= 0xfd);
215 
216  if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
217  else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
218  else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
219  else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
220  else return 6;
221 }
222 BOOST_UTF8_END_NAMESPACE
223 
namespace {
// Number of UTF-8 continuing octets (total octets - 1) needed to encode
// 'word'. The template argument is the size of wchar_t in bytes; this
// general version distinguishes only values that need zero, one or two
// continuing octets.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    if (word < 0x80) {
        return 0;
    }
    if (word < 0x800) {
        return 1;
    }
    return 2;
}

// Specialization for a 4-byte wchar_t, which can hold values that need up
// to five continuing octets.
//
// Note: the following code will generate warnings on some platforms where
// wchar_t is defined as UCS2. The warnings are superfluous as
// the specialization is never instantiated with such compilers.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word){
    if (word < 0x80) {
        return 0;
    }
    if (word < 0x800) {
        return 1;
    }
    if (word < 0x10000) {
        return 2;
    }
    if (word < 0x200000) {
        return 3;
    }
    if (word < 0x4000000) {
        return 4;
    }
    return 5;
}

} // namespace anonymous
260 
261 BOOST_UTF8_BEGIN_NAMESPACE
262 // How many "continuing octets" will be needed for this word
263 // == total octets - 1.
264 int utf8_codecvt_facet::get_cont_octet_out_count(
265  wchar_t word
266 ) const {
267  return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
268 }
269 BOOST_UTF8_END_NAMESPACE
270 
271 #endif