fastcgi++
|
00001 00002 // utf8_codecvt_facet.cpp 00003 00004 // Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) 00005 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). 00006 // Use, modification and distribution is subject to the Boost Software 00007 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at 00008 // http://www.boost.org/LICENSE_1_0.txt) 00009 00010 // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to 00011 // learn how this file should be used. 00012 00013 #include "utf8_codecvt.hpp" 00014 00015 #include <boost/detail/utf8_codecvt_facet.hpp> 00016 00017 #include <cstdlib> // for multi-byte converson routines 00018 #include <cassert> 00019 00020 #include <boost/limits.hpp> 00021 #include <boost/config.hpp> 00022 00023 // If we don't have wstring, then Unicode support 00024 // is not available anyway, so we don't need to even 00025 // compiler this file. This also fixes the problem 00026 // with mingw, which can compile this file, but will 00027 // generate link error when building DLL. 00028 #ifndef BOOST_NO_STD_WSTRING 00029 00030 BOOST_UTF8_BEGIN_NAMESPACE 00031 00033 // implementation for wchar_t 00034 00035 // Translate incoming UTF-8 into UCS-4 00036 std::codecvt_base::result utf8_codecvt_facet::do_in( 00037 std::mbstate_t& /*state*/, 00038 const char * from, 00039 const char * from_end, 00040 const char * & from_next, 00041 wchar_t * to, 00042 wchar_t * to_end, 00043 wchar_t * & to_next 00044 ) const { 00045 // Basic algorithm: The first octet determines how many 00046 // octets total make up the UCS-4 character. The remaining 00047 // "continuing octets" all begin with "10". To convert, subtract 00048 // the amount that specifies the number of octets from the first 00049 // octet. Subtract 0x80 (1000 0000) from each continuing octet, 00050 // then mash the whole lot together. Note that each continuing 00051 // octet only uses 6 bits as unique values, so only shift by 00052 // multiples of 6 to combine. 00053 while (from != from_end && to != to_end) { 00054 00055 // Error checking on the first octet 00056 if (invalid_leading_octet(*from)){ 00057 from_next = from; 00058 to_next = to; 00059 return std::codecvt_base::error; 00060 } 00061 00062 // The first octet is adjusted by a value dependent upon 00063 // the number of "continuing octets" encoding the character 00064 const int cont_octet_count = get_cont_octet_count(*from); 00065 const wchar_t octet1_modifier_table[] = { 00066 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 00067 }; 00068 00069 // The unsigned char conversion is necessary in case char is 00070 // signed (I learned this the hard way) 00071 wchar_t ucs_result = 00072 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; 00073 00074 // Invariants : 00075 // 1) At the start of the loop, 'i' continuing characters have been 00076 // processed 00077 // 2) *from points to the next continuing character to be processed. 00078 int i = 0; 00079 while(i != cont_octet_count && from != from_end) { 00080 00081 // Error checking on continuing characters 00082 if (invalid_continuing_octet(*from)) { 00083 from_next = from; 00084 to_next = to; 00085 return std::codecvt_base::error; 00086 } 00087 00088 ucs_result *= (1 << 6); 00089 00090 // each continuing character has an extra (10xxxxxx)b attached to 00091 // it that must be removed. 00092 ucs_result += (unsigned char)(*from++) - 0x80; 00093 ++i; 00094 } 00095 00096 // If the buffer ends with an incomplete unicode character... 00097 if (from == from_end && i != cont_octet_count) { 00098 // rewind "from" to before the current character translation 00099 from_next = from - (i+1); 00100 to_next = to; 00101 return std::codecvt_base::partial; 00102 } 00103 *to++ = ucs_result; 00104 } 00105 from_next = from; 00106 to_next = to; 00107 00108 // Were we done converting or did we run out of destination space? 00109 if(from == from_end) return std::codecvt_base::ok; 00110 else return std::codecvt_base::partial; 00111 } 00112 00113 std::codecvt_base::result utf8_codecvt_facet::do_out( 00114 std::mbstate_t& /*state*/, 00115 const wchar_t * from, 00116 const wchar_t * from_end, 00117 const wchar_t * & from_next, 00118 char * to, 00119 char * to_end, 00120 char * & to_next 00121 ) const 00122 { 00123 // RG - consider merging this table with the other one 00124 const wchar_t octet1_modifier_table[] = { 00125 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc 00126 }; 00127 00128 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); 00129 while (from != from_end && to != to_end) { 00130 00131 // Check for invalid UCS-4 character 00132 if (*from > max_wchar) { 00133 from_next = from; 00134 to_next = to; 00135 return std::codecvt_base::error; 00136 } 00137 00138 int cont_octet_count = get_cont_octet_out_count(*from); 00139 00140 // RG - comment this formula better 00141 int shift_exponent = (cont_octet_count) * 6; 00142 00143 // Process the first character 00144 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + 00145 (unsigned char)(*from / (1 << shift_exponent))); 00146 00147 // Process the continuation characters 00148 // Invariants: At the start of the loop: 00149 // 1) 'i' continuing octets have been generated 00150 // 2) '*to' points to the next location to place an octet 00151 // 3) shift_exponent is 6 more than needed for the next octet 00152 int i = 0; 00153 while (i != cont_octet_count && to != to_end) { 00154 shift_exponent -= 6; 00155 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); 00156 ++i; 00157 } 00158 // If we filled up the out buffer before encoding the character 00159 if(to == to_end && i != cont_octet_count) { 00160 from_next = from; 00161 to_next = to - (i+1); 00162 return std::codecvt_base::partial; 00163 } 00164 *from++; 00165 } 00166 from_next = from; 00167 to_next = to; 00168 // Were we done or did we run out of destination space 00169 if(from == from_end) return std::codecvt_base::ok; 00170 else return std::codecvt_base::partial; 00171 } 00172 00173 // How many char objects can I process to get <= max_limit 00174 // wchar_t objects? 00175 int utf8_codecvt_facet::do_length( 00176 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, 00177 const char * from, 00178 const char * from_end, 00179 std::size_t max_limit 00180 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) 00181 ) const throw() 00182 #else 00183 ) const 00184 #endif 00185 { 00186 // RG - this code is confusing! I need a better way to express it. 00187 // and test cases. 00188 00189 // Invariants: 00190 // 1) last_octet_count has the size of the last measured character 00191 // 2) char_count holds the number of characters shown to fit 00192 // within the bounds so far (no greater than max_limit) 00193 // 3) from_next points to the octet 'last_octet_count' before the 00194 // last measured character. 00195 int last_octet_count=0; 00196 std::size_t char_count = 0; 00197 const char* from_next = from; 00198 // Use "<" because the buffer may represent incomplete characters 00199 while (from_next+last_octet_count <= from_end && char_count <= max_limit) { 00200 from_next += last_octet_count; 00201 last_octet_count = (get_octet_count(*from_next)); 00202 ++char_count; 00203 } 00204 return static_cast<int>(from_next-from_end); 00205 } 00206 00207 unsigned int utf8_codecvt_facet::get_octet_count( 00208 unsigned char lead_octet 00209 ){ 00210 // if the 0-bit (MSB) is 0, then 1 character 00211 if (lead_octet <= 0x7f) return 1; 00212 00213 // Otherwise the count number of consecutive 1 bits starting at MSB 00214 // assert(0xc0 <= lead_octet && lead_octet <= 0xfd); 00215 00216 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; 00217 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; 00218 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; 00219 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; 00220 else return 6; 00221 } 00222 BOOST_UTF8_END_NAMESPACE 00223 00224 namespace { 00225 template<std::size_t s> 00226 int get_cont_octet_out_count_impl(wchar_t word){ 00227 if (word < 0x80) { 00228 return 0; 00229 } 00230 if (word < 0x800) { 00231 return 1; 00232 } 00233 return 2; 00234 } 00235 00236 // note the following code will generate on some platforms where 00237 // wchar_t is defined as UCS2. The warnings are superfluous as 00238 // the specialization is never instantitiated with such compilers. 00239 template<> 00240 int get_cont_octet_out_count_impl<4>(wchar_t word){ 00241 if (word < 0x80) { 00242 return 0; 00243 } 00244 if (word < 0x800) { 00245 return 1; 00246 } 00247 if (word < 0x10000) { 00248 return 2; 00249 } 00250 if (word < 0x200000) { 00251 return 3; 00252 } 00253 if (word < 0x4000000) { 00254 return 4; 00255 } 00256 return 5; 00257 } 00258 00259 } // namespace anonymous 00260 00261 BOOST_UTF8_BEGIN_NAMESPACE 00262 // How many "continuing octets" will be needed for this word 00263 // == total octets - 1. 00264 int utf8_codecvt_facet::get_cont_octet_out_count( 00265 wchar_t word 00266 ) const { 00267 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); 00268 } 00269 BOOST_UTF8_END_NAMESPACE 00270 00271 #endif