00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #include <boost/detail/utf8_codecvt_facet.hpp>
00014
00015 #include <cstdlib>
00016 #include <cassert>
00017
00018 #include <boost/limits.hpp>
00019 #include <boost/config.hpp>
00020
00021
00022
00023
00024
00025
00026 #ifndef BOOST_NO_STD_WSTRING
00027
00028 BOOST_UTF8_BEGIN_NAMESPACE
00029
00031
00032
00033
00034 std::codecvt_base::result utf8_codecvt_facet::do_in(
00035 std::mbstate_t& ,
00036 const char * from,
00037 const char * from_end,
00038 const char * & from_next,
00039 wchar_t * to,
00040 wchar_t * to_end,
00041 wchar_t * & to_next
00042 ) const {
00043
00044
00045
00046
00047
00048
00049
00050
00051 while (from != from_end && to != to_end) {
00052
00053
00054 if (invalid_leading_octet(*from)){
00055 from_next = from;
00056 to_next = to;
00057 return std::codecvt_base::error;
00058 }
00059
00060
00061
00062 const int cont_octet_count = get_cont_octet_count(*from);
00063 const wchar_t octet1_modifier_table[] = {
00064 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
00065 };
00066
00067
00068
00069 wchar_t ucs_result =
00070 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
00071
00072
00073
00074
00075
00076 int i = 0;
00077 while(i != cont_octet_count && from != from_end) {
00078
00079
00080 if (invalid_continuing_octet(*from)) {
00081 from_next = from;
00082 to_next = to;
00083 return std::codecvt_base::error;
00084 }
00085
00086 ucs_result *= (1 << 6);
00087
00088
00089
00090 ucs_result += (unsigned char)(*from++) - 0x80;
00091 ++i;
00092 }
00093
00094
00095 if (from == from_end && i != cont_octet_count) {
00096
00097 from_next = from - (i+1);
00098 to_next = to;
00099 return std::codecvt_base::partial;
00100 }
00101 *to++ = ucs_result;
00102 }
00103 from_next = from;
00104 to_next = to;
00105
00106
00107 if(from == from_end) return std::codecvt_base::ok;
00108 else return std::codecvt_base::partial;
00109 }
00110
00111 std::codecvt_base::result utf8_codecvt_facet::do_out(
00112 std::mbstate_t& ,
00113 const wchar_t * from,
00114 const wchar_t * from_end,
00115 const wchar_t * & from_next,
00116 char * to,
00117 char * to_end,
00118 char * & to_next
00119 ) const
00120 {
00121
00122 const wchar_t octet1_modifier_table[] = {
00123 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
00124 };
00125
00126 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
00127 while (from != from_end && to != to_end) {
00128
00129
00130 if (*from > max_wchar) {
00131 from_next = from;
00132 to_next = to;
00133 return std::codecvt_base::error;
00134 }
00135
00136 int cont_octet_count = get_cont_octet_out_count(*from);
00137
00138
00139 int shift_exponent = (cont_octet_count) * 6;
00140
00141
00142 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
00143 (unsigned char)(*from / (1 << shift_exponent)));
00144
00145
00146
00147
00148
00149
00150 int i = 0;
00151 while (i != cont_octet_count && to != to_end) {
00152 shift_exponent -= 6;
00153 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
00154 ++i;
00155 }
00156
00157 if(to == to_end && i != cont_octet_count) {
00158 from_next = from;
00159 to_next = to - (i+1);
00160 return std::codecvt_base::partial;
00161 }
00162 *from++;
00163 }
00164 from_next = from;
00165 to_next = to;
00166
00167 if(from == from_end) return std::codecvt_base::ok;
00168 else return std::codecvt_base::partial;
00169 }
00170
00171
00172
00173 int utf8_codecvt_facet::do_length(
00174 BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
00175 const char * from,
00176 const char * from_end,
00177 std::size_t max_limit
00178 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
00179 ) const throw()
00180 #else
00181 ) const
00182 #endif
00183 {
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193 int last_octet_count=0;
00194 std::size_t char_count = 0;
00195 const char* from_next = from;
00196
00197 while (from_next+last_octet_count <= from_end && char_count <= max_limit) {
00198 from_next += last_octet_count;
00199 last_octet_count = (get_octet_count(*from_next));
00200 ++char_count;
00201 }
00202 return static_cast<int>(from_next-from_end);
00203 }
00204
00205 unsigned int utf8_codecvt_facet::get_octet_count(
00206 unsigned char lead_octet
00207 ){
00208
00209 if (lead_octet <= 0x7f) return 1;
00210
00211
00212
00213
00214 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
00215 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
00216 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
00217 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
00218 else return 6;
00219 }
00220 BOOST_UTF8_END_NAMESPACE
00221
namespace {
    // Number of continuation octets needed to encode 'word' as UTF-8.
    // Generic version: values below 0x80 need none, values below 0x800 need
    // one, and everything else is reported as needing two.
    template<std::size_t s>
    int get_cont_octet_out_count_impl(wchar_t word){
        return (word < 0x80) ? 0
             : (word < 0x800) ? 1
             : 2;
    }

    // Specialization selected when the template argument is 4: extends the
    // thresholds so that up to five continuation octets can be reported.
    template<>
    int get_cont_octet_out_count_impl<4>(wchar_t word){
        // exclusive_limit[k] is the first value that needs more than k
        // continuation octets.
        const int exclusive_limit[] = {
            0x80, 0x800, 0x10000, 0x200000, 0x4000000
        };
        const int limit_total =
            static_cast<int>(sizeof(exclusive_limit) / sizeof(exclusive_limit[0]));

        int continuation_octets = 0;
        while (continuation_octets != limit_total &&
               !(word < exclusive_limit[continuation_octets])) {
            ++continuation_octets;
        }
        return continuation_octets;
    }

}
00258
00259 BOOST_UTF8_BEGIN_NAMESPACE
00260
00261
00262 int utf8_codecvt_facet::get_cont_octet_out_count(
00263 wchar_t word
00264 ) const {
00265 return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
00266 }
00267 BOOST_UTF8_END_NAMESPACE
00268
00269 #endif