// Boost.Locale
00001 // 00002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) 00003 // 00004 // Distributed under the Boost Software License, Version 1.0. (See 00005 // accompanying file LICENSE_1_0.txt or copy at 00006 // http://www.boost.org/LICENSE_1_0.txt) 00007 // 00008 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED 00009 #define BOOST_LOCALE_UTF_HPP_INCLUDED 00010 00011 #include <boost/cstdint.hpp> 00012 00013 namespace boost { 00014 namespace locale { 00020 namespace utf { 00022 #ifdef __GNUC__ 00023 # define BOOST_LOCALE_LIKELY(x) __builtin_expect((x),1) 00024 # define BOOST_LOCALE_UNLIKELY(x) __builtin_expect((x),0) 00025 #else 00026 # define BOOST_LOCALE_LIKELY(x) (x) 00027 # define BOOST_LOCALE_UNLIKELY(x) (x) 00028 #endif 00029 00030 00034 typedef uint32_t code_point; 00035 00039 static const code_point illegal = 0xFFFFFFFFu; 00040 00044 static const code_point incomplete = 0xFFFFFFFEu; 00045 00049 inline bool is_valid_codepoint(code_point v) 00050 { 00051 if(v>0x10FFFF) 00052 return false; 00053 if(0xD800 <=v && v<= 0xDFFF) // surragates 00054 return false; 00055 return true; 00056 } 00057 00058 #ifdef BOOST_LOCALE_DOXYGEN 00059 00060 00061 00062 template<typename CharType,int size=sizeof(CharType)> 00063 struct utf_traits { 00067 typedef CharType char_type; 00082 template<typename Iterator> 00083 static code_point decode(Iterator &p,Iterator e); 00084 00092 static const int max_width; 00099 static int width(code_point value); 00100 00106 static int trail_length(char_type c); 00110 static bool is_trail(char_type c); 00114 static bool is_lead(char_type c); 00115 00126 template<typename Iterator> 00127 static Iterator encode(code_point value,Iterator out); 00133 template<typename Iterator> 00134 static code_point decode_valid(Iterator &p); 00135 }; 00136 00137 #else 00138 00139 template<typename CharType,int size=sizeof(CharType)> 00140 struct utf_traits; 00141 00142 template<typename CharType> 00143 struct utf_traits<CharType,1> { 00144 00145 typedef CharType char_type; 
00146 00147 static int trail_length(char_type ci) 00148 { 00149 unsigned char c = ci; 00150 if(c < 128) 00151 return 0; 00152 if(BOOST_LOCALE_UNLIKELY(c < 194)) 00153 return -1; 00154 if(c < 224) 00155 return 1; 00156 if(c < 240) 00157 return 2; 00158 if(BOOST_LOCALE_LIKELY(c <=244)) 00159 return 3; 00160 return -1; 00161 } 00162 00163 static const int max_width = 4; 00164 00165 static int width(code_point value) 00166 { 00167 if(value <=0x7F) { 00168 return 1; 00169 } 00170 else if(value <=0x7FF) { 00171 return 2; 00172 } 00173 else if(BOOST_LOCALE_LIKELY(value <=0xFFFF)) { 00174 return 3; 00175 } 00176 else { 00177 return 4; 00178 } 00179 } 00180 00181 static bool is_trail(char_type ci) 00182 { 00183 unsigned char c=ci; 00184 return (c & 0xC0)==0x80; 00185 } 00186 00187 static bool is_lead(char_type ci) 00188 { 00189 return !is_trail(ci); 00190 } 00191 00192 template<typename Iterator> 00193 static code_point decode(Iterator &p,Iterator e) 00194 { 00195 if(BOOST_LOCALE_UNLIKELY(p==e)) 00196 return incomplete; 00197 00198 unsigned char lead = *p++; 00199 00200 // First byte is fully validated here 00201 int trail_size = trail_length(lead); 00202 00203 if(BOOST_LOCALE_UNLIKELY(trail_size < 0)) 00204 return illegal; 00205 00206 // 00207 // Ok as only ASCII may be of size = 0 00208 // also optimize for ASCII text 00209 // 00210 if(trail_size == 0) 00211 return lead; 00212 00213 code_point c = lead & ((1<<(6-trail_size))-1); 00214 00215 // Read the rest 00216 unsigned char tmp; 00217 switch(trail_size) { 00218 case 3: 00219 if(BOOST_LOCALE_UNLIKELY(p==e)) 00220 return incomplete; 00221 tmp = *p++; 00222 if (!is_trail(tmp)) 00223 return illegal; 00224 c = (c << 6) | ( tmp & 0x3F); 00225 case 2: 00226 if(BOOST_LOCALE_UNLIKELY(p==e)) 00227 return incomplete; 00228 tmp = *p++; 00229 if (!is_trail(tmp)) 00230 return illegal; 00231 c = (c << 6) | ( tmp & 0x3F); 00232 case 1: 00233 if(BOOST_LOCALE_UNLIKELY(p==e)) 00234 return incomplete; 00235 tmp = *p++; 00236 if 
(!is_trail(tmp)) 00237 return illegal; 00238 c = (c << 6) | ( tmp & 0x3F); 00239 } 00240 00241 // Check code point validity: no surrogates and 00242 // valid range 00243 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 00244 return illegal; 00245 00246 // make sure it is the most compact representation 00247 if(BOOST_LOCALE_UNLIKELY(width(c)!=trail_size + 1)) 00248 return illegal; 00249 00250 return c; 00251 00252 } 00253 00254 template<typename Iterator> 00255 static code_point decode_valid(Iterator &p) 00256 { 00257 unsigned char lead = *p++; 00258 if(lead < 192) 00259 return lead; 00260 00261 int trail_size; 00262 00263 if(lead < 224) 00264 trail_size = 1; 00265 else if(BOOST_LOCALE_LIKELY(lead < 240)) // non-BMP rare 00266 trail_size = 2; 00267 else 00268 trail_size = 3; 00269 00270 code_point c = lead & ((1<<(6-trail_size))-1); 00271 00272 switch(trail_size) { 00273 case 3: 00274 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00275 case 2: 00276 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00277 case 1: 00278 c = (c << 6) | ( static_cast<unsigned char>(*p++) & 0x3F); 00279 } 00280 00281 return c; 00282 } 00283 00284 00285 00286 template<typename Iterator> 00287 static Iterator encode(code_point value,Iterator out) 00288 { 00289 if(value <= 0x7F) { 00290 *out++ = static_cast<char_type>(value); 00291 } 00292 else if(value <= 0x7FF) { 00293 *out++ = static_cast<char_type>((value >> 6) | 0xC0); 00294 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 00295 } 00296 else if(BOOST_LOCALE_LIKELY(value <= 0xFFFF)) { 00297 *out++ = static_cast<char_type>((value >> 12) | 0xE0); 00298 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 00299 *out++ = static_cast<char_type>((value & 0x3F) | 0x80); 00300 } 00301 else { 00302 *out++ = static_cast<char_type>((value >> 18) | 0xF0); 00303 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80); 00304 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80); 00305 *out++ = 
static_cast<char_type>((value & 0x3F) | 0x80); 00306 } 00307 return out; 00308 } 00309 }; // utf8 00310 00311 template<typename CharType> 00312 struct utf_traits<CharType,2> { 00313 typedef CharType char_type; 00314 00315 // See RFC 2781 00316 static bool is_first_surrogate(uint16_t x) 00317 { 00318 return 0xD800 <=x && x<= 0xDBFF; 00319 } 00320 static bool is_second_surrogate(uint16_t x) 00321 { 00322 return 0xDC00 <=x && x<= 0xDFFF; 00323 } 00324 static code_point combine_surrogate(uint16_t w1,uint16_t w2) 00325 { 00326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000; 00327 } 00328 static int trail_length(char_type c) 00329 { 00330 if(is_first_surrogate(c)) 00331 return 1; 00332 if(is_second_surrogate(c)) 00333 return -1; 00334 return 0; 00335 } 00339 static bool is_trail(char_type c) 00340 { 00341 return is_second_surrogate(c); 00342 } 00346 static bool is_lead(char_type c) 00347 { 00348 return !is_second_surrogate(c); 00349 } 00350 00351 template<typename It> 00352 static code_point decode(It ¤t,It last) 00353 { 00354 if(BOOST_LOCALE_UNLIKELY(current == last)) 00355 return incomplete; 00356 uint16_t w1=*current++; 00357 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 00358 return w1; 00359 } 00360 if(w1 > 0xDBFF) 00361 return illegal; 00362 if(current==last) 00363 return incomplete; 00364 uint16_t w2=*current++; 00365 if(w2 < 0xDC00 || 0xDFFF < w2) 00366 return illegal; 00367 return combine_surrogate(w1,w2); 00368 } 00369 template<typename It> 00370 static code_point decode_valid(It ¤t) 00371 { 00372 uint16_t w1=*current++; 00373 if(BOOST_LOCALE_LIKELY(w1 < 0xD800 || 0xDFFF < w1)) { 00374 return w1; 00375 } 00376 uint16_t w2=*current++; 00377 return combine_surrogate(w1,w2); 00378 } 00379 00380 static const int max_width = 2; 00381 static int width(code_point u) 00382 { 00383 return u>=0x10000 ? 
2 : 1; 00384 } 00385 template<typename It> 00386 static It encode(code_point u,It out) 00387 { 00388 if(BOOST_LOCALE_LIKELY(u<=0xFFFF)) { 00389 *out++ = static_cast<char_type>(u); 00390 } 00391 else { 00392 u -= 0x10000; 00393 *out++ = static_cast<char_type>(0xD800 | (u>>10)); 00394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF)); 00395 } 00396 return out; 00397 } 00398 }; // utf16; 00399 00400 00401 template<typename CharType> 00402 struct utf_traits<CharType,4> { 00403 typedef CharType char_type; 00404 static int trail_length(char_type c) 00405 { 00406 if(is_valid_codepoint(c)) 00407 return 0; 00408 return -1; 00409 } 00410 static bool is_trail(char_type /*c*/) 00411 { 00412 return false; 00413 } 00414 static bool is_lead(char_type /*c*/) 00415 { 00416 return true; 00417 } 00418 00419 template<typename It> 00420 static code_point decode_valid(It ¤t) 00421 { 00422 return *current++; 00423 } 00424 00425 template<typename It> 00426 static code_point decode(It ¤t,It last) 00427 { 00428 if(BOOST_LOCALE_UNLIKELY(current == last)) 00429 return boost::locale::utf::incomplete; 00430 code_point c=*current++; 00431 if(BOOST_LOCALE_UNLIKELY(!is_valid_codepoint(c))) 00432 return boost::locale::utf::illegal; 00433 return c; 00434 } 00435 static const int max_width = 1; 00436 static int width(code_point /*u*/) 00437 { 00438 return 1; 00439 } 00440 template<typename It> 00441 static It encode(code_point u,It out) 00442 { 00443 *out++ = static_cast<char_type>(u); 00444 return out; 00445 } 00446 00447 }; // utf32 00448 00449 #endif 00450 00451 00452 } // utf 00453 } // locale 00454 } // boost 00455 00456 00457 #endif 00458 00459 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4 00460