libstdc++
codecvt_specializations.h
Go to the documentation of this file.
00001 // Locale support (codecvt) -*- C++ -*-
00002 
00003 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
00004 // 2008, 2009, 2010
00005 // Free Software Foundation, Inc.
00006 //
00007 // This file is part of the GNU ISO C++ Library.  This library is free
00008 // software; you can redistribute it and/or modify it under the
00009 // terms of the GNU General Public License as published by the
00010 // Free Software Foundation; either version 3, or (at your option)
00011 // any later version.
00012 
00013 // This library is distributed in the hope that it will be useful,
00014 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016 // GNU General Public License for more details.
00017 
00018 // Under Section 7 of GPL version 3, you are granted additional
00019 // permissions described in the GCC Runtime Library Exception, version
00020 // 3.1, as published by the Free Software Foundation.
00021 
00022 // You should have received a copy of the GNU General Public License and
00023 // a copy of the GCC Runtime Library Exception along with this program;
00024 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00025 // <http://www.gnu.org/licenses/>.
00026 
00027 //
00028 // ISO C++ 14882: 22.2.1.5 Template class codecvt
00029 //
00030 
00031 // Written by Benjamin Kosnik <bkoz@redhat.com>
00032 
00033 /** @file ext/codecvt_specializations.h
00034  *  This file is a GNU extension to the Standard C++ Library.
00035  */
00036 
00037 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
00038 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
00039 
00040 #include <bits/c++config.h>
00041 #include <locale>
00042 #include <iconv.h>
00043 
00044 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
00045 {
00046 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00047 
00048   /// Extension to use iconv for dealing with character encodings.
00049   // This includes conversions and comparisons between various character
00050   // sets.  This object encapsulates data that may need to be shared between
00051   // char_traits, codecvt and ctype.
00052   class encoding_state
00053   {
00054   public:
00055     // Types: 
00056     // NB: A conversion descriptor subsumes and enhances the
00057     // functionality of a simple state type such as mbstate_t.
00058     typedef iconv_t descriptor_type;
00059     
00060   protected:
00061     // Name of internal character set encoding.
00062     std::string         _M_int_enc;
00063 
00064     // Name of external character set encoding.
00065     std::string     _M_ext_enc;
00066 
00067     // Conversion descriptor between external encoding to internal encoding.
00068     descriptor_type _M_in_desc;
00069 
00070     // Conversion descriptor between internal encoding to external encoding.
00071     descriptor_type _M_out_desc;
00072 
00073     // The byte-order marker for the external encoding, if necessary.
00074     int         _M_ext_bom;
00075 
00076     // The byte-order marker for the internal encoding, if necessary.
00077     int         _M_int_bom;
00078 
00079     // Number of external bytes needed to construct one complete
00080     // character in the internal encoding.
00081     // NB: -1 indicates variable, or stateful, encodings.
00082     int         _M_bytes;
00083 
00084   public:
00085     explicit 
00086     encoding_state() 
00087     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
00088     { }
00089 
00090     explicit 
00091     encoding_state(const char* __int, const char* __ext, 
00092            int __ibom = 0, int __ebom = 0, int __bytes = 1)
00093     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0), 
00094       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
00095     { init(); }
00096 
00097     // 21.1.2 traits typedefs
00098     // p4
00099     // typedef STATE_T state_type
00100     // requires: state_type shall meet the requirements of
00101     // CopyConstructible types (20.1.3)
00102     // NB: This does not preserve the actual state of the conversion
00103     // descriptor member, but it does duplicate the encoding
00104     // information.
00105     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
00106     { construct(__obj); }
00107 
00108     // Need assignment operator as well.
00109     encoding_state&
00110     operator=(const encoding_state& __obj)
00111     {
00112       construct(__obj);
00113       return *this;
00114     }
00115 
00116     ~encoding_state()
00117     { destroy(); } 
00118 
00119     bool
00120     good() const throw()
00121     { 
00122       const descriptor_type __err = (iconv_t)(-1);
00123       bool __test = _M_in_desc && _M_in_desc != __err; 
00124       __test &=  _M_out_desc && _M_out_desc != __err;
00125       return __test;
00126     }
00127     
00128     int
00129     character_ratio() const
00130     { return _M_bytes; }
00131 
00132     const std::string
00133     internal_encoding() const
00134     { return _M_int_enc; }
00135 
00136     int 
00137     internal_bom() const
00138     { return _M_int_bom; }
00139 
00140     const std::string
00141     external_encoding() const
00142     { return _M_ext_enc; }
00143 
00144     int 
00145     external_bom() const
00146     { return _M_ext_bom; }
00147 
00148     const descriptor_type&
00149     in_descriptor() const
00150     { return _M_in_desc; }
00151 
00152     const descriptor_type&
00153     out_descriptor() const
00154     { return _M_out_desc; }
00155 
00156   protected:
00157     void
00158     init()
00159     {
00160       const descriptor_type __err = (iconv_t)(-1);
00161       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
00162       if (!_M_in_desc && __have_encodings)
00163     {
00164       _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
00165       if (_M_in_desc == __err)
00166         std::__throw_runtime_error(__N("encoding_state::_M_init "
00167                     "creating iconv input descriptor failed"));
00168     }
00169       if (!_M_out_desc && __have_encodings)
00170     {
00171       _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
00172       if (_M_out_desc == __err)
00173         std::__throw_runtime_error(__N("encoding_state::_M_init "
00174                   "creating iconv output descriptor failed"));
00175     }
00176     }
00177 
00178     void
00179     construct(const encoding_state& __obj)
00180     {
00181       destroy();
00182       _M_int_enc = __obj._M_int_enc;
00183       _M_ext_enc = __obj._M_ext_enc;
00184       _M_ext_bom = __obj._M_ext_bom;
00185       _M_int_bom = __obj._M_int_bom;
00186       _M_bytes = __obj._M_bytes;
00187       init();
00188     }
00189 
00190     void
00191     destroy() throw()
00192     {
00193       const descriptor_type __err = (iconv_t)(-1);
00194       if (_M_in_desc && _M_in_desc != __err) 
00195     {
00196       iconv_close(_M_in_desc);
00197       _M_in_desc = 0;
00198     }
00199       if (_M_out_desc && _M_out_desc != __err) 
00200     {
00201       iconv_close(_M_out_desc);
00202       _M_out_desc = 0;
00203     }
00204     }
00205   };
00206 
00207   /// encoding_char_traits
00208   // Custom traits type with encoding_state for the state type, and the
00209   // associated fpos<encoding_state> for the position type, all other
00210   // bits equivalent to the required char_traits instantiations.
00211   template<typename _CharT>
00212     struct encoding_char_traits : public std::char_traits<_CharT>
00213     {
00214       typedef encoding_state                state_type;
00215       typedef typename std::fpos<state_type>        pos_type;
00216     };
00217 
00218 _GLIBCXX_END_NAMESPACE_VERSION
00219 } // namespace
00220 
00221 
00222 namespace std _GLIBCXX_VISIBILITY(default)
00223 {
00224 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00225 
00226   using __gnu_cxx::encoding_state;
00227 
00228   /// codecvt<InternT, _ExternT, encoding_state> specialization.
00229   // This partial specialization takes advantage of iconv to provide
00230   // code conversions between a large number of character encodings.
00231   template<typename _InternT, typename _ExternT>
00232     class codecvt<_InternT, _ExternT, encoding_state>
00233     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
00234     {
00235     public:      
00236       // Types:
00237       typedef codecvt_base::result          result;
00238       typedef _InternT                  intern_type;
00239       typedef _ExternT                  extern_type;
00240       typedef __gnu_cxx::encoding_state         state_type;
00241       typedef state_type::descriptor_type       descriptor_type;
00242 
00243       // Data Members:
00244       static locale::id         id;
00245 
00246       explicit 
00247       codecvt(size_t __refs = 0)
00248       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00249       { }
00250 
00251       explicit 
00252       codecvt(state_type& __enc, size_t __refs = 0)
00253       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00254       { }
00255 
00256      protected:
00257       virtual 
00258       ~codecvt() { }
00259 
00260       virtual result
00261       do_out(state_type& __state, const intern_type* __from, 
00262          const intern_type* __from_end, const intern_type*& __from_next,
00263          extern_type* __to, extern_type* __to_end,
00264          extern_type*& __to_next) const;
00265 
00266       virtual result
00267       do_unshift(state_type& __state, extern_type* __to, 
00268          extern_type* __to_end, extern_type*& __to_next) const;
00269 
00270       virtual result
00271       do_in(state_type& __state, const extern_type* __from, 
00272         const extern_type* __from_end, const extern_type*& __from_next,
00273         intern_type* __to, intern_type* __to_end, 
00274         intern_type*& __to_next) const;
00275 
00276       virtual int 
00277       do_encoding() const throw();
00278 
00279       virtual bool 
00280       do_always_noconv() const throw();
00281 
00282       virtual int 
00283       do_length(state_type&, const extern_type* __from, 
00284         const extern_type* __end, size_t __max) const;
00285 
00286       virtual int 
00287       do_max_length() const throw();
00288     };
00289 
00290   template<typename _InternT, typename _ExternT>
00291     locale::id 
00292     codecvt<_InternT, _ExternT, encoding_state>::id;
00293 
00294   // This adaptor works around the signature problems of the second
00295   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
00296   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
00297   // Using this adaptor, g++ will do the work for us.
00298   template<typename _Tp>
00299     inline size_t
00300     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
00301                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
00302                     char** __outbuf, size_t* __outbytes)
00303     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
00304 
00305   template<typename _InternT, typename _ExternT>
00306     codecvt_base::result
00307     codecvt<_InternT, _ExternT, encoding_state>::
00308     do_out(state_type& __state, const intern_type* __from, 
00309        const intern_type* __from_end, const intern_type*& __from_next,
00310        extern_type* __to, extern_type* __to_end,
00311        extern_type*& __to_next) const
00312     {
00313       result __ret = codecvt_base::error;
00314       if (__state.good())
00315     {
00316       const descriptor_type& __desc = __state.out_descriptor();
00317       const size_t __fmultiple = sizeof(intern_type);
00318       size_t __fbytes = __fmultiple * (__from_end - __from);
00319       const size_t __tmultiple = sizeof(extern_type);
00320       size_t __tbytes = __tmultiple * (__to_end - __to); 
00321       
00322       // Argument list for iconv specifies a byte sequence. Thus,
00323       // all to/from arrays must be brutally casted to char*.
00324       char* __cto = reinterpret_cast<char*>(__to);
00325       char* __cfrom;
00326       size_t __conv;
00327 
00328       // Some encodings need a byte order marker as the first item
00329       // in the byte stream, to designate endian-ness. The default
00330       // value for the byte order marker is NULL, so if this is
00331       // the case, it's not necessary and we can just go on our
00332       // merry way.
00333       int __int_bom = __state.internal_bom();
00334       if (__int_bom)
00335         {     
00336           size_t __size = __from_end - __from;
00337           intern_type* __cfixed = static_cast<intern_type*>
00338         (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
00339           __cfixed[0] = static_cast<intern_type>(__int_bom);
00340           char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
00341           __cfrom = reinterpret_cast<char*>(__cfixed);
00342           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00343                                         &__fbytes, &__cto, &__tbytes); 
00344         }
00345       else
00346         {
00347           intern_type* __cfixed = const_cast<intern_type*>(__from);
00348           __cfrom = reinterpret_cast<char*>(__cfixed);
00349           __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 
00350                        &__cto, &__tbytes); 
00351         }
00352 
00353       if (__conv != size_t(-1))
00354         {
00355           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00356           __to_next = reinterpret_cast<extern_type*>(__cto);
00357           __ret = codecvt_base::ok;
00358         }
00359       else 
00360         {
00361           if (__fbytes < __fmultiple * (__from_end - __from))
00362         {
00363           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00364           __to_next = reinterpret_cast<extern_type*>(__cto);
00365           __ret = codecvt_base::partial;
00366         }
00367           else
00368         __ret = codecvt_base::error;
00369         }
00370     }
00371       return __ret; 
00372     }
00373 
00374   template<typename _InternT, typename _ExternT>
00375     codecvt_base::result
00376     codecvt<_InternT, _ExternT, encoding_state>::
00377     do_unshift(state_type& __state, extern_type* __to, 
00378            extern_type* __to_end, extern_type*& __to_next) const
00379     {
00380       result __ret = codecvt_base::error;
00381       if (__state.good())
00382     {
00383       const descriptor_type& __desc = __state.in_descriptor();
00384       const size_t __tmultiple = sizeof(intern_type);
00385       size_t __tlen = __tmultiple * (__to_end - __to); 
00386       
00387       // Argument list for iconv specifies a byte sequence. Thus,
00388       // all to/from arrays must be brutally casted to char*.
00389       char* __cto = reinterpret_cast<char*>(__to);
00390       size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
00391                                           &__cto, &__tlen); 
00392       
00393       if (__conv != size_t(-1))
00394         {
00395           __to_next = reinterpret_cast<extern_type*>(__cto);
00396           if (__tlen == __tmultiple * (__to_end - __to))
00397         __ret = codecvt_base::noconv;
00398           else if (__tlen == 0)
00399         __ret = codecvt_base::ok;
00400           else
00401         __ret = codecvt_base::partial;
00402         }
00403       else 
00404         __ret = codecvt_base::error;
00405     }
00406       return __ret; 
00407     }
00408    
00409   template<typename _InternT, typename _ExternT>
00410     codecvt_base::result
00411     codecvt<_InternT, _ExternT, encoding_state>::
00412     do_in(state_type& __state, const extern_type* __from, 
00413       const extern_type* __from_end, const extern_type*& __from_next,
00414       intern_type* __to, intern_type* __to_end, 
00415       intern_type*& __to_next) const
00416     { 
00417       result __ret = codecvt_base::error;
00418       if (__state.good())
00419     {
00420       const descriptor_type& __desc = __state.in_descriptor();
00421       const size_t __fmultiple = sizeof(extern_type);
00422       size_t __flen = __fmultiple * (__from_end - __from);
00423       const size_t __tmultiple = sizeof(intern_type);
00424       size_t __tlen = __tmultiple * (__to_end - __to); 
00425       
00426       // Argument list for iconv specifies a byte sequence. Thus,
00427       // all to/from arrays must be brutally casted to char*.
00428       char* __cto = reinterpret_cast<char*>(__to);
00429       char* __cfrom;
00430       size_t __conv;
00431 
00432       // Some encodings need a byte order marker as the first item
00433       // in the byte stream, to designate endian-ness. The default
00434       // value for the byte order marker is NULL, so if this is
00435       // the case, it's not necessary and we can just go on our
00436       // merry way.
00437       int __ext_bom = __state.external_bom();
00438       if (__ext_bom)
00439         {     
00440           size_t __size = __from_end - __from;
00441           extern_type* __cfixed =  static_cast<extern_type*>
00442         (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
00443           __cfixed[0] = static_cast<extern_type>(__ext_bom);
00444           char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
00445           __cfrom = reinterpret_cast<char*>(__cfixed);
00446           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00447                                        &__flen, &__cto, &__tlen); 
00448         }
00449       else
00450         {
00451           extern_type* __cfixed = const_cast<extern_type*>(__from);
00452           __cfrom = reinterpret_cast<char*>(__cfixed);
00453           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00454                                        &__flen, &__cto, &__tlen); 
00455         }
00456 
00457       
00458       if (__conv != size_t(-1))
00459         {
00460           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00461           __to_next = reinterpret_cast<intern_type*>(__cto);
00462           __ret = codecvt_base::ok;
00463         }
00464       else 
00465         {
00466           if (__flen < static_cast<size_t>(__from_end - __from))
00467         {
00468           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00469           __to_next = reinterpret_cast<intern_type*>(__cto);
00470           __ret = codecvt_base::partial;
00471         }
00472           else
00473         __ret = codecvt_base::error;
00474         }
00475     }
00476       return __ret; 
00477     }
00478   
00479   template<typename _InternT, typename _ExternT>
00480     int 
00481     codecvt<_InternT, _ExternT, encoding_state>::
00482     do_encoding() const throw()
00483     {
00484       int __ret = 0;
00485       if (sizeof(_ExternT) <= sizeof(_InternT))
00486     __ret = sizeof(_InternT) / sizeof(_ExternT);
00487       return __ret; 
00488     }
00489   
00490   template<typename _InternT, typename _ExternT>
00491     bool 
00492     codecvt<_InternT, _ExternT, encoding_state>::
00493     do_always_noconv() const throw()
00494     { return false; }
00495   
00496   template<typename _InternT, typename _ExternT>
00497     int 
00498     codecvt<_InternT, _ExternT, encoding_state>::
00499     do_length(state_type&, const extern_type* __from, 
00500           const extern_type* __end, size_t __max) const
00501     { return std::min(__max, static_cast<size_t>(__end - __from)); }
00502 
00503   // _GLIBCXX_RESOLVE_LIB_DEFECTS
00504   // 74.  Garbled text for codecvt::do_max_length
00505   template<typename _InternT, typename _ExternT>
00506     int 
00507     codecvt<_InternT, _ExternT, encoding_state>::
00508     do_max_length() const throw()
00509     { return 1; }
00510 
00511 _GLIBCXX_END_NAMESPACE_VERSION
00512 } // namespace
00513 
00514 #endif