c++-gtk-utils
convert.h
Go to the documentation of this file.
00001 /* Copyright (C) 2005 to 2011 Chris Vine
00002 
00003 The library comprised in this file or of which this file is part is
00004 distributed by Chris Vine under the GNU Lesser General Public
00005 License as follows:
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public License
00009    as published by the Free Software Foundation; either version 2.1 of
00010    the License, or (at your option) any later version.
00011 
00012    This library is distributed in the hope that it will be useful, but
00013    WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License, version 2.1, for more details.
00016 
00017    You should have received a copy of the GNU Lesser General Public
00018    License, version 2.1, along with this library (see the file LGPL.TXT
00019    which came with this source code package in the src/utils sub-directory);
00020    if not, write to the Free Software Foundation, Inc.,
00021    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00022 
00023 However, it is not intended that the object code of a program whose
00024 source code instantiates a template from this file or uses macros or
00025 inline functions (of any length) should by reason only of that
00026 instantiation or use be subject to the restrictions of use in the GNU
00027 Lesser General Public License.  With that in mind, the words "and
00028 macros, inline functions and instantiations of templates (of any
00029 length)" shall be treated as substituted for the words "and small
00030 macros and small inline functions (ten lines or less in length)" in
00031 the fourth paragraph of section 5 of that licence.  This does not
00032 affect any other reason why object code may be subject to the
00033 restrictions in that licence (nor for the avoidance of doubt does it
00034 affect the application of section 2 of that licence to modifications
00035 of the source code in this file).
00036 
00037 */
00038 
00039 #ifndef CGU_CONVERT_H
00040 #define CGU_CONVERT_H
00041 
00042 #include <string>
00043 #include <iterator>
00044 #include <exception>
00045 
00046 #include <glib.h>
00047 
00048 #include <c++-gtk-utils/shared_handle.h>
00049 #include <c++-gtk-utils/cgu_config.h>
00050 
00051 namespace Cgu {
00052 
00053 /**
00054  * @file convert.h
00055  * @brief This file contains functions for converting between
00056  * character sets.
00057  *
00058  * \#include <c++-gtk-utils/convert.h>
00059  *
00060  * This file contains functions for converting between character sets.
00061  * If you want these functions to work, you will generally have needed
00062  * to have set the locale in the relevant program with either
00063  * <em>std::locale::global(std::locale(""))</em> (from the C++
00064  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C
00065  * standard library).
00066  */ 
00067 
00068 /**
00069  * @namespace Cgu::Utf8
00070  * @brief This namespace contains utilities relevant to the use of
00071  * UTF-8 in programs.
00072  *
00073  * \#include <c++-gtk-utils/convert.h> (for conversion and validation
00074  * functions)
00075  *
00076  * \#include <c++-gtk-utils/reassembler.h> (for Reassembler class)
00077  * @sa convert.h reassembler.h
00078  *
00079  * This namespace contains utilities relevant to the use of UTF-8 in
00080  * programs.  If you want these functions to work, you will generally
00081  * have needed to have set the locale in the relevant program with
00082  * either <em>std::locale::global(std::locale(""))</em> (from the C++
00083  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C standard
00084  * library).
00085  */ 
00086 
00087 namespace Utf8 {
00088 
00089 class ConversionError: public std::exception {
00090   GcharSharedHandle message;
00091 public:
00092   virtual const char* what() const throw() {return (const char*)message.get();}
00093   ConversionError(const char* msg):
00094     message(g_strdup_printf("Utf8::ConversionError: %s", msg)) {}
00095   ConversionError(GError* error):
00096     message(g_strdup_printf("Utf8::ConversionError: %s", error->message)) {}
00097   ~ConversionError() throw() {}
00098 };
00099 
00100 /**
00101  * Converts text from UTF-8 to the system's Unicode wide character
00102  * representation, which will be UTF-32/UCS-4 for systems with a wide
00103  * character size of 4 (almost all unix-like systems), and UTF-16 for
00104  * systems with a wide character size of 2.
00105  * @param input Text in valid UTF-8 format.
00106  * @return The input text converted to UTF-32 or UTF-16.
00107  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00108  * if conversion fails because the input string is not in valid UTF-8
00109  * format or the system does not support wide character Unicode
00110  * strings.
00111  * @exception std::bad_alloc This function might throw std::bad_alloc
00112  * if memory is exhausted and the system throws in that case.
00113  */
00114 std::wstring uniwide_from_utf8(const std::string& input);
00115 
00116 /**
00117  * Converts text from the system's Unicode wide character
00118  * representation, which will be UTF-32/UCS-4 for systems with a wide
00119  * character size of 4 (almost all unix-like systems) and UTF-16 for
00120  * systems with a wide character size of 2, to narrow character UTF-8
00121  * format.
00122  * @param input Text in valid UTF-32 or UTF-16 format.
00123  * @return The input text converted to UTF-8.
00124  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00125  * if conversion fails because the input string is not in valid
00126  * UTF-32/UCS-4 or UTF-16 format or the system does not support wide
00127  * character Unicode strings.
00128  * @exception std::bad_alloc This function might throw std::bad_alloc
00129  * if memory is exhausted and the system throws in that case.
00130  */
00131 std::string uniwide_to_utf8(const std::wstring& input);
00132 
00133 /**
00134  * Converts text from UTF-8 to UTF-32/UCS-4.
00135  * @param input Text in valid UTF-8 format.
00136  * @return The input text converted to UTF-32.
00137  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00138  * if conversion fails because the input string is not in valid UTF-8
00139  * format or the system does not support wide character Unicode
00140  * strings.
00141  * @exception std::bad_alloc This function might throw std::bad_alloc
00142  * if memory is exhausted and the system throws in that case.
00143  */
00144 std::u32string utf32_from_utf8(const std::string& input);
00145 
00146 /**
00147  * Converts text from UTF-32/UCS-4 to narrow character UTF-8 format.
00148  * @param input Text in valid UTF-32 format.
00149  * @return The input text converted to UTF-8.
00150  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00151  * if conversion fails because the input string is not in valid
00152  * UTF-32/UCS-4 format or the system does not support wide character
00153  * Unicode strings.
00154  * @exception std::bad_alloc This function might throw std::bad_alloc
00155  * if memory is exhausted and the system throws in that case.
00156  */
00157 std::string utf32_to_utf8(const std::u32string& input);
00158 
00159 /**
00160  * Converts text from UTF-8 to UTF-16.
00161  * @param input Text in valid UTF-8 format.
00162  * @return The input text converted to UTF-16.
00163  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00164  * if conversion fails because the input string is not in valid UTF-8
00165  * format or the system does not support wide character Unicode
00166  * strings.
00167  * @exception std::bad_alloc This function might throw std::bad_alloc
00168  * if memory is exhausted and the system throws in that case.
00169  */
00170 std::u16string utf16_from_utf8(const std::string& input);
00171 
00172 /**
00173  * Converts text from UTF-16 to narrow character UTF-8 format.
00174  * @param input Text in valid UTF-16 format.
00175  * @return The input text converted to UTF-8.
00176  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00177  * if conversion fails because the input string is not in valid UTF-16
00178  * format or the system does not support wide character Unicode
00179  * strings.
00180  * @exception std::bad_alloc This function might throw std::bad_alloc
00181  * if memory is exhausted and the system throws in that case.
00182  */
00183 std::string utf16_to_utf8(const std::u16string& input);
00184 
00185 /**
00186  * Converts text from UTF-8 to the system's wide character locale
00187  * representation.  For this function to work correctly, the system's
00188  * installed iconv() must support conversion to a generic wchar_t
00189  * target, but in POSIX whether it does so is implementation defined
00190  * (GNU's C library implementation does).  For most unix-like systems
00191  * the wide character representation will be Unicode (UCS-4/UTF-32 or
00192  * UTF-16), and where that is the case use the uniwide_from_utf8()
00193  * function instead, which will not rely on the generic target being
00194  * available.
00195  * @param input Text in valid UTF-8 format.
00196  * @return The input text converted to the system's wide character
00197  * locale representation.
00198  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00199  * if conversion fails because the input string is not in valid UTF-8
00200  * format, or cannot be converted to the system's wide character
00201  * locale representation (eg because the input characters cannot be
00202  * represented by that encoding, or the system's installed iconv()
00203  * function does not support conversion to a generic wchar_t target).
00204  * @exception std::bad_alloc This function might throw std::bad_alloc
00205  * if memory is exhausted and the system throws in that case.
00206  */
00207 
00208 std::wstring wide_from_utf8(const std::string& input);
00209 
00210 /**
00211  * Converts text from the system's wide character locale
00212  * representation to UTF-8.  For this function to work correctly, the
00213  * system's installed iconv() must support conversion from a generic
00214  * wchar_t target, but in POSIX whether it does so is implementation
00215  * defined (GNU's C library implementation does).  For most unix-like
00216  * systems the wide character representation will be Unicode
00217  * (UCS-4/UTF-32 or UTF-16), and where that is the case use the
00218  * uniwide_to_utf8() function instead, which will not rely on the
00219  * generic target being available.
00220  * @param input Text in a valid wide character locale format.
00221  * @return The input text converted to UTF-8.
00222  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00223  * if conversion fails because the input string is not in a valid wide
00224  * character locale format, or cannot be converted to UTF-8 (eg
00225  * because the system's installed iconv() function does not support
00226  * conversion from a generic wchar_t target).
00227  * @exception std::bad_alloc This function might throw std::bad_alloc
00228  * if memory is exhausted and the system throws in that case.
00229  */
00230 std::string wide_to_utf8(const std::wstring& input);
00231 
00232 /**
00233  * Converts text from UTF-8 to the system's filename encoding.
00234  * @param input Text in valid UTF-8 format.
00235  * @return The input text converted to filename encoding.
00236  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00237  * if conversion fails because the input string is not in valid UTF-8
00238  * format, or cannot be converted to filename encoding (eg because the
00239  * input characters cannot be represented by that encoding).
00240  * @exception std::bad_alloc This function might throw std::bad_alloc
00241  * if memory is exhausted and the system throws in that case.
00242  * @note glib takes the system's filename encoding from the
00243  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
00244  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
00245  * set, it will be assumed that the filename encoding is the same as
00246  * the locale encoding.  If G_FILENAME_ENCODING is set, then
00247  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
00248  * the value held by G_FILENAME_ENCODING.
00249  */
00250 std::string filename_from_utf8(const std::string& input);
00251 
00252 /**
00253  * Converts text from the system's filename encoding to UTF-8.
00254  * @param input Text in valid filename encoding.
00255  * @return The input text converted to UTF-8.
00256  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00257  * if conversion fails because the input string is not in valid
00258  * filename encoding.
00259  * @exception std::bad_alloc This function might throw std::bad_alloc
00260  * if memory is exhausted and the system throws in that case.
00261  * @note glib takes the system's filename encoding from the
00262  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
00263  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
00264  * set, it will be assumed that the filename encoding is the same as
00265  * the locale encoding.  If G_FILENAME_ENCODING is set, then
00266  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
00267  * the value held by G_FILENAME_ENCODING.
00268  */
00269 std::string filename_to_utf8(const std::string& input); 
00270 
00271 /**
00272  * Converts text from UTF-8 to the system's locale encoding.
00273  * @param input Text in valid UTF-8 format.
00274  * @return The input text converted to locale encoding.
00275  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00276  * if conversion fails because the input string is not in valid UTF-8
00277  * format, or cannot be converted to locale encoding (eg because the
00278  * input characters cannot be represented by that encoding).
00279  * @exception std::bad_alloc This function might throw std::bad_alloc
00280  * if memory is exhausted and the system throws in that case.
00281  */
00282 std::string locale_from_utf8(const std::string& input);
00283 
00284 /**
00285  * Converts text from the system's locale encoding to UTF-8.
00286  * @param input Text in valid locale encoding.
00287  * @return The input text converted to UTF-8.
00288  * @exception Cgu::Utf8::ConversionError This exception will be thrown
00289  * if conversion fails because the input string is not in valid locale
00290  * encoding.
00291  * @exception std::bad_alloc This function might throw std::bad_alloc
00292  * if memory is exhausted and the system throws in that case.
00293  */
00294 std::string locale_to_utf8(const std::string& input); 
00295 
00296 /**
00297  * Indicates whether the input text comprises valid UTF-8.
00298  * @param text The text to be tested.
00299  * @return true if the input text is in valid UTF-8 format, otherwise
00300  * false.
00301  * @exception std::bad_alloc This function might throw std::bad_alloc
00302  * if std::string::data() might throw when memory is exhausted.
00303  * @note \#include <c++-gtk-utils/convert.h> for this function.
00304  */
00305 inline bool validate(const std::string& text) {
00306   return g_utf8_validate(text.data(), text.size(), 0);
00307 }
00308 
00309 /************** Iterator class **************/
00310 
00311 /** 
00312  * @class Iterator convert.h c++-gtk-utils/convert.h
00313  * @brief A class which will iterate through a std::string object by
00314  * reference to unicode characters rather than by bytes.
00315  * @sa Cgu::Utf8::ReverseIterator
00316  *
00317  * The Cgu::Utf8::Iterator class does the same as
00318  * std::string::const_iterator, except that when iterating through a
00319  * std::string object using the ++ and -- postfix and prefix
00320  * operators, it iterates by increments of whole unicode code points
00321  * rather than by reference to bytes.  In addition, the dereferencing
00322  * operator returns the whole unicode code point (a UCS-4 gunichar
00323  * type) rather than a char type.
00324  *
00325  * Where, as in practically all unix-like systems, sizeof(wchar_t) ==
00326  * 4, then the gunichar return value of the dereferencing operator can
00327  * be converted by a simple static_cast to the wchar_t type.  So far
00328  * as displaying individual code points is concerned however, it
00329  * should be noted that because unicode allows combining characters, a
00330  * unicode code point may not contain the whole representation of a
00331  * character as displayed.  This effect can be dealt with for all
00332  * characters capable of representation by Level 1 unicode (ie by
00333  * precomposed characters) using g_utf8_normalize() before iterating.
00334  * There will still however be some non-European scripts, in
00335  * particular some Chinese/Japanese/Korean ideograms, where
00336  * description of the ideogram requires more than one code point to be
00337  * finally resolved.  For these, printing individual code points
00338  * sequentially one by one directly to a display (say with std::wcout)
00339  * may or may not have the desired result, depending on how the
00340  * display device (eg console) deals with that case.
00341  *
00342  * A Cgu::Utf8::Iterator only allows reading from and not writing to
00343  * the std::string object being iterated through.  This is because in
00344  * UTF-8 the representation of any one unicode code point will require
00345  * between 1 and 6 bytes: accordingly modifying a UTF-8 string may
00346  * change its length (in bytes) even though the number of unicode
00347  * characters stays the same.  For the same reason, this iterator is a
00348  * bidirectional iterator but not a random access iterator.
00349  *
00350  * The std::string object concerned should contain valid UTF-8 text.
00351  * If necessary, this should be checked with Cgu::Utf8::validate()
00352  * first.  In addition, before use, the Cgu::Utf8::Iterator object
00353  * must be initialized by a std::string::const_iterator or
00354  * std::string::iterator object pointing to the first byte of a valid
00355  * UTF-8 character in the string (or by another Cgu::Utf8::Iterator
00356  * object or by a Cgu::Utf8::ReverseIterator object), and iteration
00357  * will begin at the point of initialization: therefore, assuming the
00358  * string contains valid UTF-8 text, passing std::string::begin() to a
00359  * Cgu::Utf8::Iterator object will always be safe.  Initialization by
00360  * std::string::end() is also valid if the first iteration is
00361  * backwards with the -- operator.  This initialization can be done
00362  * either in the constructor or by assignment.  Comparison operators
00363  * ==, !=, <, <=, > and >= are provided enabling the position of
00364  * Cgu::Utf8::Iterator objects to be compared with each other or with
00365  * std::string::const_iterator and std::string::iterator objects.
00366  *
00367  * This is an example:
00368  * @code
00369  * using namespace Cgu;
00370  *
00371  * std::wstring wide_str(L"ßøǿón");
00372  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
00373  *
00374  * Utf8::Iterator iter;
00375  * for (iter = narrow_str.begin();
00376  *      iter != narrow_str.end();
00377  *      ++iter)
00378  *   std::wcout << static_cast<wchar_t>(*iter) << std::endl;
00379  * @endcode
00380  *
00381  * This class assumes in using g_utf8_next_char(), g_utf8_prev_char()
00382  * and g_utf8_get_char() that the std::string object keeps its
00383  * internal string in contiguous storage.  This is required by the
00384  * C++11 standard, but not formally by C++98/C++03.  However, known
00385  * implementations of std::string in fact store the string
00386  * contiguously.
00387  */ 
00388 
00389 class ReverseIterator;
00390 
00391 class Iterator {
00392 public:
00393   typedef gunichar value_type;
00394   typedef gunichar reference;  // read only
00395   typedef void pointer;        // read only
00396   typedef std::string::difference_type difference_type;
00397   typedef std::bidirectional_iterator_tag iterator_category;
00398 
00399 private:  
00400   std::string::const_iterator pos;
00401 public:
00402 
00403 /**
00404  * Increments the iterator so that it moves from the beginning of the
00405  * current UTF-8 character to the beginning of the next UTF-8
00406  * character.  It is a prefix operator.  It will not throw.
00407  * @return A reference to the iterator in its new position.
00408  */
00409   Iterator& operator++();
00410 
00411 /**
00412  * Increments the iterator so that it moves from the beginning of the
00413  * current UTF-8 character to the beginning of the next UTF-8
00414  * character.  It is a postfix operator.  It will not throw provided
00415  * that copy constructing and assigning a std::string::const_iterator
00416  * object does not throw, as it will not in any sane implementation.
00417  * @return A copy of the iterator in its former position.
00418  */
00419   Iterator operator++(int);
00420 
00421 /**
00422  * Decrements the iterator so that it moves from the beginning of the
00423  * current UTF-8 character to the beginning of the previous UTF-8
00424  * character.  It is a prefix operator.  It will not throw.
00425  * @return A reference to the iterator in its new position.
00426  */
00427   Iterator& operator--();
00428 
00429 /**
00430  * Decrements the iterator so that it moves from the beginning of the
00431  * current UTF-8 character to the beginning of the previous UTF-8
00432  * character.  It is a postfix operator.  It will not throw provided
00433  * that copy constructing and assigning a std::string::const_iterator
00434  * object does not throw, as it will not in any sane implementation.
00435  * @return A copy of the iterator in its former position.
00436  */
00437   Iterator operator--(int);
00438 
00439 /**
00440  * Assigns a std::string::const_iterator object to this object.  It
00441  * should point to the beginning of a UTF-8 character (eg
00442  * std::string::begin()) or to std::string::end().  It will not throw
00443  * provided assigning a std::string::const_iterator object does not
00444  * throw, as it will not in any sane implementation.
00445  * @param iter The std::string::const_iterator.
00446  * @return A reference to this Cgu::Utf8::Iterator object after
00447  * assignment.
00448  */
00449   Iterator& operator=(const std::string::const_iterator& iter) {pos = iter; return *this;}
00450 
00451 /**
00452  * Assigns a std::string::iterator object to this object.  It should
00453  * point to the beginning of a UTF-8 character (eg
00454  * std::string::begin()) or to std::string::end().  It will not throw
00455  * provided assigning a std::string::const_iterator object does not
00456  * throw, as it will not in any sane implementation.
00457  * @param iter The std::string::iterator.
00458  * @return A reference to this Cgu::Utf8::Iterator object after
00459  * assignment.
00460  */
00461   Iterator& operator=(const std::string::iterator& iter) {pos = iter; return *this;}
00462 
00463 /**
00464  * Assigns a Cgu::Utf8::Iterator object to this object.  It will not
00465  * throw provided assigning a std::string::const_iterator object does
00466  * not throw, as it will not in any sane implementation.
00467  * @param iter The iterator.
00468  * @return A reference to this Cgu::Utf8::Iterator object after
00469  * assignment.
00470  */
00471   Iterator& operator=(const Iterator& iter) {pos = iter.pos; return *this;}
00472 
00473 /**
00474  * Assigns a Cgu::Utf8::ReverseIterator object to this object, so that
00475  * this iterator adopts the same physical position (but the logical
00476  * position will be offset to the following UTF-8 character).  It will
00477  * not throw provided assigning a std::string::const_iterator object
00478  * does not throw, as it will not in any sane implementation.
00479  * @param iter The iterator.
00480  * @return A reference to this Cgu::Utf8::Iterator object after
00481  * assignment.
00482  */
00483   Iterator& operator=(const ReverseIterator& iter);
00484 
00485 /**
00486  * The dereference operator.
00487  * @return A 32-bit gunichar object containing the whole unicode code
00488  * point which is currently represented by this iterator.  It will not
00489  * throw.
00490  */
00491   Iterator::value_type operator*() const {return g_utf8_get_char(&(*pos));}
00492 
00493 /**
00494  * @return The current underlying std::string::const_iterator kept by
00495  * this iterator.  Once this iterator has been correctly initialized,
00496  * that will point to the beginning of the UTF-8 character currently
00497  * represented by this iterator or to std::string::end().  It will not
00498  * throw provided assigning a std::string::const_iterator object does
00499  * not throw, as it will not in any sane implementation.
00500  */
00501   std::string::const_iterator base() const {return pos;}
00502 
00503 /**
00504  * Constructs this iterator and initialises it with a
00505  * std::string::const_iterator object.  It should point to the
00506  * beginning of a UTF-8 character (eg std::string::begin()) or to
00507  * std::string::end().  It will not throw provided that copy
00508  * constructing a std::string::const_iterator object does not throw,
00509  * as it will not in any sane implementation.  This is a type
00510  * conversion constructor (it is not marked explicit) so that it can
00511  * be used with Cgu::Utf8::Iterator comparison operators to compare
00512  * the position of Cgu::Utf8::Iterator with
00513  * std::string::const_iterator objects.
00514  * @param iter The std::string::const_iterator.
00515  */
00516   Iterator(const std::string::const_iterator& iter): pos(iter) {}
00517 
00518 /**
00519  * Constructs this iterator and initialises it with a
00520  * std::string::iterator object.  It should point to the beginning of
00521  * a UTF-8 character (eg std::string::begin()) or to
00522  * std::string::end().  It will not throw provided that copy
00523  * constructing a std::string::const_iterator object does not throw,
00524  * as it will not in any sane implementation.  This is a type
00525  * conversion constructor (it is not marked explicit) so that it can
00526  * be used with Cgu::Utf8::Iterator comparison operators to compare
00527  * the position of Cgu::Utf8::Iterator with std::string::iterator
00528  * objects.
00529  * @param iter The std::string::iterator.
00530  */
00531   Iterator(const std::string::iterator& iter): pos(iter) {}
00532 
00533 /**
00534  * Constructs this iterator and initialises it with another
00535  * Cgu::Utf8::Iterator object.  It will not throw provided that copy
00536  * constructing a std::string::const_iterator object does not throw,
00537  * as it will not in any sane implementation.
00538  * @param iter The iterator.
00539  */
00540   Iterator(const Iterator& iter): pos(iter.pos) {}
00541 
00542 /**
00543  * Constructs this iterator and initialises it with a
00544  * Cgu::Utf8::ReverseIterator object, so that this iterator adopts the
00545  * same physical position (but the logical position will be offset to
00546  * the following UTF-8 character).  It will not throw provided that
00547  * copy constructing a std::string::const_iterator object does not
00548  * throw, as it will not in any sane implementation.
00549  * @param iter The iterator.
00550  */
00551   explicit Iterator(const ReverseIterator& iter);
00552 
00553 /**
00554  * The default constructor will not throw.
00555  */
00556   Iterator() {}
00557 
00558 /* Only has effect if --with-glib-memory-slices-compat or
00559  * --with-glib-memory-slices-no-compat option picked */
00560   CGU_GLIB_MEMORY_SLICES_FUNCS
00561 };
00562 
00563 inline Iterator& Iterator::operator++() {
00564   const std::string::value_type* tmp = &(*pos);
00565   // using g_utf8_next_char is safe even when pos points to the last character -
00566   // that macro calls up the g_utf8_skip look-up table rather than attempting to
00567   // read the following character, so we can safely iterate to std::string::end()
00568   pos += g_utf8_next_char(tmp) - tmp;
00569   return *this;
00570 }
00571 
00572 inline Iterator Iterator::operator++(int) {
00573   Iterator tmp{*this};
00574   ++(*this);
00575   return tmp;
00576 }
00577  
00578 inline Iterator& Iterator::operator--() {
00579   // we might be iterating from std::string::end() so we need
00580   // to decrement before dereferencing and then increment again
00581   const std::string::value_type* tmp = &(*(pos-1));
00582   ++tmp;
00583   pos -= tmp - g_utf8_prev_char(tmp);
00584   return *this;
00585 }
00586 
00587 inline Iterator Iterator::operator--(int) {
00588   Iterator tmp{*this};
00589   --(*this);
00590   return tmp;
00591 }
00592 
00593 /**
00594  * The comparison operators will not throw provided assigning a
00595  * std::string::const_iterator object does not throw, as it will not
00596  * in any sane implementation.
00597  */
00598 inline bool operator==(const Iterator& iter1, const Iterator& iter2) {
00599   return (iter1.base() == iter2.base());
00600 }
00601  
00602 /**
00603  * The comparison operators will not throw provided assigning a
00604  * std::string::const_iterator object does not throw, as it will not
00605  * in any sane implementation.
00606  */
00607 inline bool operator!=(const Iterator& iter1, const Iterator& iter2) {
00608   return (iter1.base() != iter2.base());
00609 }
00610  
00611 /**
00612  * The comparison operators will not throw provided assigning a
00613  * std::string::const_iterator object does not throw, as it will not
00614  * in any sane implementation.
00615  */
00616 inline bool operator<(const Iterator& iter1, const Iterator& iter2) {
00617   return (iter1.base() < iter2.base());
00618 }
00619  
00620 /**
00621  * The comparison operators will not throw provided assigning a
00622  * std::string::const_iterator object does not throw, as it will not
00623  * in any sane implementation.
00624  */
00625 inline bool operator<=(const Iterator& iter1, const Iterator& iter2) {
00626   return (iter1.base() <= iter2.base());
00627 }
00628  
00629 /**
00630  * The comparison operators will not throw provided assigning a
00631  * std::string::const_iterator object does not throw, as it will not
00632  * in any sane implementation.
00633  */
00634 inline bool operator>(const Iterator& iter1, const Iterator& iter2) {
00635   return (iter1.base() > iter2.base());
00636 }
00637  
00638 /**
00639  * The comparison operators will not throw provided assigning a
00640  * std::string::const_iterator object does not throw, as it will not
00641  * in any sane implementation.
00642  */
00643 inline bool operator>=(const Iterator& iter1, const Iterator& iter2) {
00644   return (iter1.base() >= iter2.base());
00645 }
00646  
00647 /************** ReverseIterator class **************/
00648 
00649 /**
00650  * @class ReverseIterator convert.h c++-gtk-utils/convert.h
00651  * @brief A class which will iterate in reverse through a std::string
00652  * object by reference to unicode characters rather than by bytes.
00653  * @sa Cgu::Utf8::Iterator
00654  *
00655  * The Cgu::Utf8::ReverseIterator class does the same as
00656  * std::string::const_reverse_iterator, except that when iterating
00657  * through a std::string object using the ++ and -- postfix and prefix
00658  * operators, it iterates by increments of whole unicode code points
00659  * rather than by reference to bytes.  In addition, the dereferencing
00660  * operator returns the whole unicode code point (a UCS-4 gunichar
00661  * type) rather than a char type.
00662  *
00663  * Before use, the Cgu::Utf8::ReverseIterator object must be
00664  * initialized by a std::string::const_reverse_iterator or
00665  * std::string::reverse_iterator object representing the first byte of
00666  * a valid UTF-8 character in the string (or by another
00667  * Cgu::Utf8::ReverseIterator object or by a Cgu::Utf8::Iterator
00668  * object): so assuming the string contains valid UTF-8 text, it is
00669  * always valid to initialise a Cgu::Utf8::ReverseIterator with
00670  * std::string::rbegin().  Initialization by std::string::rend() is
00671  * also valid if the first iteration is backwards with the --
00672  * operator.  This initialization can be done either in the
00673  * constructor or by assignment.  Comparison operators ==, !=, <, <=,
00674  * > and >= are provided enabling the position of
00675  * Cgu::Utf8::ReverseIterator objects to be compared with each other
00676  * or with std::string::const_reverse_iterator and
00677  * std::string::reverse_iterator objects.
00678  *
00679  * This is an example:
00680  * @code
00681  * using namespace Cgu;
00682  *
00683  * std::wstring wide_str(L"ßøǿón");
00684  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
00685  *
00686  * Utf8::ReverseIterator iter;
00687  * for (iter = narrow_str.rbegin();
00688  *      iter != narrow_str.rend();
00689  *      ++iter)
00690  *   std::wcout << static_cast<wchar_t>(*iter) << std::endl;
00691  * @endcode
00692  * 
00693  * For further information on its use, see the Utf8::Iterator
00694  * documentation.
00695  */
00696 
00697 class ReverseIterator {
00698 public:
00699   typedef gunichar value_type;  // operator*() yields a whole code point
00700   typedef gunichar reference;  // read only: operator*() returns by value
00701   typedef void pointer;        // read only: no operator->() is declared
00702   typedef std::string::difference_type difference_type;
00703   typedef std::bidirectional_iterator_tag iterator_category;
00704 
00705 private:  
00706   std::string::const_iterator pos;  // physical position: the value returned by base()
00707   // cache lets operator++() reuse a position found by operator*() or operator--()
00708   mutable std::string::const_iterator cache;  // mutable: written by the const operator*()
00709 public:
00710 
00711 /**
00712  * Increments the iterator in the reverse direction so that it moves
00713  * from the beginning of the current UTF-8 character to the beginning
00714  * of the previous UTF-8 character in the std::string object
00715  * concerned.  It is a prefix operator.  It will not throw provided
00716  * assigning a std::string::const_iterator object does not throw, as
00717  * it will not in any sane implementation.
00718  * @return A reference to the iterator in its new position
00719  */
00720   ReverseIterator& operator++();
00721 
00722 /**
00723  * Increments the iterator in the reverse direction so that it moves
00724  * from the beginning of the current UTF-8 character to the beginning
00725  * of the previous UTF-8 character in the std::string object
00726  * concerned.  It is a postfix operator.  It will not throw provided
00727  * that copy constructing and assigning a std::string::const_iterator
00728  * object does not throw, as it will not in any sane implementation.
00729  * @return A copy of the iterator in its former position
00730  */
00731   ReverseIterator operator++(int);
00732 
00733 /**
00734  * Decrements the iterator in the reverse direction so that it moves
00735  * from the beginning of the current UTF-8 character to the beginning
00736  * of the following UTF-8 character in the std::string object
00737  * concerned.  It is a prefix operator.  It will not throw provided
00738  * assigning a std::string::const_iterator object does not throw, as
00739  * it will not in any sane implementation.
00740  * @return A reference to the iterator in its new position
00741  */
00742   ReverseIterator& operator--();
00743 
00744 /**
00745  * Decrements the iterator in the reverse direction so that it moves
00746  * from the beginning of the current UTF-8 character to the beginning
00747  * of the following UTF-8 character in the std::string object
00748  * concerned.  It is a postfix operator.  It will not throw provided
00749  * that copy constructing and assigning a std::string::const_iterator
00750  * object does not throw, as it will not in any sane implementation.
00751  * @return A copy of the iterator in its former position
00752  */
00753   ReverseIterator operator--(int);
00754 
00755 /**
00756  * Assigns a std::string::const_reverse_iterator object to this
00757  * object.  It should represent the beginning of a UTF-8 character (eg
00758  * std::string::rbegin()) or comprise std::string::rend().  It will
00759  * not throw provided assigning a std::string::const_iterator object
00760  * does not throw, as it will not in any sane implementation.
00761  * @param iter The const_reverse_iterator.
00762  * @return A reference to this Cgu::Utf8::ReverseIterator object after
00763  * assignment.
00764  */
00765   ReverseIterator& operator=(const std::string::const_reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;}
00766 
00767 /**
00768  * Assigns a std::string::reverse_iterator object to this object.  It
00769  * should represent the beginning of a UTF-8 character (eg
00770  * std::string::rbegin()) or comprise std::string::rend().  It will
00771  * not throw provided assigning a std::string::const_iterator object
00772  * does not throw, as it will not in any sane implementation.
00773  * @param iter The reverse_iterator.
00774  * @return A reference to this Cgu::Utf8::ReverseIterator object after
00775  * assignment.
00776  */
00777   ReverseIterator& operator=(const std::string::reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;}
00778 
00779 /**
00780  * Assigns a Cgu::Utf8::ReverseIterator object to this object, taking
00781  * both its position and its cache.  It will not throw provided
00782  * assigning a std::string::const_iterator object does not throw.
00783  * @param iter The iterator.
00784  * @return A reference to this Cgu::Utf8::ReverseIterator object after
00785  * assignment.
00786  */
00787   ReverseIterator& operator=(const ReverseIterator& iter) {pos = iter.pos; cache = iter.cache; return *this;}
00788 
00789 /**
00790  * Assigns a Cgu::Utf8::Iterator object to this object, so that this
00791  * iterator adopts the same physical position (but the logical
00792  * position will be offset to the previous UTF-8 character in the
00793  * std::string object concerned).  It will not throw provided
00794  * assigning a std::string::const_iterator object does not throw, as
00795  * it will not in any sane implementation.
00796  * @param iter The iterator.
00797  * @return A reference to this Cgu::Utf8::ReverseIterator object after
00798  * assignment.
00799  */
00800   ReverseIterator& operator=(const Iterator& iter) {pos = iter.base(); cache = pos; return *this;}
00801 
00802 /**
00803  * The dereference operator.  It also refreshes the iteration cache.
00804  * @return A 32-bit gunichar object containing the whole unicode code
00805  * point which is currently represented by this iterator.  It will not
00806  * throw.
00807  */
00808   ReverseIterator::value_type operator*() const;
00809 
00810 /**
00811  * @return The current underlying std::string::const_iterator kept by
00812  * this iterator.  Once this iterator has been correctly initialized,
00813  * that will point to the beginning of the UTF-8 character after the
00814  * one currently represented by this iterator or to
00815  * std::string::end().  It will not throw provided copy constructing
00816  * a std::string::const_iterator object does not throw, as it will not
00817  * in any sane implementation.
00818  */
00819   std::string::const_iterator base() const {return pos;}
00820 
00821 /**
00822  * Constructs this iterator and initialises it with a
00823  * std::string::const_reverse_iterator object.  It should represent
00824  * the beginning of a UTF-8 character (eg std::string::rbegin()) or
00825  * comprise std::string::rend().  It will not throw provided that copy
00826  * constructing a std::string::const_iterator object does not throw,
00827  * as it will not in any sane implementation.  This is a type
00828  * conversion constructor (it is not marked explicit) so that it can
00829  * be used with Cgu::Utf8::ReverseIterator comparison operators to
00830  * compare the position of Cgu::Utf8::ReverseIterator with
00831  * std::string::const_reverse_iterator objects.
00832  * @param iter The const_reverse_iterator.
00833  */
00834   ReverseIterator(const std::string::const_reverse_iterator& iter): pos(iter.base()), cache(pos) {}
00835 
00836 /**
00837  * Constructs this iterator and initialises it with a
00838  * std::string::reverse_iterator object.  It should represent the
00839  * beginning of a UTF-8 character (eg std::string::rbegin()) or
00840  * comprise std::string::rend().  It will not throw provided that copy
00841  * constructing a std::string::const_iterator object does not throw,
00842  * as it will not in any sane implementation.  This is a type
00843  * conversion constructor (it is not marked explicit) so that it can
00844  * be used with Cgu::Utf8::ReverseIterator comparison operators to
00845  * compare the position of Cgu::Utf8::ReverseIterator with
00846  * std::string::reverse_iterator objects.
00847  * @param iter The reverse_iterator.
00848  */
00849   ReverseIterator(const std::string::reverse_iterator& iter): pos(iter.base()), cache(pos) {}
00850 
00851 /**
00852  * Constructs this iterator as a copy of another
00853  * Cgu::Utf8::ReverseIterator object, cache included.  It will not
00854  * throw provided that copy constructing a
00855  * std::string::const_iterator object does not throw.
00856  * @param iter The iterator.
00857  */
00858   ReverseIterator(const ReverseIterator& iter): pos(iter.pos), cache(iter.cache) {}
00859 
00860 /**
00861  * Constructs this iterator and initialises it with a
00862  * Cgu::Utf8::Iterator object, so that this iterator adopts the same
00863  * physical position (but the logical position will be offset to the
00864  * previous UTF-8 character in the std::string object concerned).  It
00865  * will not throw provided that copy constructing a
00866  * std::string::const_iterator object does not throw, as it will not
00867  * in any sane implementation.  This constructor is marked explicit.
00868  * @param iter The iterator.
00869  */
00870   explicit ReverseIterator(const Iterator& iter): pos(iter.base()), cache(pos) {}
00871 
00872 /**
00873  * The default constructor will not throw.  Assign a position before use.
00874  */
00875   ReverseIterator() {}
00876 
00877 /* Only has effect if --with-glib-memory-slices-compat or
00878  * --with-glib-memory-slices-no-compat option picked */
00879   CGU_GLIB_MEMORY_SLICES_FUNCS
00880 };
00881 
00882 inline ReverseIterator& ReverseIterator::operator++() {
00883   // a cache lying below pos was left by operator*() or operator--() at
00884   // the start of the character now represented: jump straight to it
00885   if (pos > cache) {
00886     pos = cache;
00887     return *this;
00888   }
00889   // pos may be std::string::end()/std::string::rbegin(), so dereference
00890   // the byte before it and then step the raw pointer forward again
00891   const std::string::value_type* const here = &(*(pos - 1)) + 1;
00892   pos -= here - g_utf8_prev_char(here);
00893   return *this;
00894 }
00895 
00896 inline ReverseIterator ReverseIterator::operator++(int) {
00897   ReverseIterator former(*this);  // copy taken before moving
00898   operator++();
00899   return former;
00900 }
00901  
00902 inline ReverseIterator& ReverseIterator::operator--() {
00903   // pos currently marks the first byte of the character being moved onto
00904   const std::string::value_type* const lead = &(*pos);
00905   cache = pos;  // so a later operator++() can return here directly
00906   // g_utf8_next_char only consults the g_utf8_skip look-up table, never the
00907   // following character, so stepping to std::string::rbegin() is safe
00908   pos += g_utf8_next_char(lead) - lead;
00909   return *this;
00910 }
00911 
00912 inline ReverseIterator ReverseIterator::operator--(int) {
00913   ReverseIterator former(*this);  // copy taken before moving
00914   operator--();
00915   return former;
00916 }
00917 
00918 inline ReverseIterator::value_type ReverseIterator::operator*() const {
00919   Iterator fwd(*this);  // forward iterator at the same physical position
00920   --fwd;                // step back onto the character represented
00921   cache = fwd.base();   // kept so that operator++() can jump straight here
00922   return g_utf8_get_char(&(*cache));
00923 }
00924 
00925 /**
00926  * Tests whether two reverse iterators are at the same position, that
00927  * is, whether their base() values are equal.  It will not throw
00928  * provided std::string::const_iterator operations do not throw.
00929  */
00930 inline bool operator==(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00931   return !(iter1.base() != iter2.base());
00932 }
00933  
00934 /**
00935  * Tests whether two reverse iterators are at different positions, that
00936  * is, whether their base() values differ.  It will not throw
00937  * provided std::string::const_iterator operations do not throw.
00938  */
00939 inline bool operator!=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00940   return !(iter1.base() == iter2.base());
00941 }
00942  
00943 /**
00944  * Tests whether iter1 comes before iter2 in the direction of reverse
00945  * iteration: an iterator at std::string::rbegin() is therefore less
00946  * than one at std::string::rend(), and the underlying base()
00947  * positions are compared the other way round.  It will not throw
00948  * provided the operations of std::string::const_iterator do not
00949  * throw, as they will not in any sane implementation.
00950  */
00951 inline bool operator<(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00952   return (iter2.base() < iter1.base());
00953 }
00954  
00955 /**
00956  * Tests whether iter1 is at or before iter2 in the direction of
00957  * reverse iteration: an iterator at std::string::rbegin() is less
00958  * than one at std::string::rend(), so the underlying base()
00959  * positions are compared the other way round.  It will not throw
00960  * provided the operations of std::string::const_iterator do not
00961  * throw, as they will not in any sane implementation.
00962  */
00963 inline bool operator<=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00964   return (iter2.base() <= iter1.base());
00965 }
00966  
00967 /**
00968  * Tests whether iter1 comes after iter2 in the direction of reverse
00969  * iteration: an iterator at std::string::rend() is therefore greater
00970  * than one at std::string::rbegin(), and the underlying base()
00971  * positions are compared the other way round.  It will not throw
00972  * provided the operations of std::string::const_iterator do not
00973  * throw, as they will not in any sane implementation.
00974  */
00975 inline bool operator>(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00976   return (iter2.base() > iter1.base());
00977 }
00978 
00979 /**
00980  * Tests whether iter1 is at or after iter2 in the direction of
00981  * reverse iteration: an iterator at std::string::rend() is greater
00982  * than one at std::string::rbegin(), so the underlying base()
00983  * positions are compared the other way round.  It will not throw
00984  * provided the operations of std::string::const_iterator do not
00985  * throw, as they will not in any sane implementation.
00986  */
00987 inline bool operator>=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
00988   return (iter2.base() >= iter1.base());
00989 }
00990  
00991 /*** Iterator class methods which require ReverseIterator as a complete type ***/
00992 
00993 inline Iterator& Iterator::operator=(const ReverseIterator& iter) {  // defined here: needs ReverseIterator complete
00994   pos = iter.base();  // adopt the reverse iterator's physical position
00995   return *this;       // allow chained assignment
00996 }
00997 
00998 inline Iterator::Iterator(const ReverseIterator& iter): pos(iter.base()) {}  // starts at iter's physical position
00999 
01000 } // namespace Utf8
01001 
01002 } // namespace Cgu
01003 
01004 #endif