c++-gtk-utils

reassembler.h

Go to the documentation of this file.
00001 /* Copyright (C) 2005 to 2010 Chris Vine
00002 
00003 The library comprised in this file or of which this file is part is
00004 distributed by Chris Vine under the GNU Lesser General Public
00005 License as follows:
00006 
00007    This library is free software; you can redistribute it and/or
00008    modify it under the terms of the GNU Lesser General Public License
00009    as published by the Free Software Foundation; either version 2.1 of
00010    the License, or (at your option) any later version.
00011 
00012    This library is distributed in the hope that it will be useful, but
00013    WITHOUT ANY WARRANTY; without even the implied warranty of
00014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015    Lesser General Public License, version 2.1, for more details.
00016 
00017    You should have received a copy of the GNU Lesser General Public
00018    License, version 2.1, along with this library (see the file LGPL.TXT
00019    which came with this source code package in the c++-gtk-utils
00020    sub-directory); if not, write to the Free Software Foundation, Inc.,
00021    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
00022 
00023 */
00024 
00025 #ifndef CGU_REASSEMBLER_H
00026 #define CGU_REASSEMBLER_H
00027 
00028 #include <c++-gtk-utils/shared_handle.h>
00029 #include <c++-gtk-utils/cgu_config.h>
00030 
00031 namespace Cgu {
00032 
00033 namespace Utf8 {
00034 
00035 
00036 /**
00037  * @class Reassembler reassembler.h c++-gtk-utils/reassembler.h
00038  * @brief A class for reassembling UTF-8 strings sent over pipes and
00039  * sockets so they form complete valid UTF-8 characters.
00040  *
00041  * Utf8::Reassembler is a functor class which takes in a partially
00042  * formed UTF-8 string and returns a nul-terminated string comprising
00043  * such of the input string (after inserting, at the beginning, any
00044  * partially formed UTF-8 character which was at the end of the input
00045  * string passed in previous calls to the functor) as forms complete
00046  * UTF-8 characters (storing any partial character at the end for the
00047  * next call to the functor).  If the input string contains invalid
00048  * UTF-8 after adding any stored previous part character (apart from
00049  * any partially formed character at the end of the input string) then
00050  * operator() will return a null Cgu::SharedHandle<char*> object (that
00051  * is, Cgu::SharedHandle<char*>::get() will return 0).  Such input
00052  * will not be treated as invalid if it consists only of a single
00053  * partly formed UTF-8 character which could be valid if further bytes
00054  * were received and added to it.  In that case the returned
00055  * SharedHandle<char*> object will contain an allocated string of zero
00056  * length, comprising only a terminating \0 character, rather than a
00057  * NULL pointer.
00058  *
00059  * This enables UTF-8 strings to be sent over pipes, sockets, etc and
00060  * displayed in a GTK+ object at the receiving end.
00061  *
00062  * Note that for efficiency reasons the memory held in the returned
00063  * Cgu::SharedHandle<char*> object may be greater than the length of
00064  * the nul-terminated string that is contained in that memory: just
00065  * let the Cgu::SharedHandle<char*> object manage the memory, and use
00066  * the contents like any other nul-terminated string.
00067  *
00068  * This class is not needed if std::getline(), with its default '\\n'
00069  * delimiter, is used to read UTF-8 characters using, say,
00070  * Cgu::fdistream, because a whole '\\n' delimited line of UTF-8
00071  * characters will always be complete.
00072  *
00073  * This is an example of its use, reading from a pipe until it is
00074  * closed by the writer and putting the received text in a
00075  * GtkTextBuffer object:
00076  * @code
00077  *   using namespace Cgu;
00078  *
00079  *   GtkTextIter end;
00080  *   GtkTextBuffer* text_buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_view));
00081  *   gtk_text_buffer_get_end_iter(text_buffer, &end);
00082  *
00083  *   Utf8::Reassembler reassembler;
00084  *   const int BSIZE = 1024;
00085  *   char read_buffer[BSIZE];
00086  *   ssize_t res;
00087  *   do {
00088  *     res = ::read(fd, read_buffer, BSIZE);
00089  *     if (res > 0) {
00090  *       SharedHandle<char*> utf8(reassembler(read_buffer, res));
00091  *       if (utf8.get()) {
00092  *         gtk_text_buffer_insert(text_buffer, &end,
00093  *                                utf8.get(), std::strlen(utf8));
00094  *       }
00095  *       else std::cerr << "Invalid utf8 text sent over pipe\n";
00096  *     }
00097  *   } while (res && (res != -1 || errno == EINTR));
00098  * @endcode
00099  *
00100  * This class maintains an array as a data member, containing partly
00101  * formed characters from previous calls to operator(), and should not
00102  * be copied.  There should be no reason to do so, but unfortunately
00103  * enforcing this by explicitly precluding copy construction and copy
00104  * assignment was overlooked when this class was first provided.  At
00105  * the next API break, the copy constructor will be explicitly deleted
00106  * and moving only allowed.  Where a Reassembler object is to be
00107  * moved, use std::move and the code will be safe against this change
00108  * in the future.
00109  */
00110 
00111 class Reassembler {
00112   size_t stored;
00113   const static size_t buff_size = 6;
00114   char buffer[buff_size];
00115   char* join_buffer(const char*, size_t);
00116 public:
00117 /**
00118  * Takes a byte array of wholly or partly formed UTF-8 characters to
00119  * be converted (after taking account of previous calls to the method)
00120  * to a valid string of wholly formed characters.
00121  * @param input The input array.
00122  * @param size The number of bytes in the input (not the number of
00123  * UTF-8 characters).
00124  * @return A Cgu::SharedHandle<char*> object holding a nul-terminated
00125  * string comprising such of the input (after inserting, at the
00126  * beginning, any partially formed UTF-8 character which was at the
00127  * end of the input passed in previous calls to the functor) as forms
00128  * complete UTF-8 characters (storing any partial character at the end
00129  * for the next call to the functor).  If the input is invalid after
00130  * such recombination, then a null Cgu::SharedHandle<char*> object is
00131  * returned (that is, Cgu::SharedHandle<char*>::get() will return 0).
00132  * Such input will not be treated as invalid if it consists only of a
00133  * single partly formed UTF-8 character which could be valid if
00134  * further bytes were received and added to it.  In that case the
00135  * returned Cgu::SharedHandle<char*> object will contain an allocated
00136  * string of zero length, comprising only a terminating \0 character,
00137  * rather than a NULL pointer.
00138  * @exception std::bad_alloc The method might throw std::bad_alloc if
00139  * memory is exhausted and the system throws in that case.  It will
00140  * not throw any other exception.
00141  */
00142   Cgu::SharedHandle<char*> operator()(const char* input, size_t size);
00143 
00144 /**
00145  * Gets the number of bytes of a partially formed UTF-8 character
00146  * stored for the next call to operator()().  It will not throw.
00147  * @return The number of bytes.
00148  */
00149   size_t get_stored() const {return stored;}
00150 
00151 /**
00152  * Resets the Reassembler, by discarding any partially formed UTF-8
00153  * character from previous calls to operator()().  It will not throw.
00154  */
00155   void reset() {stored = 0;}
00156 
00157 /**
00158  * The constructor will not throw.
00159  */
00160   Reassembler(): stored(0) {}
00161 
00162   // TODO: At the next API break, provide a default and move
00163   // constructor and move assignment operator, and omit a copy
00164   // constructor and copy assignment operator: this class maintains an
00165   // array as a data member
00166 
00167 /* Only has effect if --with-glib-memory-slices-compat or
00168  * --with-glib-memory-slices-no-compat option picked */
00169   CGU_GLIB_MEMORY_SLICES_FUNCS
00170 };
00171 
00172 } // namespace Utf8
00173 
00174 } // namespace Cgu
00175 
00176 #endif