c++-gtk-utils
|
00001 /* Copyright (C) 2005 to 2010 Chris Vine 00002 00003 The library comprised in this file or of which this file is part is 00004 distributed by Chris Vine under the GNU Lesser General Public 00005 License as follows: 00006 00007 This library is free software; you can redistribute it and/or 00008 modify it under the terms of the GNU Lesser General Public License 00009 as published by the Free Software Foundation; either version 2.1 of 00010 the License, or (at your option) any later version. 00011 00012 This library is distributed in the hope that it will be useful, but 00013 WITHOUT ANY WARRANTY; without even the implied warranty of 00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 Lesser General Public License, version 2.1, for more details. 00016 00017 You should have received a copy of the GNU Lesser General Public 00018 License, version 2.1, along with this library (see the file LGPL.TXT 00019 which came with this source code package in the c++-gtk-utils 00020 sub-directory); if not, write to the Free Software Foundation, Inc., 00021 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00022 00023 */ 00024 00025 #ifndef CGU_REASSEMBLER_H 00026 #define CGU_REASSEMBLER_H 00027 00028 #include <c++-gtk-utils/shared_handle.h> 00029 #include <c++-gtk-utils/cgu_config.h> 00030 00031 namespace Cgu { 00032 00033 namespace Utf8 { 00034 00035 00036 /** 00037 * @class Reassembler reassembler.h c++-gtk-utils/reassembler.h 00038 * @brief A class for reassembling UTF-8 strings sent over pipes and 00039 * sockets so they form complete valid UTF-8 characters. 00040 * 00041 * Utf8::Reassembler is a functor class which takes in a partially 00042 * formed UTF-8 string and returns a nul-terminated string comprising 00043 * such of the input string (after inserting, at the beginning, any 00044 * partially formed UTF-8 character which was at the end of the input 00045 * string passed in previous calls to the functor) as forms complete 00046 * UTF-8 characters (storing any partial character at the end for the 00047 * next call to the functor). If the input string contains invalid 00048 * UTF-8 after adding any stored previous part character (apart from 00049 * any partially formed character at the end of the input string) then 00050 * operator() will return a null Cgu::SharedHandle<char*> object (that 00051 * is, Cgu::SharedHandle<char*>::get() will return 0). Such input 00052 * will not be treated as invalid if it consists only of a single 00053 * partly formed UTF-8 character which could be valid if further bytes 00054 * were received and added to it. In that case the returned 00055 * SharedHandle<char*> object will contain an allocated string of zero 00056 * length, comprising only a terminating \0 character, rather than a 00057 * NULL pointer. 00058 * 00059 * This enables UTF-8 strings to be sent over pipes, sockets, etc and 00060 * displayed in a GTK+ object at the receiving end 00061 * 00062 * Note that for efficiency reasons the memory held in the returned 00063 * Cgu::SharedHandle<char*> object may be greater than the length of 00064 * the nul-terminated string that is contained in that memory: just 00065 * let the Cgu::SharedHandle<char*> object manage the memory, and use 00066 * the contents like any other nul-terminated string. 00067 * 00068 * This class is not needed if std::getline(), with its default '\\n' 00069 * delimiter, is used to read UTF-8 characters using, say, 00070 * Cgu::fdistream, because a whole '\\n' delimited line of UTF-8 00071 * characters will always be complete. 00072 * 00073 * This is an example of its use, reading from a pipe until it is 00074 * closed by the writer and putting the received text in a 00075 * GtkTextBuffer object: 00076 * @code 00077 * using namespace Cgu; 00078 * 00079 * GtkTextIter end; 00080 * GtkTextBuffer* text_buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_view)); 00081 * gtk_text_buffer_get_end_iter(text_buffer, &end); 00082 * 00083 * Utf8::Reassembler reassembler; 00084 * const int BSIZE = 1024; 00085 * char read_buffer[BSIZE]; 00086 * ssize_t res; 00087 * do { 00088 * res = ::read(fd, read_buffer, BSIZE); 00089 * if (res > 0) { 00090 * SharedHandle<char*> utf8(reassembler(read_buffer, res)); 00091 * if (utf8.get()) { 00092 * gtk_text_buffer_insert(text_buffer, &end, 00093 * utf8.get(), std::strlen(utf8)); 00094 * } 00095 * else std::cerr << "Invalid utf8 text sent over pipe\n"; 00096 * } 00097 * } while (res && (res != -1 || errno == EINTR)); 00098 * @endcode 00099 * 00100 * This class maintains an array as a data member, containing partly 00101 * formed characters from previous calls to operator(), and should not 00102 * be copied. There should be no reason to do so, but unfortunately 00103 * enforcing this by explicitly precluding copy construction and copy 00104 * assignment was overlooked when this class was first provided. At 00105 * the next API break, the copy constructor will be explicitly deleted 00106 * and moving only allowed. Where a Reassembler object is to be 00107 * moved, use std::move and the code will be safe against this change 00108 * in the future. 00109 */ 00110 00111 class Reassembler { 00112 size_t stored; 00113 const static size_t buff_size = 6; 00114 char buffer[buff_size]; 00115 char* join_buffer(const char*, size_t); 00116 public: 00117 /** 00118 * Takes a byte array of wholly or partly formed UTF-8 characters to 00119 * be converted (after taking account of previous calls to the method) 00120 * to a valid string of wholly formed characters. 00121 * @param input The input array. 00122 * @param size The number of bytes in the input (not the number of 00123 * UTF-8 characters). 00124 * @return A Cgu::SharedHandle<char*> object holding a nul-terminated 00125 * string comprising such of the input (after inserting, at the 00126 * beginning, any partially formed UTF-8 character which was at the 00127 * end of the input passed in previous calls to the functor) as forms 00128 * complete UTF-8 characters (storing any partial character at the end 00129 * for the next call to the functor). If the input is invalid after 00130 * such recombination, then a null Cgu::SharedHandle<char*> object is 00131 * returned (that is, Cgu::SharedHandle<char*>::get() will return 0). 00132 * Such input will not be treated as invalid if it consists only of a 00133 * single partly formed UTF-8 character which could be valid if 00134 * further bytes were received and added to it. In that case the 00135 * returned Cgu::SharedHandle<char*> object will contain an allocated 00136 * string of zero length, comprising only a terminating \0 character, 00137 * rather than a NULL pointer. 00138 * @exception std::bad_alloc The method might throw std::bad_alloc if 00139 * memory is exhausted and the system throws in that case. It will 00140 * not throw any other exception. 00141 */ 00142 Cgu::SharedHandle<char*> operator()(const char* input, size_t size); 00143 00144 /** 00145 * Gets the number of bytes of a partially formed UTF-8 character 00146 * stored for the next call to operator()(). It will not throw. 00147 * @return The number of bytes. 00148 */ 00149 size_t get_stored() const {return stored;} 00150 00151 /** 00152 * Resets the Reassembler, by discarding any partially formed UTF-8 00153 * character from previous calls to operator()(). It will not throw. 00154 */ 00155 void reset() {stored = 0;} 00156 00157 /** 00158 * The constructor will not throw. 00159 */ 00160 Reassembler(): stored(0) {} 00161 00162 // TODO: At the next API break, provide a default and move 00163 // constructor and move assignment operator, and omit a copy 00164 // constructor and copy assignment operator: this class maintains an 00165 // array as a data member 00166 00167 /* Only has effect if --with-glib-memory-slices-compat or 00168 * --with-glib-memory-slices-no-compat option picked */ 00169 CGU_GLIB_MEMORY_SLICES_FUNCS 00170 }; 00171 00172 } // namespace Utf8 00173 00174 } // namespace Cgu 00175 00176 #endif