Package translate :: Package misc :: Module textwrap
[hide private]
[frames] | no frames]

Source Code for Module translate.misc.textwrap

  1  # -*- coding: utf-8 -*- 
  2  """Text wrapping and filling. 
  3  """ 
  4   
  5  # Copyright (C) 1999-2001 Gregory P. Ward. 
  6  # Copyright (C) 2002, 2003 Python Software Foundation. 
  7  # Written by Greg Ward <gward@python.net> 
  8   
  9  __revision__ = "$Id: textwrap.py 9228 2008-12-13 04:50:49Z friedelwolff $" 
 10   
 11  import string, re 
 12   
 13  # Do the right thing with boolean values for all known Python versions 
 14  # (so this module can be copied to projects that don't depend on Python 
 15  # 2.3, e.g. Optik and Docutils). 
 16  try: 
 17      True, False 
 18  except NameError: 
 19      (True, False) = (1, 0) 
 20   
 21  __all__ = ['TextWrapper', 'wrap', 'fill'] 
 22   
 23  # Hardcode the recognized whitespace characters to the US-ASCII 
 24  # whitespace characters.  The main reason for doing this is that in 
 25  # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales 
 26  # that character winds up in string.whitespace.  Respecting 
 27  # string.whitespace in those cases would 1) make textwrap treat 0xa0 the 
 28  # same as any other whitespace char, which is clearly wrong (it's a 
 29  # *non-breaking* space), 2) possibly cause problems with Unicode, 
 30  # since 0xa0 is not in range(128). 
 31  _whitespace = '\t\n\x0b\x0c\r ' 
 32   
33 -class TextWrapper:
34 """ 35 Object for wrapping/filling text. The public interface consists of 36 the wrap() and fill() methods; the other methods are just there for 37 subclasses to override in order to tweak the default behaviour. 38 If you want to completely replace the main wrapping algorithm, 39 you'll probably have to override _wrap_chunks(). 40 41 Several instance attributes control various aspects of wrapping: 42 width (default: 70) 43 the maximum width of wrapped lines (unless break_long_words 44 is false) 45 initial_indent (default: "") 46 string that will be prepended to the first line of wrapped 47 output. Counts towards the line's width. 48 subsequent_indent (default: "") 49 string that will be prepended to all lines save the first 50 of wrapped output; also counts towards each line's width. 51 expand_tabs (default: true) 52 Expand tabs in input text to spaces before further processing. 53 Each tab will become 1 .. 8 spaces, depending on its position in 54 its line. If false, each tab is treated as a single character. 55 drop_whitespace (default: true) 56 Drop leading and trailing whitespace from lines. 57 replace_whitespace (default: true) 58 Replace all whitespace characters in the input text by spaces 59 after tab expansion. Note that if expand_tabs is false and 60 replace_whitespace is true, every tab will be converted to a 61 single space! 62 fix_sentence_endings (default: false) 63 Ensure that sentence-ending punctuation is always followed 64 by two spaces. Off by default because the algorithm is 65 (unavoidably) imperfect. 66 break_long_words (default: true) 67 Break words longer than 'width'. If false, those words will not 68 be broken, and some lines might be longer than 'width'. 
69 """ 70 71 whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace)) 72 73 unicode_whitespace_trans = {} 74 uspace = ord(u' ') 75 for x in map(ord, _whitespace): 76 unicode_whitespace_trans[x] = uspace 77 78 # This funky little regex is just the trick for splitting 79 # text up into word-wrappable chunks. E.g. 80 # "Hello there -- you goof-ball, use the -b option!" 81 # splits into 82 # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! 83 # (after stripping out empty strings). 84 wordsep_re = re.compile( 85 r'(\s+|' # any whitespace 86 r'%|' # gettext handles % like whitespace 87 r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words 88 r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash 89 90 # XXX this is not locale- or charset-aware -- string.lowercase 91 # is US-ASCII only (and therefore English-only) 92 sentence_end_re = re.compile(r'[%s]' # lowercase letter 93 r'[\.\!\?]' # sentence-ending punct. 94 r'[\"\']?' # optional end-of-quote 95 % string.lowercase) 96 97
98 - def __init__(self, 99 width=70, 100 initial_indent="", 101 subsequent_indent="", 102 expand_tabs=True, 103 drop_whitespace=True, 104 replace_whitespace=True, 105 fix_sentence_endings=False, 106 break_long_words=True):
107 self.width = width 108 self.initial_indent = initial_indent 109 self.subsequent_indent = subsequent_indent 110 self.expand_tabs = expand_tabs 111 self.drop_whitespace = drop_whitespace 112 self.replace_whitespace = replace_whitespace 113 self.fix_sentence_endings = fix_sentence_endings 114 self.break_long_words = break_long_words
115 116 117 # -- Private methods ----------------------------------------------- 118 # (possibly useful for subclasses to override) 119
120 - def _munge_whitespace(self, text):
121 """_munge_whitespace(text : string) -> string 122 123 Munge whitespace in text: expand tabs and convert all other 124 whitespace characters to spaces. Eg. " foo\tbar\n\nbaz" 125 becomes " foo bar baz". 126 """ 127 if self.expand_tabs: 128 text = text.expandtabs() 129 if self.replace_whitespace: 130 if isinstance(text, str): 131 text = text.translate(self.whitespace_trans) 132 elif isinstance(text, unicode): 133 text = text.translate(self.unicode_whitespace_trans) 134 return text
135 136
137 - def _split(self, text):
138 """_split(text : string) -> [string] 139 140 Split the text to wrap into indivisible chunks. Chunks are 141 not quite the same as words; see wrap_chunks() for full 142 details. As an example, the text 143 Look, goof-ball -- use the -b option! 144 breaks into the following chunks: 145 'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ', 146 'use', ' ', 'the', ' ', '-b', ' ', 'option!' 147 """ 148 chunks = self.wordsep_re.split(text) 149 chunks = filter(None, chunks) 150 return chunks
151
152 - def _fix_sentence_endings(self, chunks):
153 """_fix_sentence_endings(chunks : [string]) 154 155 Correct for sentence endings buried in 'chunks'. Eg. when the 156 original text contains "... foo.\nBar ...", munge_whitespace() 157 and split() will convert that to [..., "foo.", " ", "Bar", ...] 158 which has one too few spaces; this method simply changes the one 159 space to two. 160 """ 161 i = 0 162 pat = self.sentence_end_re 163 while i < len(chunks)-1: 164 if chunks[i+1] == " " and pat.search(chunks[i]): 165 chunks[i+1] = " " 166 i += 2 167 else: 168 i += 1
169
170 - def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
171 """_handle_long_word(chunks : [string], 172 cur_line : [string], 173 cur_len : int, width : int) 174 175 Handle a chunk of text (most likely a word, not whitespace) that 176 is too long to fit in any line. 177 """ 178 space_left = max(width - cur_len, 1) 179 180 # If we're allowed to break long words, then do so: put as much 181 # of the next chunk onto the current line as will fit. 182 if self.break_long_words: 183 cur_line.append(reversed_chunks[-1][:space_left]) 184 reversed_chunks[-1] = reversed_chunks[-1][space_left:] 185 186 # Otherwise, we have to preserve the long word intact. Only add 187 # it to the current line if there's nothing already there -- 188 # that minimizes how much we violate the width constraint. 189 elif not cur_line: 190 cur_line.append(reversed_chunks.pop())
191 192 # If we're not allowed to break long words, and there's already 193 # text on the current line, do nothing. Next time through the 194 # main loop of _wrap_chunks(), we'll wind up here again, but 195 # cur_len will be zero, so the next line will be entirely 196 # devoted to the long word that we can't handle right now. 197
198 - def _wrap_chunks(self, chunks):
199 """_wrap_chunks(chunks : [string]) -> [string] 200 201 Wrap a sequence of text chunks and return a list of lines of 202 length 'self.width' or less. (If 'break_long_words' is false, 203 some lines may be longer than this.) Chunks correspond roughly 204 to words and the whitespace between them: each chunk is 205 indivisible (modulo 'break_long_words'), but a line break can 206 come between any two chunks. Chunks should not have internal 207 whitespace; ie. a chunk is either all whitespace or a "word". 208 Whitespace chunks will be removed from the beginning and end of 209 lines, but apart from that whitespace is preserved. 210 """ 211 lines = [] 212 if self.width <= 0: 213 raise ValueError("invalid width %r (must be > 0)" % self.width) 214 215 # Arrange in reverse order so items can be efficiently popped 216 # from a stack of chucks. 217 chunks.reverse() 218 219 while chunks: 220 221 # Start the list of chunks that will make up the current line. 222 # cur_len is just the length of all the chunks in cur_line. 223 cur_line = [] 224 cur_len = 0 225 226 # Figure out which static string will prefix this line. 227 if lines: 228 indent = self.subsequent_indent 229 else: 230 indent = self.initial_indent 231 232 # Maximum width for this line. 233 width = self.width - len(indent) 234 235 # First chunk on line is whitespace -- drop it, unless this 236 # is the very beginning of the text (ie. no lines started yet). 237 if self.drop_whitespace and chunks[-1].strip() == '' and lines: 238 del chunks[-1] 239 240 while chunks: 241 l = len(chunks[-1]) 242 243 # Can at least squeeze this chunk onto the current line. 244 if cur_len + l <= width: 245 cur_line.append(chunks.pop()) 246 cur_len += l 247 248 # Nope, this line is full. 249 else: 250 break 251 252 # The current line is full, and the next chunk is too big to 253 # fit on *any* line (not just this one). 
254 if chunks and len(chunks[-1]) > width: 255 self._handle_long_word(chunks, cur_line, cur_len, width) 256 257 # If the last chunk on this line is all whitespace, drop it. 258 if self.drop_whitespace and cur_line and cur_line[-1].strip() == '': 259 del cur_line[-1] 260 261 # Convert current line back to a string and store it in list 262 # of all lines (return value). 263 if cur_line: 264 lines.append(indent + ''.join(cur_line)) 265 266 return lines
267 268 269 # -- Public interface ---------------------------------------------- 270
271 - def wrap(self, text):
272 """wrap(text : string) -> [string] 273 274 Reformat the single paragraph in 'text' so it fits in lines of 275 no more than 'self.width' columns, and return a list of wrapped 276 lines. Tabs in 'text' are expanded with string.expandtabs(), 277 and all other whitespace characters (including newline) are 278 converted to space. 279 """ 280 text = self._munge_whitespace(text) 281 chunks = self._split(text) 282 if self.fix_sentence_endings: 283 self._fix_sentence_endings(chunks) 284 return self._wrap_chunks(chunks)
285
286 - def fill(self, text):
287 """fill(text : string) -> string 288 289 Reformat the single paragraph in 'text' to fit in lines of no 290 more than 'self.width' columns, and return a new string 291 containing the entire wrapped paragraph. 292 """ 293 return "\n".join(self.wrap(text))
294 295 296 # -- Convenience interface --------------------------------------------- 297
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    The paragraph in 'text' is reformatted so that it fits in lines of at
    most 'width' columns; the wrapped lines are returned as a list.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  Any further keyword arguments are handed to the TextWrapper
    constructor; see that class for the available options.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
310
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    The paragraph in 'text' is reformatted to fit in lines of at most
    'width' columns, and the whole wrapped paragraph is returned as one
    new string.  As with wrap(), tabs are expanded and other whitespace
    characters converted to space.  Any further keyword arguments are
    handed to the TextWrapper constructor; see that class for the
    available options.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
# -- Loosely related functionality -------------------------------------

# Lines consisting solely of spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Leading run of spaces/tabs on every line that has other content.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)


def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  Lines that consist
    solely of spaces and tabs are reduced to empty lines and do not
    influence the margin.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    for indent in _leading_whitespace_re.findall(text):
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # The two indents diverge somewhere: only the part they share can
        # still be common to every line, so shrink the margin to it.
        else:
            shared = 0
            for ours, theirs in zip(margin, indent):
                if ours != theirs:
                    break
                shared += 1
            margin = margin[:shared]
            # Nothing in common at all: no later line can change that.
            if not margin:
                break

    if margin:
        # margin holds only spaces and tabs, so it needs no regex escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
if __name__ == "__main__":
    # Small manual demo of dedent().  print is called with a single
    # parenthesised argument so the line is valid on Python 2 and Python 3.
    #print(dedent("\tfoo\n\tbar"))
    #print(dedent(" \thello there\n \t how are you?"))
    print(dedent("Hello there.\n This is indented."))