"""Text wrapping and filling.
"""
# The docstring must be the first statement in the file to be recorded as the
# module's __doc__; a __future__ import is still legal directly after it.
from __future__ import print_function  # for OPy compiler
| 4 |  | 
| 5 | # Copyright (C) 1999-2001 Gregory P. Ward. | 
| 6 | # Copyright (C) 2002, 2003 Python Software Foundation. | 
| 7 | # Written by Greg Ward <gward@python.net> | 
| 8 |  | 
| 9 | __revision__ = "$Id$" | 
| 10 |  | 
| 11 | import string, re | 
| 12 |  | 
try:
    unicode
except NameError:
    # The interpreter has no built-in unicode type (for example, Python
    # built without Unicode support).  Substitute a placeholder class so
    # that isinstance(x, _unicode) checks stay valid; nothing in this
    # module ever instantiates it, so those checks simply come out False.
    class _unicode(object):
        pass
else:
    _unicode = unicode
| 20 |  | 
| 21 | # Do the right thing with boolean values for all known Python versions | 
| 22 | # (so this module can be copied to projects that don't depend on Python | 
| 23 | # 2.3, e.g. Optik and Docutils) by uncommenting the block of code below. | 
| 24 | #try: | 
| 25 | #    True, False | 
| 26 | #except NameError: | 
| 27 | #    (True, False) = (1, 0) | 
| 28 |  | 
| 29 | __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent'] | 
| 30 |  | 
| 31 | # Hardcode the recognized whitespace characters to the US-ASCII | 
| 32 | # whitespace characters.  The main reason for doing this is that in | 
| 33 | # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales | 
| 34 | # that character winds up in string.whitespace.  Respecting | 
| 35 | # string.whitespace in those cases would 1) make textwrap treat 0xa0 the | 
| 36 | # same as any other whitespace char, which is clearly wrong (it's a | 
| 37 | # *non-breaking* space), 2) possibly cause problems with Unicode, | 
| 38 | # since 0xa0 is not in range(128). | 
| 39 | _whitespace = '\t\n\x0b\x0c\r ' | 
| 40 |  | 
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words. If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for byte strings: every recognized whitespace
    # character maps to a plain space.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # The same mapping for unicode strings, keyed by code point.
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just splits on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- only US-ASCII lowercase
    # letters (and therefore only English) are recognized.
    # string.ascii_lowercase is used rather than string.lowercase because
    # the latter changes with the locale, which would make this pattern
    # depend on whatever locale was active when the module was imported.
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.ascii_lowercase)


    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options (described in the class docstring)
        as instance attributes and prepare the Unicode-mode regexes.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)


    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".

        Tab expansion only happens when expand_tabs is true, and the
        conversion to spaces only when replace_whitespace is true.
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # Byte strings and unicode strings need different kinds of
            # translation table.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text


    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.

        Always returns a new list, never containing empty strings.
        """
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # Splitting on a pattern with a capturing group yields empty strings
        # between adjacent separators; drop them.  A list comprehension
        # guarantees a real list, which _wrap_chunks() reverses and pops.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.

        'chunks' is modified in place; nothing is returned.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the space chunk that was just rewritten.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.

        'reversed_chunks' is the stack of remaining chunks (next chunk
        last) and 'cur_line' the chunks gathered for the current line;
        both are modified in place.  'cur_len' is the length of the text
        already in 'cur_line' and 'width' the room available on the line.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            # When the line is already exactly full nothing fits.  Appending
            # the resulting empty piece would make it the last item of
            # cur_line, and _wrap_chunks() would then drop *it* instead of
            # the real trailing whitespace chunk, leaving a trailing space
            # on the line.  So leave both lists alone; the word is handled
            # at the start of the next line.
            if space_left > 0:
                cur_line.append(reversed_chunks[-1][:space_left])
                reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines (when 'drop_whitespace' is true), but apart from that
        whitespace is preserved.

        The 'chunks' list is reversed in place and emptied as lines are
        built.  Raises ValueError if 'self.width' is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                l = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + l <= width:
                    cur_line.append(chunks.pop())
                    cur_len += l

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines


    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
| 340 |  | 
| 341 |  | 
| 342 | # -- Convenience interface --------------------------------------------- | 
| 343 |  | 
def wrap(text, width=70, **kwargs):
    """Wrap one paragraph of text and return the wrapped lines as a list.

    A throwaway TextWrapper is built from 'width' plus any extra keyword
    arguments, and its wrap() method does the work: 'text' is reflowed so
    that no line is longer than 'width' columns.  With the default
    options, tabs in 'text' are expanded with string.expandtabs() and
    every other whitespace character (newlines included) becomes a space.
    See the TextWrapper class for the keyword arguments that customize
    wrapping behaviour.
    """
    return TextWrapper(width=width, **kwargs).wrap(text)
| 356 |  | 
def fill(text, width=70, **kwargs):
    """Fill one paragraph of text and return it as a single new string.

    A throwaway TextWrapper is built from 'width' plus any extra keyword
    arguments, and its fill() method does the work: 'text' is reflowed so
    that no line is longer than 'width' columns, and the resulting lines
    are returned as one string.  As with wrap(), tabs are expanded and
    other whitespace characters are converted to spaces.  See the
    TextWrapper class for the keyword arguments that customize wrapping
    behaviour.
    """
    return TextWrapper(width=width, **kwargs).fill(text)
| 368 |  | 
| 369 |  | 
| 370 | # -- Loosely related functionality ------------------------------------- | 
| 371 |  | 
# Matches a line that consists of nothing but spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the run of spaces/tabs at the start of each non-blank line.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Strip the leading whitespace shared by every line of `text`.

    Handy for triple-quoted strings that are indented in the source code
    but should start at the left edge when displayed.

    Spaces and tabs both count as whitespace but are never treated as
    interchangeable: the lines "  hello" and "\\thello" share no common
    leading whitespace.  (This behaviour is new in Python 2.5; older
    versions of this module incorrectly expanded tabs before looking for
    the common prefix.)

    Lines made up solely of spaces and tabs are reduced to empty lines
    and are ignored when the common margin is worked out.
    """
    text = _whitespace_only_re.sub('', text)
    prefixes = _leading_whitespace_re.findall(text)

    # The margin is the longest string of spaces and tabs that starts every
    # non-blank line.  The prefix shared by the lexicographically smallest
    # and largest indents is shared by all the indents in between as well.
    margin = None
    if prefixes:
        lowest = min(prefixes)
        highest = max(prefixes)
        shared = 0
        for a, b in zip(lowest, highest):
            if a != b:
                break
            shared += 1
        margin = lowest[:shared]

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)

    if not margin:
        return text
    return re.sub(r'(?m)^' + margin, '', text)
| 426 |  | 
if __name__ == "__main__":
    # Manual smoke test for dedent().  Other inputs that are handy to try:
    #   dedent("\tfoo\n\tbar")
    #   dedent("  \thello there\n  \t  how are you?")
    print(dedent("Hello there.\n"
                 "  This is indented."))