"""Text wrapping and filling."""
# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

import re
import string
try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist. Fake one, so that isinstance(text, _unicode)
    # checks are always well-defined (and simply never match).
    class _unicode(object):
        pass
# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)
# Names exported by "from textwrap import *".
__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '
class TextWrapper:
    """
    Object for wrapping/filling text.  The public interface consists of
    the wrap() and fill() methods; the other methods are just there for
    subclasses to override in order to tweak the default behaviour.
    If you want to completely replace the main wrapping algorithm,
    you'll probably have to override _wrap_chunks().

    Several instance attributes control various aspects of wrapping:
      width (default: 70)
        the maximum width of wrapped lines (unless break_long_words
        is false)
      initial_indent (default: "")
        string that will be prepended to the first line of wrapped
        output.  Counts towards the line's width.
      subsequent_indent (default: "")
        string that will be prepended to all lines save the first
        of wrapped output; also counts towards each line's width.
      expand_tabs (default: true)
        Expand tabs in input text to spaces before further processing.
        Each tab will become 1 .. 8 spaces, depending on its position in
        its line.  If false, each tab is treated as a single character.
      replace_whitespace (default: true)
        Replace all whitespace characters in the input text by spaces
        after tab expansion.  Note that if expand_tabs is false and
        replace_whitespace is true, every tab will be converted to a
        single space!
      fix_sentence_endings (default: false)
        Ensure that sentence-ending punctuation is always followed
        by two spaces.  Off by default because the algorithm is
        (unavoidably) imperfect.
      break_long_words (default: true)
        Break words longer than 'width'.  If false, those words will not
        be broken, and some lines might be longer than 'width'.
      break_on_hyphens (default: true)
        Allow breaking hyphenated words.  If true, wrapping will occur
        preferably on whitespaces and right after hyphens part of
        compound words.
      drop_whitespace (default: true)
        Drop leading and trailing whitespace from lines.
    """

    # Translation table for str input: every recognized whitespace
    # character maps to a plain space.
    whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))

    # Same mapping for unicode input, keyed by code point as required by
    # unicode.translate().
    unicode_whitespace_trans = {}
    uspace = ord(u' ')
    for x in map(ord, _whitespace):
        unicode_whitespace_trans[x] = uspace

    # This funky little regex is just the trick for splitting
    # text up into word-wrappable chunks.  E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
    # (after stripping out empty strings).
    wordsep_re = re.compile(
        r'(\s+|'                                  # any whitespace
        r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash

    # This less funky little regex just splits on recognized spaces. E.g.
    #   "Hello there -- you goof-ball, use the -b option!"
    # splits into
    #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
    wordsep_simple_re = re.compile(r'(\s+)')

    # XXX this is not locale- or charset-aware -- string.lowercase
    # is US-ASCII only (and therefore English-only)
    sentence_end_re = re.compile(r'[%s]'              # lowercase letter
                                 r'[\.\!\?]'          # sentence-ending punct.
                                 r'[\"\']?'           # optional end-of-quote
                                 r'\Z'                # end of chunk
                                 % string.lowercase)

    def __init__(self,
                 width=70,
                 initial_indent="",
                 subsequent_indent="",
                 expand_tabs=True,
                 replace_whitespace=True,
                 fix_sentence_endings=False,
                 break_long_words=True,
                 drop_whitespace=True,
                 break_on_hyphens=True):
        """Store the wrapping options as instance attributes.

        Each argument is saved under an attribute of the same name; see
        the class docstring for what each option controls.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)

    def _munge_whitespace(self, text):
        """_munge_whitespace(text : string) -> string

        Munge whitespace in text: expand tabs and convert all other
        whitespace characters to spaces.  Eg. " foo\\tbar\\n\\nbaz"
        becomes " foo    bar  baz".
        """
        if self.expand_tabs:
            text = text.expandtabs()
        if self.replace_whitespace:
            # str and unicode need different translation tables; any
            # other type is passed through untouched.
            if isinstance(text, str):
                text = text.translate(self.whitespace_trans)
            elif isinstance(text, _unicode):
                text = text.translate(self.unicode_whitespace_trans)
        return text

    def _split(self, text):
        """_split(text : string) -> [string]

        Split the text to wrap into indivisible chunks.  Chunks are
        not quite the same as words; see _wrap_chunks() for full
        details.  As an example, the text
          Look, goof-ball -- use the -b option!
        breaks into the following chunks:
          'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        if break_on_hyphens is True, or in:
          'Look,', ' ', 'goof-ball', ' ', '--', ' ',
          'use', ' ', 'the', ' ', '-b', ' ', 'option!'
        otherwise.
        """
        # Pick the pattern matching both the string type (unicode input
        # uses the re.U variants compiled in __init__) and the hyphen
        # policy.
        if isinstance(text, _unicode):
            if self.break_on_hyphens:
                pat = self.wordsep_re_uni
            else:
                pat = self.wordsep_simple_re_uni
        else:
            if self.break_on_hyphens:
                pat = self.wordsep_re
            else:
                pat = self.wordsep_simple_re
        # The capturing group makes split() return the separators too;
        # drop the empty strings it produces.  A list (not a lazy filter
        # object) is required: callers index, reverse and pop it.
        return [chunk for chunk in pat.split(text) if chunk]

    def _fix_sentence_endings(self, chunks):
        """_fix_sentence_endings(chunks : [string])

        Correct for sentence endings buried in 'chunks'.  Eg. when the
        original text contains "... foo.\\nBar ...", munge_whitespace()
        and split() will convert that to [..., "foo.", " ", "Bar", ...]
        which has one too few spaces; this method simply changes the one
        space to two.  'chunks' is modified in place.
        """
        i = 0
        patsearch = self.sentence_end_re.search
        while i < len(chunks)-1:
            if chunks[i+1] == " " and patsearch(chunks[i]):
                chunks[i+1] = "  "
                # Skip the space chunk that was just widened.
                i += 2
            else:
                i += 1

    def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
        """_handle_long_word(chunks : [string],
                             cur_line : [string],
                             cur_len : int, width : int)

        Handle a chunk of text (most likely a word, not whitespace) that
        is too long to fit in any line.  Both 'reversed_chunks' and
        'cur_line' are modified in place.
        """
        # Figure out when indent is larger than the specified width, and make
        # sure at least one character is stripped off on every pass
        if width < 1:
            space_left = 1
        else:
            space_left = width - cur_len

        # If we're allowed to break long words, then do so: put as much
        # of the next chunk onto the current line as will fit.
        if self.break_long_words:
            cur_line.append(reversed_chunks[-1][:space_left])
            reversed_chunks[-1] = reversed_chunks[-1][space_left:]

        # Otherwise, we have to preserve the long word intact.  Only add
        # it to the current line if there's nothing already there --
        # that minimizes how much we violate the width constraint.
        elif not cur_line:
            cur_line.append(reversed_chunks.pop())

        # If we're not allowed to break long words, and there's already
        # text on the current line, do nothing.  Next time through the
        # main loop of _wrap_chunks(), we'll wind up here again, but
        # cur_len will be zero, so the next line will be entirely
        # devoted to the long word that we can't handle right now.

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines, but apart from that whitespace is preserved.

        'chunks' is consumed (reversed and popped) in the process.
        Raises ValueError if 'self.width' is not positive.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines

    # -- Public interface ----------------------------------------------

    def wrap(self, text):
        """wrap(text : string) -> [string]

        Reformat the single paragraph in 'text' so it fits in lines of
        no more than 'self.width' columns, and return a list of wrapped
        lines.  Tabs in 'text' are expanded with string.expandtabs(),
        and all other whitespace characters (including newline) are
        converted to space.
        """
        text = self._munge_whitespace(text)
        chunks = self._split(text)
        if self.fix_sentence_endings:
            self._fix_sentence_endings(chunks)
        return self._wrap_chunks(chunks)

    def fill(self, text):
        """fill(text : string) -> string

        Reformat the single paragraph in 'text' to fit in lines of no
        more than 'self.width' columns, and return a new string
        containing the entire wrapped paragraph.
        """
        return "\n".join(self.wrap(text))
# -- Convenience interface ---------------------------------------------
def wrap(text, width=70, **kwargs):
    """Wrap a single paragraph of text, returning a list of wrapped lines.

    Reformat the single paragraph in 'text' so it fits in lines of no
    more than 'width' columns, and return a list of wrapped lines.  By
    default, tabs in 'text' are expanded with string.expandtabs(), and
    all other whitespace characters (including newline) are converted to
    space.  See TextWrapper class for available keyword args to customize
    wrapping behaviour.
    """
    # A throwaway TextWrapper is built per call; 'kwargs' is forwarded
    # unchanged to its constructor.
    w = TextWrapper(width=width, **kwargs)
    return w.wrap(text)
def fill(text, width=70, **kwargs):
    """Fill a single paragraph of text, returning a new string.

    Reformat the single paragraph in 'text' to fit in lines of no more
    than 'width' columns, and return a new string containing the entire
    wrapped paragraph.  As with wrap(), tabs are expanded and other
    whitespace characters converted to space.  See TextWrapper class for
    available keyword args to customize wrapping behaviour.
    """
    # A throwaway TextWrapper is built per call; 'kwargs' is forwarded
    # unchanged to its constructor.
    w = TextWrapper(width=width, **kwargs)
    return w.fill(text)
# -- Loosely related functionality -------------------------------------
# Matches lines consisting solely of spaces and tabs.
_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
# Captures the leading run of spaces/tabs of every line that has content.
_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)

def dedent(text):
    """Remove any common leading whitespace from every line in `text`.

    This can be used to make triple-quoted strings line up with the left
    edge of the display, while still presenting them in the source code
    in indented form.

    Note that tabs and spaces are both treated as whitespace, but they
    are not equal: the lines "  hello" and "\\thello" are
    considered to have no common leading whitespace.  (This behaviour is
    new in Python 2.5; older versions of this module incorrectly
    expanded tabs before searching for common leading whitespace.)

    Lines that consist solely of spaces and tabs are normalized to
    empty lines and do not take part in computing the margin.
    """
    # Look for the longest leading string of spaces and tabs common to
    # all lines.
    margin = None
    text = _whitespace_only_re.sub('', text)
    indents = _leading_whitespace_re.findall(text)
    for indent in indents:
        if margin is None:
            margin = indent

        # Current line more deeply indented than previous winner:
        # no change (previous winner is still on top).
        elif indent.startswith(margin):
            pass

        # Current line consistent with and no deeper than previous winner:
        # it's the new winner.
        elif margin.startswith(indent):
            margin = indent

        # Neither is a prefix of the other: keep only the whitespace the
        # two share (possibly nothing), instead of giving up on the
        # margin altogether.
        else:
            for i, (x, y) in enumerate(zip(margin, indent)):
                if x != y:
                    margin = margin[:i]
                    break
            else:
                margin = margin[:len(indent)]

    # sanity check (testing/debugging only)
    if 0 and margin:
        for line in text.split("\n"):
            assert not line or line.startswith(margin), \
                   "line = %r, margin = %r" % (line, margin)

    if margin:
        # margin holds only spaces and tabs, so it needs no regex escaping.
        text = re.sub(r'(?m)^' + margin, '', text)
    return text
# Tiny manual smoke test, run only when the module is executed directly.
if __name__ == "__main__":
    #print dedent("\tfoo\n\tbar")
    #print dedent(" \thello there\n \t how are you?")
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(dedent("Hello there.\n This is indented."))