# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Red Hat, Inc.
# Copyright (c) 2010 Ville Skyttä
# Copyright (c) 2009 Tim Lauridsen
# Copyright (c) 2007 Marcus Kuhn
#
# kitchen is free software; you can redistribute it and/or modify it under the
# terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 2.1 of the License, or (at your option)
# any later version.
#
# kitchen is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for
# more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with kitchen; if not, see <http://www.gnu.org/licenses/>
#
# Authors:
#   James Antill
#   Marcus Kuhn
#   Toshio Kuratomi
#   Tim Lauridsen
#   Ville Skyttä
#
# Portions of this are from yum/i18n.py
'''
-----------------------
Format Text for Display
-----------------------

Functions related to displaying unicode text.  Unicode characters don't all
have the same width so we need helper functions for displaying them.

.. versionadded:: 0.2 kitchen.display API 1.0.0
'''
import itertools
import sys
import unicodedata

from kitchen.text.converters import to_unicode, to_bytes
from kitchen.text.exceptions import ControlCharError

# A handful of helpers are spelled differently on Python 2 and Python 3.
# Resolve them once at import time so the functions below do not depend on
# names that only exist on one of the two.
try:
    _imap = itertools.imap
    _izip = itertools.izip
except AttributeError:
    # No itertools.imap/izip: the map and zip builtins are already lazy
    _imap = map
    _izip = zip

try:
    _range = xrange
    _unichr = unichr
except NameError:
    _range = range
    _unichr = chr

# A single ASCII space as a byte string, used to pad byte string fields
_BYTE_SPACE = u' '.encode('ascii')

# This is ported from ustr_utf8_* which I got from:
#     http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
#  I've tried to leave it close to the original C (same names etc.) so that
# it is easy to read/compare both versions... James Antilles
#
# Reimplemented quite a bit of this for speed.  Use the bzr log or annotate
# commands to see what I've changed since importing this file.-Toshio Kuratomi

# ----------------------------- BEG utf8 -----------------------------
# This is an implementation of wcwidth() and wcswidth() (defined in
# IEEE Std 1002.1-2001) for Unicode.
#
# http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html
# http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html
#
# In fixed-width output devices, Latin characters all occupy a single
# "cell" position of equal width, whereas ideographic CJK characters
# occupy two such cells. Interoperability between terminal-line
# applications and (teletype-style) character terminals using the
# UTF-8 encoding requires agreement on which character should advance
# the cursor by how many cell positions. No established formal
# standards exist at present on which Unicode character shall occupy
# how many cell positions on character terminals. These routines are
# a first attempt of defining such behavior based on simple rules
# applied to data provided by the Unicode Consortium.
#
# [...]
#
# Markus Kuhn -- 2007-05-26 (Unicode 5.0)
#
# Permission to use, copy, modify, and distribute this software
# for any purpose and without fee is hereby granted. The author
# disclaims all warranties with regard to this software.
#
# Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c


# Renamed but still pretty much JA's port of MK's code
def _interval_bisearch(value, table):
    '''Binary search in an interval table.

    :arg value: numeric value to search for
    :arg table: Ordered list of intervals.  This is a list of two-tuples.  The
        elements of the two-tuple define an interval's start and end points.
    :returns: If :attr:`value` is found within an interval in the :attr:`table`
        return :data:`True`.  Otherwise, :data:`False`

    This function checks whether a numeric value is present within a table
    of intervals.  It checks using a binary search algorithm, dividing the
    list of values in half and checking against the values until it determines
    whether the value is in the table.  An empty :attr:`table` never contains
    :attr:`value`.
    '''
    if not table:
        # Nothing can be inside an empty table; indexing it below would raise
        return False

    minimum = 0
    maximum = len(table) - 1
    # Quick rejection of values outside of the range the table covers
    if value < table[minimum][0] or value > table[maximum][1]:
        return False

    while maximum >= minimum:
        mid = (minimum + maximum) // 2
        if value > table[mid][1]:
            minimum = mid + 1
        elif value < table[mid][0]:
            maximum = mid - 1
        else:
            return True

    return False


_COMBINING = (
        (0x300, 0x36f), (0x483, 0x489), (0x591, 0x5bd),
        (0x5bf, 0x5bf), (0x5c1, 0x5c2), (0x5c4, 0x5c5),
        (0x5c7, 0x5c7), (0x600, 0x603), (0x610, 0x61a),
        (0x64b, 0x65f), (0x670, 0x670), (0x6d6, 0x6e4),
        (0x6e7, 0x6e8), (0x6ea, 0x6ed), (0x70f, 0x70f),
        (0x711, 0x711), (0x730, 0x74a), (0x7a6, 0x7b0),
        (0x7eb, 0x7f3), (0x816, 0x819), (0x81b, 0x823),
        (0x825, 0x827), (0x829, 0x82d), (0x859, 0x85b),
        (0x901, 0x902), (0x93c, 0x93c), (0x941, 0x948),
        (0x94d, 0x94d), (0x951, 0x954), (0x962, 0x963),
        (0x981, 0x981), (0x9bc, 0x9bc), (0x9c1, 0x9c4),
        (0x9cd, 0x9cd), (0x9e2, 0x9e3), (0xa01, 0xa02),
        (0xa3c, 0xa3c), (0xa41, 0xa42), (0xa47, 0xa48),
        (0xa4b, 0xa4d), (0xa70, 0xa71), (0xa81, 0xa82),
        (0xabc, 0xabc), (0xac1, 0xac5), (0xac7, 0xac8),
        (0xacd, 0xacd), (0xae2, 0xae3), (0xb01, 0xb01),
        (0xb3c, 0xb3c), (0xb3f, 0xb3f), (0xb41, 0xb43),
        (0xb4d, 0xb4d), (0xb56, 0xb56), (0xb82, 0xb82),
        (0xbc0, 0xbc0), (0xbcd, 0xbcd), (0xc3e, 0xc40),
        (0xc46, 0xc48), (0xc4a, 0xc4d), (0xc55, 0xc56),
        (0xcbc, 0xcbc), (0xcbf, 0xcbf), (0xcc6, 0xcc6),
        (0xccc, 0xccd), (0xce2, 0xce3), (0xd41, 0xd43),
        (0xd4d, 0xd4d), (0xdca, 0xdca), (0xdd2, 0xdd4),
        (0xdd6, 0xdd6), (0xe31, 0xe31), (0xe34, 0xe3a),
        (0xe47, 0xe4e), (0xeb1, 0xeb1), (0xeb4, 0xeb9),
        (0xebb, 0xebc), (0xec8, 0xecd), (0xf18, 0xf19),
        (0xf35, 0xf35), (0xf37, 0xf37), (0xf39, 0xf39),
        (0xf71, 0xf7e), (0xf80, 0xf84), (0xf86, 0xf87),
        (0xf90, 0xf97), (0xf99, 0xfbc), (0xfc6, 0xfc6),
        (0x102d, 0x1030), (0x1032, 0x1032), (0x1036, 0x1037),
        (0x1039, 0x103a), (0x1058, 0x1059), (0x108d, 0x108d),
        (0x1160, 0x11ff), (0x135d, 0x135f), (0x1712, 0x1714),
        (0x1732, 0x1734), (0x1752, 0x1753), (0x1772, 0x1773),
        (0x17b4, 0x17b5), (0x17b7, 0x17bd), (0x17c6, 0x17c6),
        (0x17c9, 0x17d3), (0x17dd, 0x17dd), (0x180b, 0x180d),
        (0x18a9, 0x18a9), (0x1920, 0x1922), (0x1927, 0x1928),
        (0x1932, 0x1932), (0x1939, 0x193b), (0x1a17, 0x1a18),
        (0x1a60, 0x1a60), (0x1a75, 0x1a7c), (0x1a7f, 0x1a7f),
        (0x1b00, 0x1b03), (0x1b34, 0x1b34), (0x1b36, 0x1b3a),
        (0x1b3c, 0x1b3c), (0x1b42, 0x1b42), (0x1b44, 0x1b44),
        (0x1b6b, 0x1b73), (0x1baa, 0x1baa), (0x1be6, 0x1be6),
        (0x1bf2, 0x1bf3), (0x1c37, 0x1c37), (0x1cd0, 0x1cd2),
        (0x1cd4, 0x1ce0), (0x1ce2, 0x1ce8), (0x1ced, 0x1ced),
        (0x1dc0, 0x1de6), (0x1dfc, 0x1dff), (0x200b, 0x200f),
        (0x202a, 0x202e), (0x2060, 0x2063), (0x206a, 0x206f),
        (0x20d0, 0x20f0), (0x2cef, 0x2cf1), (0x2d7f, 0x2d7f),
        (0x2de0, 0x2dff), (0x302a, 0x302f), (0x3099, 0x309a),
        (0xa66f, 0xa66f), (0xa67c, 0xa67d), (0xa6f0, 0xa6f1),
        (0xa806, 0xa806), (0xa80b, 0xa80b), (0xa825, 0xa826),
        (0xa8c4, 0xa8c4), (0xa8e0, 0xa8f1), (0xa92b, 0xa92d),
        (0xa953, 0xa953), (0xa9b3, 0xa9b3), (0xa9c0, 0xa9c0),
        (0xaab0, 0xaab0), (0xaab2, 0xaab4), (0xaab7, 0xaab8),
        (0xaabe, 0xaabf), (0xaac1, 0xaac1), (0xabed, 0xabed),
        (0xfb1e, 0xfb1e), (0xfe00, 0xfe0f), (0xfe20, 0xfe26),
        (0xfeff, 0xfeff), (0xfff9, 0xfffb), (0x101fd, 0x101fd),
        (0x10a01, 0x10a03), (0x10a05, 0x10a06), (0x10a0c, 0x10a0f),
        (0x10a38, 0x10a3a), (0x10a3f, 0x10a3f), (0x11046, 0x11046),
        (0x110b9, 0x110ba), (0x1d165, 0x1d169), (0x1d16d, 0x1d182),
        (0x1d185, 0x1d18b), (0x1d1aa, 0x1d1ad), (0x1d242, 0x1d244),
        (0xe0001, 0xe0001), (0xe0020, 0xe007f), (0xe0100, 0xe01ef), )
'''
Internal table, provided by this module to list :term:`code points` which
combine with other characters and therefore should have no :term:`textual
width`.  This is a sorted :class:`tuple` of non-overlapping intervals.  Each
interval is a :class:`tuple` listing a starting :term:`code point` and ending
:term:`code point`.  Every :term:`code point` between the two end points is
a combining character.

.. seealso::

    :func:`~kitchen.text.display._generate_combining_table`
        for how this table is generated

This table was last regenerated on python-3.2.3 with
:data:`unicodedata.unidata_version` 6.0.0
'''


# New function from Toshio Kuratomi (LGPLv2+)
def _generate_combining_table():
    '''Combine Markus Kuhn's data with :mod:`unicodedata` to make combining
    char list

    :rtype: :class:`tuple` of tuples
    :returns: :class:`tuple` of intervals of :term:`code points` that are
        combining character.  Each interval is a 2-:class:`tuple` of the
        starting :term:`code point` and the ending :term:`code point` for the
        combining characters.

    In normal use, this function serves to tell how we're generating the
    combining char list.  For speed reasons, we use this to generate a static
    list and just use that later.

    Markus Kuhn's list of combining characters is more complete than what's in
    the python :mod:`unicodedata` library but the python :mod:`unicodedata` is
    synced against later versions of the unicode database

    This is used to generate the :data:`~kitchen.text.display._COMBINING`
    table.
    '''
    # Marcus Kuhn's sorted list of non-overlapping intervals of non-spacing
    # characters generated from Unicode 5.0 data by:
    # "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c"
    markus_kuhn_combining_5_0 = (
        (0x0300, 0x036F), (0x0483, 0x0486), (0x0488, 0x0489),
        (0x0591, 0x05BD), (0x05BF, 0x05BF), (0x05C1, 0x05C2),
        (0x05C4, 0x05C5), (0x05C7, 0x05C7), (0x0600, 0x0603),
        (0x0610, 0x0615), (0x064B, 0x065E), (0x0670, 0x0670),
        (0x06D6, 0x06E4), (0x06E7, 0x06E8), (0x06EA, 0x06ED),
        (0x070F, 0x070F), (0x0711, 0x0711), (0x0730, 0x074A),
        (0x07A6, 0x07B0), (0x07EB, 0x07F3), (0x0901, 0x0902),
        (0x093C, 0x093C), (0x0941, 0x0948), (0x094D, 0x094D),
        (0x0951, 0x0954), (0x0962, 0x0963), (0x0981, 0x0981),
        (0x09BC, 0x09BC), (0x09C1, 0x09C4), (0x09CD, 0x09CD),
        (0x09E2, 0x09E3), (0x0A01, 0x0A02), (0x0A3C, 0x0A3C),
        (0x0A41, 0x0A42), (0x0A47, 0x0A48), (0x0A4B, 0x0A4D),
        (0x0A70, 0x0A71), (0x0A81, 0x0A82), (0x0ABC, 0x0ABC),
        (0x0AC1, 0x0AC5), (0x0AC7, 0x0AC8), (0x0ACD, 0x0ACD),
        (0x0AE2, 0x0AE3), (0x0B01, 0x0B01), (0x0B3C, 0x0B3C),
        (0x0B3F, 0x0B3F), (0x0B41, 0x0B43), (0x0B4D, 0x0B4D),
        (0x0B56, 0x0B56), (0x0B82, 0x0B82), (0x0BC0, 0x0BC0),
        (0x0BCD, 0x0BCD), (0x0C3E, 0x0C40), (0x0C46, 0x0C48),
        (0x0C4A, 0x0C4D), (0x0C55, 0x0C56), (0x0CBC, 0x0CBC),
        (0x0CBF, 0x0CBF), (0x0CC6, 0x0CC6), (0x0CCC, 0x0CCD),
        (0x0CE2, 0x0CE3), (0x0D41, 0x0D43), (0x0D4D, 0x0D4D),
        (0x0DCA, 0x0DCA), (0x0DD2, 0x0DD4), (0x0DD6, 0x0DD6),
        (0x0E31, 0x0E31), (0x0E34, 0x0E3A), (0x0E47, 0x0E4E),
        (0x0EB1, 0x0EB1), (0x0EB4, 0x0EB9), (0x0EBB, 0x0EBC),
        (0x0EC8, 0x0ECD), (0x0F18, 0x0F19), (0x0F35, 0x0F35),
        (0x0F37, 0x0F37), (0x0F39, 0x0F39), (0x0F71, 0x0F7E),
        (0x0F80, 0x0F84), (0x0F86, 0x0F87), (0x0F90, 0x0F97),
        (0x0F99, 0x0FBC), (0x0FC6, 0x0FC6), (0x102D, 0x1030),
        (0x1032, 0x1032), (0x1036, 0x1037), (0x1039, 0x1039),
        (0x1058, 0x1059), (0x1160, 0x11FF), (0x135F, 0x135F),
        (0x1712, 0x1714), (0x1732, 0x1734), (0x1752, 0x1753),
        (0x1772, 0x1773), (0x17B4, 0x17B5), (0x17B7, 0x17BD),
        (0x17C6, 0x17C6), (0x17C9, 0x17D3), (0x17DD, 0x17DD),
        (0x180B, 0x180D), (0x18A9, 0x18A9), (0x1920, 0x1922),
        (0x1927, 0x1928), (0x1932, 0x1932), (0x1939, 0x193B),
        (0x1A17, 0x1A18), (0x1B00, 0x1B03), (0x1B34, 0x1B34),
        (0x1B36, 0x1B3A), (0x1B3C, 0x1B3C), (0x1B42, 0x1B42),
        (0x1B6B, 0x1B73), (0x1DC0, 0x1DCA), (0x1DFE, 0x1DFF),
        (0x200B, 0x200F), (0x202A, 0x202E), (0x2060, 0x2063),
        (0x206A, 0x206F), (0x20D0, 0x20EF), (0x302A, 0x302F),
        (0x3099, 0x309A), (0xA806, 0xA806), (0xA80B, 0xA80B),
        (0xA825, 0xA826), (0xFB1E, 0xFB1E), (0xFE00, 0xFE0F),
        (0xFE20, 0xFE23), (0xFEFF, 0xFEFF), (0xFFF9, 0xFFFB),
        (0x10A01, 0x10A03), (0x10A05, 0x10A06), (0x10A0C, 0x10A0F),
        (0x10A38, 0x10A3A), (0x10A3F, 0x10A3F), (0x1D167, 0x1D169),
        (0x1D173, 0x1D182), (0x1D185, 0x1D18B), (0x1D1AA, 0x1D1AD),
        (0x1D242, 0x1D244), (0xE0001, 0xE0001), (0xE0020, 0xE007F),
        (0xE0100, 0xE01EF))

    last_codepoint = 0xFFFFF
    combining = []
    # First code point of the interval currently being built, or None when
    # the previous code point was not a combining character
    start = None
    for codepoint in _range(last_codepoint + 1):
        if (_interval_bisearch(codepoint, markus_kuhn_combining_5_0)
                or unicodedata.combining(_unichr(codepoint))):
            if start is None:
                # Found first part of an interval
                start = codepoint
        elif start is not None:
            # The previous code point was the last one of the interval
            combining.append((start, codepoint - 1))
            start = None

    if start is not None:
        # If we're at the end and the interval is open, close it.
        combining.append((start, last_codepoint))

    return tuple(combining)


# New function from Toshio Kuratomi (LGPLv2+)
def _print_combining_table():
    '''Print out a new :data:`_COMBINING` table

    This will print a new :data:`_COMBINING` table in the format used in
    :file:`kitchen/text/display.py`.  It's useful for updating the
    :data:`_COMBINING` table with updated data from a new python as the format
    won't change from what's already in the file.

    The table is written to :data:`sys.stdout`: three intervals per line,
    each line indented by eight spaces, with the closing parenthesis
    following the last interval.
    '''
    table = _generate_combining_table()
    entries = ['(0x%x, 0x%x),' % pair for pair in table]

    rows = []
    for first in range(0, len(entries), 3):
        rows.append('        ' + ' '.join(entries[first:first + 3]))
    if rows:
        # The closing paren shares a line with the final intervals
        rows[-1] += ' )'
    else:
        rows.append(')')

    sys.stdout.write('\n'.join(['_COMBINING = ('] + rows) + '\n')


# Handling of control chars rewritten.  Rest is JA's port of MK's C code.
# -Toshio Kuratomi
def _ucp_width(ucs, control_chars='guess'):
    '''Get the :term:`textual width` of a ucs character

    :arg ucs: integer representing a single unicode :term:`code point`
    :kwarg control_chars: specify how to deal with :term:`control characters`.
        Possible values are:

        :guess: (default) will take a guess for :term:`control character`
            widths.  Most codes will return zero width.  ``backspace``,
            ``delete``, and ``clear delete`` return -1.  ``escape`` currently
            returns -1 as well but this is not guaranteed as it's not always
            correct
        :strict: will raise :exc:`~kitchen.text.exceptions.ControlCharError`
            if a :term:`control character` is encountered

    :raises ControlCharError: if the :term:`code point` is a unicode
        :term:`control character` and :attr:`control_chars` is set to 'strict'
    :returns: :term:`textual width` of the character.

    .. note::

        It's important to remember this is :term:`textual width` and not the
        number of characters or bytes.
    '''
    # test for 8-bit control characters
    if ucs < 32 or (ucs < 0xa0 and ucs >= 0x7f):
        # Control character detected
        if control_chars == 'strict':
            raise ControlCharError('_ucp_width does not understand how to'
                    ' assign a width value to control characters.')
        if ucs in (0x08, 0x07F, 0x94):
            # Backspace, delete, and clear delete remove a single character
            return -1
        if ucs == 0x1b:
            # Escape is tricky.  It removes some number of characters that
            # come after it but the amount is dependent on what is
            # interpreting the code.
            # So this is going to often be wrong but other values will be
            # wrong as well.
            return -1
        # All other control characters get 0 width
        return 0

    if _interval_bisearch(ucs, _COMBINING):
        # Combining characters return 0 width as they will be combined with
        # the width from other characters
        return 0

    # if we arrive here, ucs is not a combining or C0/C1 control character
    if ucs >= 0x1100 and (
            ucs <= 0x115f or                      # Hangul Jamo init. consonants
            ucs == 0x2329 or ucs == 0x232a or
            (ucs >= 0x2e80 and ucs <= 0xa4cf and
                ucs != 0x303f) or                 # CJK ... Yi
            (ucs >= 0xac00 and ucs <= 0xd7a3) or  # Hangul Syllables
            (ucs >= 0xf900 and ucs <= 0xfaff) or  # CJK Compatibility Ideographs
            (ucs >= 0xfe10 and ucs <= 0xfe19) or  # Vertical forms
            (ucs >= 0xfe30 and ucs <= 0xfe6f) or  # CJK Compatibility Forms
            (ucs >= 0xff00 and ucs <= 0xff60) or  # Fullwidth Forms
            (ucs >= 0xffe0 and ucs <= 0xffe6) or
            (ucs >= 0x20000 and ucs <= 0x2fffd) or
            (ucs >= 0x30000 and ucs <= 0x3fffd)):
        # Double width character
        return 2
    return 1


# Wholly rewritten by me (LGPLv2+) -Toshio Kuratomi
def textual_width(msg, control_chars='guess', encoding='utf-8',
        errors='replace'):
    '''Get the :term:`textual width` of a string

    :arg msg: :class:`unicode` string or byte :class:`str` to get the width of
    :kwarg control_chars: specify how to deal with :term:`control characters`.
        Possible values are:

        :guess: (default) will take a guess for :term:`control character`
            widths.  Most codes will return zero width.  ``backspace``,
            ``delete``, and ``clear delete`` return -1.  ``escape`` currently
            returns -1 as well but this is not guaranteed as it's not always
            correct
        :strict: will raise :exc:`kitchen.text.exceptions.ControlCharError`
            if a :term:`control character` is encountered

    :kwarg encoding: If we are given a byte :class:`str` this is used to
        decode it into :class:`unicode` string.  Any characters that are not
        decodable in this encoding will get a value dependent on the
        :attr:`errors` parameter.
    :kwarg errors: How to treat errors encoding the byte :class:`str` to
        :class:`unicode` string.  Legal values are the same as for
        :func:`kitchen.text.converters.to_unicode`.  The default value of
        ``replace`` will cause undecodable byte sequences to have a width of
        one. ``ignore`` will have a width of zero.
    :raises ControlCharError: if :attr:`msg` contains a :term:`control
        character` and :attr:`control_chars` is ``strict``.
    :returns: :term:`Textual width` of the :attr:`msg`.  This is the amount of
        space that the string will consume on a monospace display.  It's
        measured in the number of cell positions or columns it will take up on
        a monospace display.  This is **not** the number of glyphs that are in
        the string.

    .. note::

        This function can be wrong sometimes because Unicode does not specify
        a strict width value for all of the :term:`code points`.  In
        particular, we've found that some Tamil characters take up to four
        character cells but we return a lesser amount.
    '''
    # On python 2.6.4, x86_64, I've benchmarked a few alternate
    # implementations::
    #
    #   timeit.repeat('display.textual_width(data)',
    #       'from __main__ import display, data', number=100)
    # I varied data by size and content (1MB of ascii, a few words, 43K utf8,
    # unicode type
    #
    # :this implementation: fastest across the board
    #
    # :list comprehension: 6-16% slower
    #   return sum([_ucp_width(ord(c), control_chars=control_chars)
    #       for c in msg])
    #
    # :generator expression: 9-18% slower
    #   return sum((_ucp_width(ord(c), control_chars=control_chars) for c in
    #       msg))
    #
    # :lambda: 10-19% slower
    #   return sum(itertools.imap(lambda x: _ucp_width(ord(x), control_chars),
    #       msg))
    #
    # :partial application: 13-22% slower
    #   func = functools.partial(_ucp_width, control_chars=control_chars)
    #   return sum(itertools.imap(func, itertools.imap(ord, msg)))
    #
    # :the original code: 4-38% slower
    #   The 4% was for the short, ascii only string.  All the other pieces of
    #   data yielded over 30% slower times.

    # Non decodable data is just assigned a single cell width
    msg = to_unicode(msg, encoding=encoding, errors=errors)
    # Add the width of each char
    return sum(
            # calculate width of each char
            itertools.starmap(_ucp_width,
                # Setup the arguments to _ucp_width
                _izip(
                    # int value of each char
                    _imap(ord, msg),
                    # control_chars arg in a form that izip will deal with
                    itertools.repeat(control_chars))))


# Wholly rewritten by me -Toshio Kuratomi
def textual_width_chop(msg, chop, encoding='utf-8', errors='replace'):
    '''Given a string, return it chopped to a given :term:`textual width`

    :arg msg: :class:`unicode` string or byte :class:`str` to chop
    :arg chop: Chop :attr:`msg` if it exceeds this :term:`textual width`
    :kwarg encoding: If we are given a byte :class:`str`, this is used to
        decode it into a :class:`unicode` string.  Any characters that are not
        decodable in this encoding will be assigned a width of one.
    :kwarg errors: How to treat errors encoding the byte :class:`str` to
        :class:`unicode`.  Legal values are the same as for
        :func:`kitchen.text.converters.to_unicode`
    :rtype: :class:`unicode` string
    :returns: :class:`unicode` string of the :attr:`msg` chopped at the given
        :term:`textual width`

    This is what you want to use instead of ``%.*s``, as it does the "right"
    thing with regard to :term:`UTF-8` sequences, :term:`control characters`,
    and characters that take more than one cell position. Eg::

        >>> # Wrong: only displays 8 characters because it is operating on bytes
        >>> print "%.*s" % (10, 'café ñunru!')
        café ñun
        >>> # Properly operates on graphemes
        >>> '%s' % (textual_width_chop('café ñunru!', 10))
        café ñunru
        >>> # takes too many columns because the kanji need two cell positions
        >>> print '1234567890\\n%.*s' % (10, u'一二三四五六七八九十')
        1234567890
        一二三四五六七八九十
        >>> # Properly chops at 10 columns
        >>> print '1234567890\\n%s' % (textual_width_chop(u'一二三四五六七八九十', 10))
        1234567890
        一二三四五

    '''
    msg = to_unicode(msg, encoding=encoding, errors=errors)

    width = textual_width(msg)
    if width <= chop:
        return msg
    maximum = len(msg)
    if maximum > chop * 2:
        # A character can take at most 2 cell positions so this is the actual
        # maximum
        maximum = chop * 2
    minimum = 0
    # Start by assuming one cell per character
    eos = maximum
    if eos > chop:
        eos = chop
    width = textual_width(msg[:eos])

    while True:
        # if current width is high,
        if width > chop:
            # calculate new midpoint
            mid = minimum + (eos - minimum) // 2
            if mid == eos:
                break
            if (eos - chop) < (eos - mid):
                # Close enough: drop characters off the end until we fit
                while width > chop:
                    width = width - _ucp_width(ord(msg[eos-1]))
                    eos -= 1
                return msg[:eos]
            # subtract distance between eos and mid from width
            width = width - textual_width(msg[mid:eos])
            maximum = eos
            eos = mid
        # if current width is low,
        elif width < chop:
            # Note: we get here when the first characters are narrower than
            # one cell each on average (zero width characters such as
            # combining marks) so that more of them fit into the field.

            # calculate new midpoint
            mid = eos + (maximum - eos) // 2
            if mid == eos:
                break
            if (chop - eos) < (mid - eos):
                # Close enough: add characters onto the end while they fit
                while width < chop:
                    new_width = _ucp_width(ord(msg[eos]))
                    if width + new_width > chop:
                        # The next character (for instance a double width
                        # one with a single cell left) would overflow the
                        # requested width; stop in front of it.
                        break
                    width = width + new_width
                    eos += 1
                return msg[:eos]

            # add distance between eos and new mid to width
            width = width + textual_width(msg[eos:mid])
            minimum = eos
            eos = mid
            if eos > maximum:
                eos = maximum
                break
        # if current is just right
        else:
            return msg[:eos]
    return msg[:eos]


# I made some adjustments for using unicode but largely unchanged from JA's
# port of MK's code -Toshio
def textual_width_fill(msg, fill, chop=None, left=True, prefix='', suffix=''):
    r'''Expand a :class:`unicode` string to a specified :term:`textual width`
    or chop to same

    :arg msg: :class:`unicode` string to format
    :arg fill: pad string until the :term:`textual width` of the string is
        this length
    :kwarg chop: before doing anything else, chop the string to this length.
        Default: Don't chop the string at all
    :kwarg left: If :data:`True` (default) left justify the string and put the
        padding on the right.  If :data:`False`, pad on the left side.
    :kwarg prefix: Attach this string before the field we're filling
    :kwarg suffix: Append this string to the end of the field we're filling
    :rtype: :class:`unicode` string
    :returns: :attr:`msg` formatted to fill the specified width.  If no
        :attr:`chop` is specified, the string could exceed the fill length
        when completed.  If :attr:`prefix` or :attr:`suffix` are printable
        characters, the string could be longer than the fill width.

    .. note::

        :attr:`prefix` and :attr:`suffix` should be used for "invisible"
        characters like highlighting, color changing escape codes, etc.  The
        fill characters are appended outside of any :attr:`prefix` or
        :attr:`suffix` elements.  This allows you to only highlight
        :attr:`msg` inside of the field you're filling.

    .. warning::

        :attr:`msg`, :attr:`prefix`, and :attr:`suffix` should all be
        representable as unicode characters.  In particular, any escape
        sequences in :attr:`prefix` and :attr:`suffix` need to be convertible
        to :class:`unicode`.  If you need to use byte sequences here rather
        than unicode characters, use
        :func:`~kitchen.text.display.byte_string_textual_width_fill` instead.

    This function expands a string to fill a field of a particular
    :term:`textual width`.  Use it instead of ``%*.*s``, as it does the
    "right" thing with regard to :term:`UTF-8` sequences, :term:`control
    characters`, and characters that take more than one cell position in
    a display.  Example usage::

        >>> msg = u'一二三四五六七八九十'
        >>> # Wrong: This uses 10 characters instead of 10 cells:
        >>> u":%-*.*s:" % (10, 10, msg[:9])
        :一二三四五六七八九 :
        >>> # This uses 10 cells like we really want:
        >>> u":%s:" % (textual_width_fill(msg[:9], 10, 10))
        :一二三四五:

        >>> # Wrong: Right aligned in the field, but too many cells
        >>> u"%20.10s" % (msg)
                  一二三四五六七八九十
        >>> # Correct: Right aligned with proper number of cells
        >>> u"%s" % (textual_width_fill(msg, 20, 10, left=False))
                  一二三四五

        >>> # Wrong: Adding some escape characters to highlight the line but too many cells
        >>> u"%s%20.10s%s" % (prefix, msg, suffix)
        u'\x1b[7m          一二三四五六七八九十\x1b[0m'
        >>> # Correct highlight of the line
        >>> u"%s%s%s" % (prefix, display.textual_width_fill(msg, 20, 10, left=False), suffix)
        u'\x1b[7m          一二三四五\x1b[0m'

        >>> # Correct way to not highlight the fill
        >>> u"%s" % (display.textual_width_fill(msg, 20, 10, left=False, prefix=prefix, suffix=suffix))
        u'          \x1b[7m一二三四五\x1b[0m'
    '''
    msg = to_unicode(msg)
    if chop is not None:
        msg = textual_width_chop(msg, chop)
    width = textual_width(msg)
    if width >= fill:
        # Already wide enough: no padding, only decorate when asked to
        if prefix or suffix:
            msg = u''.join([prefix, msg, suffix])
    else:
        extra = u' ' * (fill - width)
        if left:
            msg = u''.join([prefix, msg, suffix, extra])
        else:
            msg = u''.join([extra, prefix, msg, suffix])
    return msg


def _textual_width_le(width, *args):
    r'''Optimize the common case when deciding which :term:`textual width` is
    larger

    :arg width: :term:`textual width` to compare against.
    :arg \*args: :class:`unicode` strings to check the total :term:`textual
        width` of
    :returns: :data:`True` if the total length of :attr:`args` are less than
        or equal to :attr:`width`.  Otherwise :data:`False`.

    We often want to know "does X fit in Y".  It takes a while to use
    :func:`textual_width` to calculate this.  However, we know that the number
    of canonically composed :class:`unicode` characters is always going to
    have 1 or 2 for the :term:`textual width` per character.  With this we can
    take the following shortcuts:

    1) If the number of canonically composed characters is more than width,
       the true :term:`textual width` cannot be less than width.
    2) If the number of canonically composed characters * 2 is less than the
       width then the :term:`textual width` must be ok.

    :term:`textual width` of a canonically composed :class:`unicode` string
    will always be greater than or equal to the the number of :class:`unicode`
    characters.  So we can first check if the number of composed
    :class:`unicode` characters is less than the asked for width.  If it is we
    can return :data:`True` immediately.  If not, then we must do a full
    :term:`textual width` lookup.
    '''
    string = u''.join(args)
    string = unicodedata.normalize('NFC', string)
    # NOTE: this first shortcut relies on every character left after
    # normalization being at least one cell wide.  _ucp_width() reports 0 or
    # -1 for control characters and for the _COMBINING code points (for
    # instance U+200B), so strings containing those can be reported as too
    # wide even though they would fit.
    if len(string) > width:
        return False
    elif len(string) * 2 <= width:
        return True
    elif len(to_bytes(string)) <= width:
        # Check against bytes.
        # utf8 has the property of having the same amount or more bytes per
        # character than textual width.
        return True
    else:
        true_width = textual_width(string)
        return true_width <= width


def wrap(text, width=70, initial_indent=u'', subsequent_indent=u'',
        encoding='utf-8', errors='replace'):
    '''Works like we want :func:`textwrap.wrap` to work,

    :arg text: :class:`unicode` string or byte :class:`str` to wrap
    :kwarg width: :term:`textual width` at which to wrap.  Default: 70
    :kwarg initial_indent: string to use to indent the first line.  Default:
        do not indent.
    :kwarg subsequent_indent: string to use to wrap subsequent lines.
        Default: do not indent
    :kwarg encoding: Encoding to use if :attr:`text` is a byte :class:`str`
    :kwarg errors: error handler to use if :attr:`text` is a byte :class:`str`
        and contains some undecodable characters.
    :rtype: :class:`list` of :class:`unicode` strings
    :returns: list of lines that have been text wrapped and indented.

    :func:`textwrap.wrap` from the |stdlib|_ has two drawbacks that this
    attempts to fix:

    1. It does not handle :term:`textual width`.  It only operates on bytes or
       characters which are both inadequate (due to multi-byte and double
       width characters).
    2. It malforms lists and blocks.
    '''
    # Tested with:
    # yum info robodoc gpicview php-pear-Net-Socket wmctrl ustr moreutils
    #          mediawiki-HNP ocspd insight yum mousepad
    # ...at 120, 80 and 40 chars.
    # Also, notable among lots of others, searching for "\n  ":
    #   exim-clamav, jpackage-utils, tcldom, synaptics, "quake3",
    #   perl-Class-Container, ez-ipupdate, perl-Net-XMPP, "kipi-plugins",
    #   perl-Apache-DBI, netcdf, python-configobj, "translate-toolkit", alpine,
    #   "udunits", "conntrack-tools"
    #
    # Note that, we "fail" on:
    #   alsa-plugins-jack, setools*, dblatex, uisp, "perl-Getopt-GUI-Long",
    #   suitesparse, "synce-serial", writer2latex, xenwatch, ltsp-utils

    def _indent_at_beg(line):
        '''Return the indent to use for this and (possibly) subsequent lines

        :arg line: :class:`unicode` line of text to process
        :rtype: tuple
        :returns: tuple of count of whitespace before getting to the start of
            this line followed by a count to the following indent if this
            block of text is an entry in a list.
        '''
        # Find the first non-whitespace character
        try:
            char = line.strip()[0]
        except IndexError:
            # All whitespace
            return 0, 0
        else:
            count = line.find(char)

        # if we have a bullet character, check for list
        if char not in u'-*.o\u2022\u2023\u2218':
            # No bullet; not a list
            return count, 0

        # List: Keep searching until we hit the innermost list
        nxt = _indent_at_beg(line[count+1:])
        nxt = nxt[1] or nxt[0]
        if nxt:
            return count, count + 1 + nxt
        return count, 0

    initial_indent = to_unicode(initial_indent, encoding=encoding,
            errors=errors)
    subsequent_indent = to_unicode(subsequent_indent, encoding=encoding,
            errors=errors)
    subsequent_indent_width = textual_width(subsequent_indent)

    text = to_unicode(text, encoding=encoding, errors=errors).rstrip(u'\n')
    lines = text.expandtabs().split(u'\n')

    ret = []
    # Text that starts the next output line.  While we are in the middle of
    # wrapping a paragraph it also holds the words carried over so far.
    indent = initial_indent
    # True when the previous input line had to be wrapped
    wrap_last = False
    # "Spaces at beginning" and list indent of the current input line
    cur_sab = 0
    cur_spc_indent = 0
    for line in lines:
        line = line.rstrip(u' ')
        (last_sab, last_spc_indent) = (cur_sab, cur_spc_indent)
        (cur_sab, cur_spc_indent) = _indent_at_beg(line)
        force_nl = False # We want to stop wrapping under "certain" conditions:
        if wrap_last and cur_spc_indent:      # if line starts a list or
            force_nl = True
        if wrap_last and cur_sab == len(line):# is empty line
            force_nl = True
        if wrap_last and not last_spc_indent: # if we don't continue a list
            if cur_sab >= 4 and cur_sab != last_sab: # and is "block indented"
                force_nl = True
        if force_nl:
            # Flush what was carried over from the previous line
            ret.append(indent.rstrip(u' '))
            indent = subsequent_indent
            wrap_last = False
        if cur_sab == len(line): # empty line, remove spaces to make it easier.
            line = u''
        if wrap_last:
            # Continue the paragraph we were wrapping
            line = line.lstrip(u' ')
            cur_spc_indent = last_spc_indent

        if _textual_width_le(width, indent, line):
            # The whole line fits; no wrapping needed
            wrap_last = False
            ret.append(indent + line)
            indent = subsequent_indent
            continue

        wrap_last = True
        words = line.split(u' ')
        line = indent
        spcs = cur_spc_indent
        if not spcs and cur_sab >= 4:
            spcs = cur_sab
        for word in words:
            # Only break when there is something besides the indent on the
            # line; a single overlong word stays on a line of its own.
            if (not _textual_width_le(width, line, word) and
                    textual_width(line) > subsequent_indent_width):
                ret.append(line.rstrip(u' '))
                line = subsequent_indent + u' ' * spcs
            line += word
            line += u' '
        indent = line.rstrip(u' ') + u' '
    if wrap_last:
        ret.append(indent.rstrip(u' '))

    return ret


def fill(text, *args, **kwargs):
    '''Works like we want :func:`textwrap.fill` to work

    :arg text: :class:`unicode` string or byte :class:`str` to process
    :returns: :class:`unicode` string with each line separated by a newline

    .. seealso::

        :func:`kitchen.text.display.wrap`
            for other parameters that you can give this command.

    This function is a light wrapper around :func:`kitchen.text.display.wrap`.
    Where that function returns a :class:`list` of lines, this function
    returns one string with each line separated by a newline.
    '''
    return u'\n'.join(wrap(text, *args, **kwargs))


#
# Byte strings
#

def byte_string_textual_width_fill(msg, fill, chop=None, left=True, prefix='',
        suffix='', encoding='utf-8', errors='replace'):
    '''Expand a byte :class:`str` to a specified :term:`textual width` or chop
    to same

    :arg msg: byte :class:`str` encoded in :term:`UTF-8` that we want formatted
    :arg fill: pad :attr:`msg` until the :term:`textual width` is this long
    :kwarg chop: before doing anything else, chop the string to this length.
        Default: Don't chop the string at all
    :kwarg left: If :data:`True` (default) left justify the string and put the
        padding on the right.  If :data:`False`, pad on the left side.
    :kwarg prefix: Attach this byte :class:`str` before the field we're
        filling
    :kwarg suffix: Append this byte :class:`str` to the end of the field we're
        filling
    :kwarg encoding: Encoding used to decode :attr:`msg` for measuring and
        chopping and to encode :attr:`msg`, :attr:`prefix` and :attr:`suffix`
        for the return value.  Default: ``utf-8``
    :kwarg errors: error handler used together with :attr:`encoding`.
        Default: ``replace``
    :rtype: byte :class:`str`
    :returns: :attr:`msg` formatted to fill the specified :term:`textual
        width`.  If no :attr:`chop` is specified, the string could exceed the
        fill length when completed.  If :attr:`prefix` or :attr:`suffix` are
        printable characters, the string could be longer than fill width.

    .. note::

        :attr:`prefix` and :attr:`suffix` should be used for "invisible"
        characters like highlighting, color changing escape codes, etc.  The
        fill characters are appended outside of any :attr:`prefix` or
        :attr:`suffix` elements.  This allows you to only highlight
        :attr:`msg` inside of the field you're filling.

    .. seealso::

        :func:`~kitchen.text.display.textual_width_fill`
            For example usage.  This function has only two differences.

            1. it takes byte :class:`str` for :attr:`prefix` and
               :attr:`suffix` so you can pass in arbitrary sequences of
               bytes, not just unicode characters.
            2. it returns a byte :class:`str` instead of a :class:`unicode`
               string.
    '''
    prefix = to_bytes(prefix, encoding=encoding, errors=errors)
    suffix = to_bytes(suffix, encoding=encoding, errors=errors)

    if chop is not None:
        msg = textual_width_chop(msg, chop, encoding=encoding, errors=errors)
    # Measure and re-encode with the caller's encoding so that the width is
    # right for non UTF-8 input and the result is in a single encoding
    width = textual_width(msg, encoding=encoding, errors=errors)
    msg = to_bytes(msg, encoding=encoding, errors=errors)

    if width >= fill:
        # Already wide enough: no padding, only decorate when asked to
        if prefix or suffix:
            msg = prefix + msg + suffix
    else:
        extra = _BYTE_SPACE * (fill - width)
        if left:
            msg = prefix + msg + suffix + extra
        else:
            msg = extra + prefix + msg + suffix

    return msg


__all__ = ('byte_string_textual_width_fill', 'fill', 'textual_width',
        'textual_width_chop', 'textual_width_fill', 'wrap')