kitchen/kitchen2/kitchen/text/misc.py

372 lines
14 KiB
Python

# -*- coding: utf-8 -*-
# Copyright (c) 2012 Red Hat, Inc
# Copyright (c) 2010 Seth Vidal
#
# kitchen is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# kitchen is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with kitchen; if not, see <http://www.gnu.org/licenses/>
#
# Authors:
# James Antill
# Toshio Kuratomi <toshio@fedoraproject.org>
# Seth Vidal
#
# Portions of this code taken from yum/misc.py and yum/i18n.py
'''
---------------------------------------------
Miscellaneous functions for manipulating text
---------------------------------------------
Collection of text functions that don't fit in another category.
.. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
Added :func:`~kitchen.text.misc.isbasestring`,
:func:`~kitchen.text.misc.isbytestring`, and
:func:`~kitchen.text.misc.isunicodestring` to help tell which string type
is which on python2 and python3
'''
import htmlentitydefs
import itertools
import re
try:
import chardet
except ImportError:
chardet = None
from kitchen.pycompat24 import sets
from kitchen.text.exceptions import ControlCharError
sets.add_builtin_set()
# Define a threshold for chardet confidence. If we fall below this we decode
# byte strings we're guessing about as latin1
_CHARDET_THRESHHOLD = 0.6
# ASCII control codes (the c0 codes) that are illegal in xml 1.0
# Also unicode control codes (the C1 codes): also illegal in xml
_CONTROL_CODES = frozenset(range(0, 8) + [11, 12] + range(14, 32) + range(128, 160))
_CONTROL_CHARS = frozenset(itertools.imap(unichr, _CONTROL_CODES))
_IGNORE_TABLE = dict(zip(_CONTROL_CODES, [None] * len(_CONTROL_CODES)))
_REPLACE_TABLE = dict(zip(_CONTROL_CODES, [u'?'] * len(_CONTROL_CODES)))
# _ENTITY_RE
_ENTITY_RE = re.compile(r'(?s)<[^>]*>|&#?\w+;')
def isbasestring(obj):
'''Determine if obj is a byte :class:`str` or :class:`unicode` string
In python2 this is eqiuvalent to isinstance(obj, basestring). In python3
it checks whether the object is an instance of str, bytes, or bytearray.
This is an aid to porting code that needed to test whether an object was
derived from basestring in python2 (commonly used in unicode-bytes
conversion functions)
:arg obj: Object to test
:returns: True if the object is a :class:`basestring`. Otherwise False.
.. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
'''
if isinstance(obj, basestring):
return True
return False
def isbytestring(obj):
    '''Report whether obj is a byte :class:`str`

    This implementation is equivalent to ``isinstance(obj, str)``.

    :arg obj: Object to test
    :returns: :data:`True` if the object is a byte :class:`str`.
        Otherwise, :data:`False`.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, str)
def isunicodestring(obj):
'''Determine if obj is a :class:`unicode` string
In python2 this is equivalent to isinstance(obj, unicode). In python3 it
checks whether the object is an instance of :class:`str`.
:arg obj: Object to test
:returns: True if the object is a :class:`unicode` string. Otherwise, False.
.. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
'''
if isinstance(obj, unicode):
return True
return False
def guess_encoding(byte_string, disable_chardet=False):
    '''Make a best guess at the encoding of a byte :class:`str`

    :arg byte_string: byte :class:`str` to guess the encoding of
    :kwarg disable_chardet: When :data:`True`, :mod:`chardet` is never
        consulted.  Use this when the answer has to be the same whether or
        not :mod:`chardet` is installed.  Default: :data:`False`.
    :raises TypeError: if :attr:`byte_string` is not a byte :class:`str` type
    :returns: name of the encoding we guessed for :attr:`byte_string`.  The
        value is suitable for use as the encoding argument when encoding
        and decoding unicode strings.

    The guess is made in three steps:

    1. If the bytes decode strictly as :term:`UTF-8` we report ``utf-8``.
    2. Otherwise, if :mod:`chardet` was importable and
       :attr:`disable_chardet` is false, we ask :mod:`chardet`.  Its answer
       is used when its confidence reaches the module's threshold and it
       actually named an encoding.
    3. Otherwise we rather arbitrarily report ``latin-1``.  Every byte maps
       to a character in ``latin-1`` so decoding with it never raises
       :exc:`UnicodeError` although the output might be mangled.
    '''
    if not isbytestring(byte_string):
        raise TypeError('first argument must be a byte string (str)')

    # Step 1: a clean strict decode means we call it UTF-8
    try:
        unicode(byte_string, 'utf-8', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'utf-8'

    # Step 2: let chardet have a go if we're allowed to use it
    if chardet and not disable_chardet:
        guess = chardet.detect(byte_string)
        if guess['confidence'] >= _CHARDET_THRESHHOLD and guess['encoding']:
            return guess['encoding']

    # Step 3: nothing convincing, fall back to an encoding that cannot fail
    return 'latin-1'
def str_eq(str1, str2, encoding='utf-8', errors='replace'):
    '''Test two strings for equality even when one is a byte :class:`str`
    and the other is :class:`unicode`

    :arg str1: First string to compare
    :arg str2: Second string to compare
    :kwarg encoding: Encoding to use if one of the strings has to be
        converted into a byte :class:`str` for the comparison.  Default is
        :term:`utf-8`.
    :kwarg errors: What to do if we encounter errors when encoding the
        string.  See the :func:`kitchen.text.converters.to_bytes`
        documentation for possible values.  The default is ``replace``.
    :returns: :data:`True` if the strings compare equal, otherwise
        :data:`False`

    Comparing a :class:`unicode` string with a byte :class:`str` normally
    converts via :term:`ASCII` which leads to :exc:`UnicodeError`
    (python-2.4 or less) or :exc:`UnicodeWarning` (python 2.5 and higher).
    This function avoids both by converting with :term:`utf-8` or another
    encoding of your choice instead.

    .. note::

        When a conversion is needed it is always the :class:`unicode`
        string that gets encoded into a byte :class:`str`.  That means the
        same pair of strings can compare differently depending on the
        encoding you pass in.

    A plain ``str1 == str2`` is faster than this function if you can live
    with these limitations:

    * Limited to python-2.5+ (otherwise a :exc:`UnicodeDecodeError` may be
      thrown)
    * Will generate a :exc:`UnicodeWarning` if non-:term:`ASCII` byte
      :class:`str` is compared to :class:`unicode` string.
    '''
    try:
        # Two strings are equal when neither one sorts before the other.
        # The ordering operators are used instead of ``==`` so that
        # a failed implicit conversion surfaces as a UnicodeError we can
        # catch.
        if str1 < str2:
            return False
        return not str1 > str2
    except UnicodeError:
        pass

    # The implicit conversion failed.  Encode whichever side is unicode and
    # compare the two as bytes.
    left, right = str1, str2
    if isunicodestring(left):
        left = left.encode(encoding, errors)
    else:
        right = right.encode(encoding, errors)
    return left == right
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and deal with them

    :arg string: :class:`unicode` string to search for :term:`control
        characters`
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters` so we need to be told how to handle any that turn up.
        Valid options are:

        :replace: (default) Replace the :term:`control characters`
            with ``"?"``
        :ignore: Remove the characters altogether from the output
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError`
            when we encounter a control character
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`unicode` string with no :term:`control characters` in
        it.

    .. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
        Strip out the C1 control characters in addition to the C0 control
        characters.
    '''
    if not isunicodestring(string):
        raise TypeError('process_control_char must have a unicode type as'
                ' the first argument.')
    if strategy not in ('replace', 'ignore', 'strict'):
        raise ValueError('The strategy argument to process_control_chars'
                ' must be one of ignore, replace, or strict')

    # Most strings have no control characters and checking for them is
    # cheaper than translating, so hand clean strings straight back
    if _CONTROL_CHARS.isdisjoint(string):
        return string

    if strategy == 'strict':
        raise ControlCharError('ASCII control code present in string'
                ' input')
    if strategy == 'ignore':
        return string.translate(_IGNORE_TABLE)
    # The strategy was validated above so only 'replace' is left
    return string.translate(_REPLACE_TABLE)
# Originally written by Fredrik Lundh (January 15, 2003) and placed in the
# public domain::
#
# Unless otherwise noted, source code can be be used freely. Examples, test
# scripts and other short code fragments can be considered as being in the
# public domain.
#
# http://effbot.org/zone/re-sub.htm#unescape-html
# http://effbot.org/zone/copyright.htm
#
def html_entities_unescape(string):
    '''Substitute unicode characters for HTML entities

    :arg string: :class:`unicode` string to substitute out html entities
    :raises TypeError: if something other than a :class:`unicode` string is
        given
    :rtype: :class:`unicode` string
    :returns: The plain text without html entities

    Three kinds of sequences are handled:

    * Markup tags (anything of the form ``<...>``) are removed from the
      output.
    * Numeric character references (``&#38;`` and ``&#x26;``) are replaced
      with the character they name.  References that do not name
      a character python can represent are left in the output unchanged.
    * Named entities (``&amp;``) are looked up in
      :data:`htmlentitydefs.entitydefs`.  Unknown names are left in the
      output unchanged.
    '''
    def fixup(match):
        '''Return the replacement text for a single tag or entity match'''
        text = match.group(0)
        if text[:1] == u"<":
            return u"" # ignore tags
        if text[:2] == u"&#":
            try:
                if text[:3] == u"&#x":
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: the digits are malformed or the value is
                # outside the unicode codepoint range.  OverflowError: the
                # value is too large to even be passed to unichr().  In
                # both cases leave it in the output as is.
                pass
        elif text[:1] == u"&":
            entity = htmlentitydefs.entitydefs.get(text[1:-1].encode('utf-8'))
            if entity:
                if entity[:2] == "&#":
                    # Some table entries are themselves numeric references
                    try:
                        return unichr(int(entity[2:-1]))
                    except (ValueError, OverflowError):
                        # If the value is outside the unicode codepoint
                        # range, leave it in the output as is
                        pass
                else:
                    # The rest of the table holds latin-1 encoded bytes
                    return unicode(entity, "iso-8859-1")
        return text # leave as is

    if not isunicodestring(string):
        raise TypeError('html_entities_unescape must have a unicode type'
                ' for its first argument')
    return _ENTITY_RE.sub(fixup, string)
def byte_string_valid_xml(byte_string, encoding='utf-8'):
    '''Check whether a byte :class:`str` could be placed in an xml file

    :arg byte_string: Byte :class:`str` to check
    :arg encoding: Encoding of the xml file.  Default: :term:`UTF-8`
    :returns: :data:`True` if the string is valid.  :data:`False` if it
        would be invalid in the xml file

    The string is rejected if it is not a byte :class:`str`, if it cannot
    be decoded using :attr:`encoding`, or if it contains any of the
    :term:`control characters` this module filters.

    This is handy when you have a whole bunch of byte strings and rather
    than transforming them all to :class:`unicode` and back to byte
    :class:`str` for output to xml, you just want to make sure they work
    with the xml file you're constructing.  Example::

        ARRAY_OF_MOSTLY_UTF8_STRINGS = [...]
        processed_array = []
        for string in ARRAY_OF_MOSTLY_UTF8_STRINGS:
            if byte_string_valid_xml(string, 'utf-8'):
                processed_array.append(string)
            else:
                processed_array.append(guess_bytes_to_xml(string, encoding='utf-8'))
        output_xml(processed_array)
    '''
    if not isbytestring(byte_string):
        # Not a byte string
        return False

    try:
        decoded = unicode(byte_string, encoding)
    except UnicodeError:
        # Not encoded with the xml file's encoding
        return False

    # Valid only if none of the characters is a control code
    return not _CONTROL_CHARS.intersection(decoded)
def byte_string_valid_encoding(byte_string, encoding='utf-8'):
'''Detect if a byte :class:`str` is valid in a specific encoding
:arg byte_string: Byte :class:`str` to test for bytes not valid in this
encoding
:kwarg encoding: encoding to test against. Defaults to :term:`UTF-8`.
:returns: :data:`True` if there are no invalid :term:`UTF-8` characters.
:data:`False` if an invalid character is detected.
.. note::
This function checks whether the byte :class:`str` is valid in the
specified encoding. It **does not** detect whether the byte
:class:`str` actually was encoded in that encoding. If you want that
sort of functionality, you probably want to use
:func:`~kitchen.text.misc.guess_encoding` instead.
'''
try:
unicode(byte_string, encoding)
except UnicodeError:
# Not encoded with the xml file's encoding
return False
# byte string is valid in this encoding
return True
# Public API of this module, in alphabetical order
__all__ = (
    'byte_string_valid_encoding',
    'byte_string_valid_xml',
    'guess_encoding',
    'html_entities_unescape',
    'isbasestring',
    'isbytestring',
    'isunicodestring',
    'process_control_chars',
    'str_eq',
)