372 lines
14 KiB
Python
372 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright (c) 2012 Red Hat, Inc
|
|
# Copyright (c) 2010 Seth Vidal
|
|
#
|
|
# kitchen is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU Lesser General Public
|
|
# License as published by the Free Software Foundation; either
|
|
# version 2.1 of the License, or (at your option) any later version.
|
|
#
|
|
# kitchen is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# Lesser General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU Lesser General Public
|
|
# License along with kitchen; if not, see <http://www.gnu.org/licenses/>
|
|
#
|
|
# Authors:
|
|
# James Antill
|
|
# Toshio Kuratomi <toshio@fedoraproject.org>
|
|
# Seth Vidal
|
|
#
|
|
# Portions of this code taken from yum/misc.py and yum/i18n.py
|
|
'''
|
|
---------------------------------------------
|
|
Miscellaneous functions for manipulating text
|
|
---------------------------------------------
|
|
|
|
Collection of text functions that don't fit in another category.
|
|
|
|
.. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
|
|
Added :func:`~kitchen.text.misc.isbasestring`,
|
|
:func:`~kitchen.text.misc.isbytestring`, and
|
|
:func:`~kitchen.text.misc.isunicodestring` to help tell which string type
|
|
is which on python2 and python3
|
|
'''
|
|
import htmlentitydefs
|
|
import itertools
|
|
import re
|
|
|
|
try:
|
|
import chardet
|
|
except ImportError:
|
|
chardet = None
|
|
|
|
from kitchen.pycompat24 import sets
|
|
from kitchen.text.exceptions import ControlCharError
|
|
|
|
sets.add_builtin_set()
|
|
|
|
# Minimum confidence chardet must report before its guess is trusted.
# Below this value guess_encoding() falls back to latin-1.
_CHARDET_THRESHHOLD = 0.6

# Code points that are illegal in xml 1.0:
# * the C0 (ASCII) control codes, except tab (9), newline (10) and carriage
#   return (13)
# * the C1 (unicode) control codes, 128 through 159
_CONTROL_CODES = frozenset(itertools.chain(
    range(0, 8),
    (11, 12),
    range(14, 32),
    range(128, 160),
))
# The same code points as one-character unicode strings
_CONTROL_CHARS = frozenset(unichr(code) for code in _CONTROL_CODES)
# unicode.translate() tables: None deletes the character, u'?' replaces it
_IGNORE_TABLE = dict.fromkeys(_CONTROL_CODES)
_REPLACE_TABLE = dict.fromkeys(_CONTROL_CODES, u'?')

# Matches either a whole markup tag (<...>, may span lines) or a character
# or entity reference (&name; &#123; &#x7b;)
_ENTITY_RE = re.compile(r'(?s)<[^>]*>|&#?\w+;')
|
|
|
|
def isbasestring(obj):
    '''Tell whether obj is any kind of string (byte :class:`str` or
    :class:`unicode`)

    This is a thin wrapper around ``isinstance(obj, basestring)``.  It exists
    as a porting aid: code that used to test against :class:`basestring`
    directly (commonly unicode-bytes conversion functions) can call this
    function instead.

    :arg obj: Object to test
    :returns: True if the object is a :class:`basestring`.  Otherwise False.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, basestring)
|
|
|
|
def isbytestring(obj):
    '''Tell whether obj is a byte :class:`str`

    This is a thin wrapper around ``isinstance(obj, str)``.

    :arg obj: Object to test
    :returns: True if the object is a byte :class:`str`.  Otherwise, False.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, str)
|
|
|
|
def isunicodestring(obj):
    '''Tell whether obj is a :class:`unicode` string

    This is a thin wrapper around ``isinstance(obj, unicode)``.

    :arg obj: Object to test
    :returns: True if the object is a :class:`unicode` string.  Otherwise,
        False.

    .. versionadded:: Kitchen: 1.2.0, API kitchen.text 2.2.0
    '''
    return isinstance(obj, unicode)
|
|
|
|
def guess_encoding(byte_string, disable_chardet=False):
    '''Make a best-effort guess at the encoding of a byte :class:`str`

    :arg byte_string: byte :class:`str` whose encoding should be guessed
    :kwarg disable_chardet: When True, :mod:`chardet` is never consulted,
        even if it is installed.  Use this when the result must be the same
        whether or not :mod:`chardet` is available.
        Default: :data:`False`.
    :raises TypeError: if :attr:`byte_string` is not a byte :class:`str` type
    :returns: name of the guessed encoding.  The value is suitable for use as
        the encoding argument when encoding and decoding unicode strings.

    The guess is made in three steps:

    1. If :attr:`byte_string` decodes cleanly as :term:`UTF-8`, the answer is
       ``utf-8``.
    2. Otherwise, if :mod:`chardet` was importable and
       :attr:`disable_chardet` is False, :mod:`chardet` is asked.  Its answer
       is used when it names an encoding with a confidence of at least
       ``_CHARDET_THRESHHOLD``.
    3. Otherwise the answer is ``latin-1``.  This is an arbitrary choice, but
       every byte is valid in ``latin-1``, so decoding with it will not cause
       :exc:`UnicodeErrors` although the output might be mangled.
    '''
    if not isbytestring(byte_string):
        raise TypeError('first argument must be a byte string (str)')

    # Step 1: a clean strict decode means we call it utf-8
    try:
        unicode(byte_string, 'utf-8', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'utf-8'

    # Step 2: chardet is None when the optional import failed
    if chardet and not disable_chardet:
        guess = chardet.detect(byte_string)
        if guess['confidence'] >= _CHARDET_THRESHHOLD and guess['encoding']:
            return guess['encoding']

    # Step 3: nothing better is known
    return 'latin-1'
|
|
|
|
def str_eq(str1, str2, encoding='utf-8', errors='replace'):
    '''Test two strings for equality, encoding to byte :class:`str` when one
    of them is :class:`unicode`

    :arg str1: First string to compare
    :arg str2: Second string to compare
    :kwarg encoding: Encoding used when one of the strings has to be turned
        into a byte :class:`str` for the comparison.  Default is
        :term:`utf-8`.
    :kwarg errors: What to do if we encounter errors when encoding the string.
        See the :func:`kitchen.text.converters.to_bytes` documentation for
        possible values.  The default is ``replace``.
    :returns: True if the strings compare equal.  Otherwise False.

    Comparing a :class:`unicode` string to a byte :class:`str` can trigger
    :exc:`UnicodeError` (python-2.4 or less) or :exc:`UnicodeWarning` (python
    2.5 and higher) because the implicit conversion is done with
    :term:`ASCII`.  This function avoids that by doing the conversion
    explicitly with :term:`utf-8` or another encoding of your choice.

    .. note::

        When a conversion is needed it is always the :class:`unicode` string
        that is encoded into a byte :class:`str`.  That means that strings
        can compare differently if you use different encodings for each.

    Note that ``str1 == str2`` is faster than this function if you can accept
    the following limitations:

    * Limited to python-2.5+ (otherwise a :exc:`UnicodeDecodeError` may be
      thrown)
    * Will generate a :exc:`UnicodeWarning` if non-:term:`ASCII` byte
      :class:`str` is compared to :class:`unicode` string.
    '''
    # Fast path: equal means neither smaller nor larger.  The ordering
    # operators are used rather than == and any UnicodeError they raise sends
    # us to the explicit conversion below.
    try:
        if str1 < str2:
            return False
        return not str1 > str2
    except UnicodeError:
        pass

    # Slow path: encode whichever side is unicode, then compare the bytes
    if isunicodestring(str1):
        left, right = str1.encode(encoding, errors), str2
    else:
        left, right = str1, str2.encode(encoding, errors)
    return bool(left == right)
|
|
|
|
def process_control_chars(string, strategy='replace'):
    '''Find :term:`control characters` in a string and deal with them

    :arg string: string to search for and transform :term:`control characters`
        within
    :kwarg strategy: XML does not allow :term:`ASCII` :term:`control
        characters`.  This selects what happens when one is found.
        Valid options are:

        :replace: (default) Replace the :term:`control characters`
            with ``"?"``
        :ignore: Remove the characters altogether from the output
        :strict: Raise a :exc:`~kitchen.text.exceptions.ControlCharError` when
            we encounter a control character
    :raises TypeError: if :attr:`string` is not a unicode string.
    :raises ValueError: if the strategy is not one of replace, ignore, or
        strict.
    :raises kitchen.text.exceptions.ControlCharError: if the strategy is
        ``strict`` and a :term:`control character` is present in the
        :attr:`string`
    :returns: :class:`unicode` string with no :term:`control characters` in
        it.

    .. versionchanged:: kitchen 1.2.0, API: kitchen.text 2.2.0
        Strip out the C1 control characters in addition to the C0 control
        characters.
    '''
    if not isunicodestring(string):
        raise TypeError('process_control_char must have a unicode type as'
                ' the first argument.')
    if strategy not in ('replace', 'ignore', 'strict'):
        raise ValueError('The strategy argument to process_control_chars'
                ' must be one of ignore, replace, or strict')

    # Most strings don't have control chars and translating carries
    # a higher cost than testing whether the chars are in the string,
    # so clean strings are handed back untouched
    if _CONTROL_CHARS.isdisjoint(string):
        return string

    if strategy == 'strict':
        raise ControlCharError('ASCII control code present in string'
                ' input')
    if strategy == 'ignore':
        return string.translate(_IGNORE_TABLE)
    # Only 'replace' is left
    return string.translate(_REPLACE_TABLE)
|
|
|
|
# Originally written by Fredrik Lundh (January 15, 2003) and placed in the
|
|
# public domain::
|
|
#
|
|
# Unless otherwise noted, source code can be be used freely. Examples, test
|
|
# scripts and other short code fragments can be considered as being in the
|
|
# public domain.
|
|
#
|
|
# http://effbot.org/zone/re-sub.htm#unescape-html
|
|
# http://effbot.org/zone/copyright.htm
|
|
#
|
|
def html_entities_unescape(string):
    '''Substitute unicode characters for HTML entities

    Markup tags (``<...>``) are removed.  Numeric character references
    (``&#65;``, ``&#x41;``) and named entities known to
    :mod:`htmlentitydefs` (``&amp;``) are replaced by the character they
    stand for.  Anything that cannot be converted is left in the output
    unchanged.

    :arg string: :class:`unicode` string to substitute out html entities
    :raises TypeError: if something other than a :class:`unicode` string is
        given
    :rtype: :class:`unicode` string
    :returns: The plain text without html entities
    '''
    def fixup(match):
        '''Return the replacement text for one tag or entity match'''
        text = match.group(0)
        if text[:1] == u"<":
            # Tags are dropped from the output
            return u""
        if text[:2] == u"&#":
            # Numeric character reference, hexadecimal or decimal
            try:
                if text[:3] == u"&#x":
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the unicode codepoint
                # range.  OverflowError: the number is too large for unichr
                # to accept at all.  Either way, leave it in the output as is
                pass
        elif text[:1] == u"&":
            # Named entity
            entity = htmlentitydefs.entitydefs.get(text[1:-1].encode('utf-8'))
            if entity:
                if entity[:2] == "&#":
                    # The table maps some names to a numeric reference
                    try:
                        return unichr(int(entity[2:-1]))
                    except (ValueError, OverflowError):
                        # If the value is outside the unicode codepoint
                        # range, leave it in the output as is
                        pass
                else:
                    # The remaining table values are latin-1 byte strings
                    return unicode(entity, "iso-8859-1")
        return text # leave as is

    if not isunicodestring(string):
        raise TypeError('html_entities_unescape must have a unicode type'
                ' for its first argument')
    return _ENTITY_RE.sub(fixup, string)
|
|
|
|
def byte_string_valid_xml(byte_string, encoding='utf-8'):
    '''Tell whether a byte :class:`str` could be placed in an xml file as is

    :arg byte_string: Byte :class:`str` to check
    :arg encoding: Encoding of the xml file.  Default: :term:`UTF-8`
    :returns: :data:`True` if the string is valid.  :data:`False` if it would
        be invalid in the xml file

    The string is considered invalid when it is not a byte :class:`str`, when
    it cannot be decoded with :attr:`encoding`, or when it contains any of
    the control characters that xml forbids.

    This is handy when you have a whole bunch of byte strings and, rather
    than transforming them to :class:`unicode` and back to byte :class:`str`
    for output to xml, you just want to make sure they work with the xml file
    you're constructing.  Example::

        ARRAY_OF_MOSTLY_UTF8_STRINGS = [...]
        processed_array = []
        for string in ARRAY_OF_MOSTLY_UTF8_STRINGS:
            if byte_string_valid_xml(string, 'utf-8'):
                processed_array.append(string)
            else:
                processed_array.append(guess_bytes_to_xml(string, encoding='utf-8'))
        output_xml(processed_array)
    '''
    # Anything that is not a byte string fails straight away
    if not isbytestring(byte_string):
        return False

    try:
        decoded = unicode(byte_string, encoding)
    except UnicodeError:
        # The bytes are not valid in the xml file's encoding
        return False

    # Valid only if none of the forbidden control characters is present
    return _CONTROL_CHARS.isdisjoint(decoded)
|
|
|
|
def byte_string_valid_encoding(byte_string, encoding='utf-8'):
    '''Tell whether a byte :class:`str` decodes cleanly in a given encoding

    :arg byte_string: Byte :class:`str` to test for bytes not valid in this
        encoding
    :kwarg encoding: encoding to test against.  Defaults to :term:`UTF-8`.
    :returns: :data:`True` if the whole string can be decoded with
        :attr:`encoding`.  :data:`False` if decoding fails.

    .. note::

        This function checks whether the byte :class:`str` is valid in the
        specified encoding.  It **does not** detect whether the byte
        :class:`str` actually was encoded in that encoding.  If you want that
        sort of functionality, you probably want to use
        :func:`~kitchen.text.misc.guess_encoding` instead.
    '''
    try:
        unicode(byte_string, encoding)
    except UnicodeError:
        # At least one byte sequence is not valid in this encoding
        return False
    else:
        return True
|
|
|
|
# Public API of this module, in alphabetical order
__all__ = (
    'byte_string_valid_encoding',
    'byte_string_valid_xml',
    'guess_encoding',
    'html_entities_unescape',
    'isbasestring',
    'isbytestring',
    'isunicodestring',
    'process_control_chars',
    'str_eq',
)
|