130 lines
7.1 KiB
Python
130 lines
7.1 KiB
Python
|
# -*- coding: utf-8 -*-
|
|||
|
#
|
|||
|
# Base class for testing unicode and utf8 functions. This holds data that's
|
|||
|
# useful for making tests
|
|||
|
|
|||
|
import re
|
|||
|
|
|||
|
from kitchen.text.converters import to_bytes
|
|||
|
from kitchen.text import misc
|
|||
|
|
|||
|
class UnicodeTestData(object):
    """Shared unicode / byte-string fixtures for the text conversion tests.

    Every attribute is computed once, when the class body executes.  Names
    carry a prefix describing the value: ``u_`` for unicode strings (or
    lists of them) and ``b_`` / ``utf8_`` / ``latin1_`` / ``euc_jp_`` /
    ``ascii_`` for byte strings in that encoding.  A ``mangled_X_as_Y``
    infix means bytes in charset X were decoded as charset Y.

    The class body is Python 2 code: it uses the ``unicode`` and ``unichr``
    builtins and expects plain ``'...'`` literals to be byte strings.
    """

    # This should encode fine -- sanity check
    u_ascii = u'the quick brown fox jumped over the lazy dog'
    b_ascii = 'the quick brown fox jumped over the lazy dog'

    # First challenge -- what happens with latin-1 characters
    u_spanish = u'El veloz murciélago saltó sobre el perro perezoso.'
    # utf8 and latin1 both support these chars so no mangling
    utf8_spanish = u_spanish.encode('utf8')
    latin1_spanish = u_spanish.encode('latin1')

    # ASCII does not have the accented characters so it mangles
    ascii_mangled_spanish_as_ascii = u_spanish.encode('ascii', 'replace')
    # Attempting to decode using the wrong charset will mangle
    # Note: as a general principle, we do not want to have code that mangles
    # input of one charset and output of the same charset.  We want to avoid
    # things like::
    #   input latin-1, transform to unicode with utf-8, output latin-1.
    u_mangled_spanish_utf8_as_latin1 = unicode(utf8_spanish, encoding='latin1', errors='replace')
    u_mangled_spanish_utf8_as_ascii = unicode(utf8_spanish, encoding='ascii', errors='replace')
    u_mangled_spanish_latin1_as_ascii = unicode(latin1_spanish, encoding='ascii', errors='replace')
    u_mangled_spanish_latin1_as_utf8 = unicode(latin1_spanish, encoding='utf-8', errors='replace')
    ascii_twice_mangled_spanish_latin1_as_utf8_as_ascii = u_mangled_spanish_latin1_as_utf8.encode('ascii', 'replace')
    utf8_mangled_spanish_latin1_as_utf8 = u_mangled_spanish_latin1_as_utf8.encode('utf-8')
    u_spanish_ignore = unicode(latin1_spanish, encoding='utf8', errors='ignore')

    # Second challenge -- text with no latin-1 representation at all
    u_japanese = u"速い茶色のキツネが怠惰な犬に'増"
    utf8_japanese = u_japanese.encode('utf8')
    euc_jp_japanese = u_japanese.encode('euc_jp')
    u_mangled_euc_jp_as_latin1 = unicode(euc_jp_japanese, 'latin1')
    u_mangled_euc_jp_as_utf8 = unicode(euc_jp_japanese, 'utf-8', 'replace')
    utf8_mangled_euc_jp_as_latin1 = u_mangled_euc_jp_as_latin1.encode('utf8')
    u_mangled_japanese_utf8_as_latin1 = unicode(utf8_japanese, 'latin1')
    # Computed the same way as u_mangled_spanish_utf8_as_ascii: the ascii
    # decoder substitutes U+FFFD for every non-ascii byte.  The previous
    # hand-written literal held "<EFBFBD>" placeholder text (the hex of
    # UTF-8 encoded U+FFFD) instead of the replacement characters themselves.
    u_mangled_japanese_utf8_as_ascii = unicode(utf8_japanese, encoding='ascii', errors='replace')
    ascii_mangled_japanese_replace_as_latin1 = "??????????????'?"
    latin1_mangled_japanese_replace_as_latin1 = "??????????????'?"

    # Kana interleaved with their ascii romanisation
    u_mixed = u'く ku ら ra と to み mi'
    utf8_mixed = u_mixed.encode('utf8')
    # The individual kana sit at the even indexes of u_mixed
    utf8_ku = u_mixed[0].encode('utf8')
    utf8_ra = u_mixed[2].encode('utf8')
    utf8_to = u_mixed[4].encode('utf8')
    utf8_mi = u_mixed[6].encode('utf8')

    u_mixed_replace = u'\ufffd ku \ufffd ra \ufffd to \ufffd mi'
    # NOTE(review): dropping the kana from u_mixed would leave two spaces
    # between the romanised words; the single spaces in the two *_ignore
    # literals may be whitespace collapsed during extraction -- confirm.
    u_mixed_ignore = u' ku ra to mi'
    latin1_mixed_replace = '? ku ? ra ? to ? mi'
    latin1_mixed_ignore = ' ku ra to mi'

    # Markup-escaping fixtures
    u_entity = u'Test: <"&"> – ' + u_japanese + u'é'
    utf8_entity = u_entity.encode('utf8')
    # NOTE(review): the next three literals still contain raw <, ", & and >
    # (and utf8_entity_escape equals utf8_attrib_escape); they look like
    # their entity references were decoded when this file was extracted from
    # an HTML view -- confirm against the upstream file before trusting them.
    u_entity_escape = u'Test: <"&"> – ' + unicode(u_japanese.encode('ascii', 'xmlcharrefreplace'), 'ascii') + u'é'
    utf8_entity_escape = 'Test: <"&"> – 速い茶色のキツネが怠惰な犬に\'増é'
    utf8_attrib_escape = 'Test: <"&"> – 速い茶色のキツネが怠惰な犬に\'増é'
    # xmlcharrefreplace turns every non-ascii character into a "&#NNN;"
    # reference.  Only the first '&' is then escaped: it is the literal
    # ampersand in the text, every later one opens a character reference.
    ascii_entity_escape = (
        (u'Test: <"&"> – ' + u_japanese + u'é')
        .encode('ascii', 'xmlcharrefreplace')
        .replace('&', '&amp;', 1)
        .replace('<', '&lt;')
        .replace('>', '&gt;')
    )

    # All 256 byte values separated by single spaces, and the same data
    # encoded as base64
    b_byte_chars = ' '.join(map(chr, range(0, 256)))
    b_byte_encoded = 'ACABIAIgAyAEIAUgBiAHIAggCSAKIAsgDCANIA4gDyAQIBEgEiATIBQgFSAWIBcgGCAZIBogGyAcIB0gHiAfICAgISAiICMgJCAlICYgJyAoICkgKiArICwgLSAuIC8gMCAxIDIgMyA0IDUgNiA3IDggOSA6IDsgPCA9ID4gPyBAIEEgQiBDIEQgRSBGIEcgSCBJIEogSyBMIE0gTiBPIFAgUSBSIFMgVCBVIFYgVyBYIFkgWiBbIFwgXSBeIF8gYCBhIGIgYyBkIGUgZiBnIGggaSBqIGsgbCBtIG4gbyBwIHEgciBzIHQgdSB2IHcgeCB5IHogeyB8IH0gfiB/IIAggSCCIIMghCCFIIYghyCIIIkgiiCLIIwgjSCOII8gkCCRIJIgkyCUIJUgliCXIJggmSCaIJsgnCCdIJ4gnyCgIKEgoiCjIKQgpSCmIKcgqCCpIKogqyCsIK0griCvILAgsSCyILMgtCC1ILYgtyC4ILkguiC7ILwgvSC+IL8gwCDBIMIgwyDEIMUgxiDHIMggySDKIMsgzCDNIM4gzyDQINEg0iDTINQg1SDWINcg2CDZINog2yDcIN0g3iDfIOAg4SDiIOMg5CDlIOYg5yDoIOkg6iDrIOwg7SDuIO8g8CDxIPIg8yD0IPUg9iD3IPgg+SD6IPsg/CD9IP4g/w=='

    # Matches a default object repr ("<pkg.mod.Name object at 0x...>") and
    # captures the final dotted component.  Raw string so that "\." reaches
    # the regex engine without relying on an invalid string escape.
    repr_re = re.compile(r'^<[^ ]*\.([^.]+) object at .*>$')

    # Paragraph fixture and its expected line-by-line output.
    # NOTE(review): the *_out lists show the bullet lines starting with
    # whitespace while the paragraph below shows none, and every run of
    # spaces is a single space; leading/repeated whitespace inside these
    # literals was probably collapsed during extraction -- confirm.
    u_paragraph = u'''ConfigObj is a simple but powerful config file reader and writer: an ini file
round tripper. Its main feature is that it is very easy to use, with a
straightforward programmer's interface and a simple syntax for config files.
It has lots of other features though:



* Nested sections (subsections), to any level
* List values
* Multiple line values
* String interpolation (substitution)
* Integrated with a powerful validation system
o including automatic type checking/conversion
o repeated sections
o and allowing default values
* All comments in the file are preserved
* The order of keys/sections is preserved
* No external dependencies
* Full Unicode support
* A powerful unrepr mode for storing basic datatypes
'''
    utf8_paragraph = u_paragraph.encode('utf-8', 'replace')
    u_paragraph_out = [u'ConfigObj is a simple but powerful config file reader and writer: an',
                       u'ini file round tripper. Its main feature is that it is very easy to',
                       u"use, with a straightforward programmer's interface and a simple syntax",
                       u'for config files. It has lots of other features though:',
                       u'',
                       u'',
                       u'',
                       u' * Nested sections (subsections), to any level',
                       u' * List values',
                       u' * Multiple line values',
                       u' * String interpolation (substitution)',
                       u' * Integrated with a powerful validation system',
                       u' o including automatic type checking/conversion',
                       u' o repeated sections',
                       u' o and allowing default values',
                       u' * All comments in the file are preserved',
                       u' * The order of keys/sections is preserved',
                       u' * No external dependencies',
                       u' * Full Unicode support',
                       u' * A powerful unrepr mode for storing basic datatypes']

    utf8_paragraph_out = [line.encode('utf-8', 'replace') for line in u_paragraph_out]

    # Paragraph mixing kana with ascii, plus expected line-split outputs.
    # NOTE(review): presumably the "57_initial_subsequent" variant is a
    # width-57 wrap with initial/subsequent indents; the leading run of
    # spaces in its first line may also have been collapsed -- confirm.
    u_mixed_para = u'くらとみ kuratomi ' * 5
    utf8_mixed_para = u_mixed_para.encode('utf8')
    u_mixed_para_out = [u'くらとみ kuratomi くらとみ kuratomi くらとみ kuratomi くらとみ',
                        u'kuratomi くらとみ kuratomi']
    u_mixed_para_57_initial_subsequent_out = [u' くらとみ kuratomi くらとみ kuratomi くらとみ kuratomi',
                                              u'----くらとみ kuratomi くらとみ kuratomi']
    utf8_mixed_para_out = map(to_bytes, u_mixed_para_out)
    utf8_mixed_para_57_initial_subsequent_out = map(to_bytes, u_mixed_para_57_initial_subsequent_out)

    # Code points 0-255 separated by spaces, then the same text with the
    # code points listed in misc._CONTROL_CODES removed or turned into '?'
    u_ascii_chars = u' '.join(map(unichr, range(0, 256)))
    u_ascii_no_ctrl = u''.join([c for c in u_ascii_chars if ord(c) not in misc._CONTROL_CODES])
    u_ascii_ctrl_replace = u_ascii_chars.translate(dict([(c, u'?') for c in misc._CONTROL_CODES]))
    utf8_ascii_chars = u_ascii_chars.encode('utf8')
|