miraculixx · November 22, 2021 15:57 · miraculixx · Jan 2, 2015 · miraculixx · Jan 2, 2015
diff --git a/asciidecodeerror.py b/asciidecodeerror.py
 # Python ascii codec can't decode and unicode mess
 #
 # check this out https://pythonhosted.org/kitchen/unicode-frustrations.html
 # and this http://www.joelonsoftware.com/articles/Unicode.html
 #
 # The short of it is this
 # 1. If you can, always set PYTHONIOENCODING=utf8 before you start your python programs.
 # 2. If you can't or you can't ensure this, always use the following lambda _u to get unicode text
 #    whereever you convert to strings (str.format, str % etc.)
 #
 # So for 2. you always do this:
 # _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
 # _uu = lambda *tt: tuple(_u(t) for t in tt)
 # # use like this
 # text='Some string with codes > 127, like Zürich'
 # print "Some unknown input %s" % _u(text)
 # print "Multiple inputs %s, %s" % _uu(text, text)
 # # or like this
 # print u"Some string with codecs > 127 {}".format(_u(text))
 # print u"Multiple inputs {}, {}".format(_uu(text, text))
 # ---
 # amazingly, you can assign UTF8 to strings, it works nicely
 print "==> We'll work with these strings"
 text = "Zürich"
 utext = u"Zürich"
 print text, type(text)
 print utext, type(utext)
 # will raise UnicodeDeocdeError
 try:
    print "==> Try as unicode(text) => error"
    print unicode(text)
 except UnicodeDecodeError as e: 
    print e
 # try again
 print "==> Try as with replacing unknown characters"
 print unicode(text, errors="replace")
 # again
 print "==> Try as with ignoring unknown characters"
 print unicode(text, errors='ignore')
 # again
 print "==> Try as with UTF-8 encoding"
 print unicode(text, encoding="UTF-8")
 # but this doesn't work!!
 try:
    print "But amazingly you can't encode unicode ext into UTF8"
    print unicode(utext, encoding="UTF-8")
 except Exception as e:
    print "because %s" %e
 # be smart
 print "==> So you want unicode in string formats, huh?"
 try:
    print u"%s" % text
 except UnicodeDecodeError as e:
    print "Not so fast! %s" % e
 # let's try that again
 print "==> Now for real using string formats"
 print u"%s" % unicode(text, 'UTF-8')
 # so you think that's taking all the fun out of working with string formatting?
 print "==> Doing it like this works more naturally"
 print unicode("%s" % text, 'UTF-8')
 try:
    print "==> except if the string is already unicode:"
    print unicode("%s" % utext, 'UTF-8')
 except Exception as e:
    print "sorry, no can do, %s" % e
 # can we join, slice etc. such strings?
 otext = "ü".join(text)
 print otext
 # even better, use export PYTHONIOENCODING=utf8, if you can at start up time
 # see http://stackoverflow.com/a/27066059/890242
 #---------
 # let's deal with unicode itself
 print "===> Unicode text can just be printed like that, because they are of type unicode"
 print "%s" % utext, type("%s" % utext)
 # but if you combine the two, the fun starts
 print "===> Unicode + byte text cannot just be printed like that, because they are of type unicode"
 try:
    print "%s == %s" % (utext, text), type("%s" % utext)
 except UnicodeDecodeError as e:
    print "because, well, %s" % e
 # try again
 print "===> But you can covert them all to unicode or UTF8"
 print unicode("%s == %s" % (utext, unicode(text, 'UTF-8')))
 print "%s == %s" % (utext, unicode(text, 'UTF-8'))
 try:
    print "%s == %s" % (utext.decode('UTF-8'), text.decode('UTF-8'))
 except UnicodeDecodeError as e:
    print "no so fast... %s" % e
 except Exception as e:
    print "no so fast... %s" % e
 # again
 print "===> byte strings with codes > 127 are harder. You have to DECODE to utf8"
 print text.decode('UTF-8')
 try:
    print text.encode('UTF-8')
 except Exception as e:
    print "because encode won't work, %s" % e
 print "===> while you have to ENCODE unicodes. You can't decode unicode into UTF-8."
 print "works:", utext.encode('UTF-8')
 try:
    print "doesn't:", utext.decode('UTF-8')
 except Exception as e:
     print "%s" % e
 # in summary, if you know you want
 print "==> So in summary, always use unicode(text, 'UTF-8') to ensure you have unicode text from byte code"
 print unicode(text, 'UTF-8')
 print "==> or save-get a unicode object from your inputs, whether unicode or byte string"
 _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
 #else unicode(text.encode('UTF-8', 'replace'))
 print _u(text), type(_u(text))
 print _u(utext), type(_u(utext))
 print "==> then you can mix match safely"
 text = _u(text)
 print "%s == %s" % (text, utext)
 print u"%s == %s" % (text, utext)
 # Conclusion: Always use this:
 # guarantee unicode
 _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
 _uu = lambda *tt: tuple(_u(t) for t in tt)
 # guarantee byte string in UTF8
 _u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
 _uu8 = lambda *tt: tuple(_u8(t) for t in tt)
 text='Some string with codes > 127, like Zürich'
 print "==> with _u, _uu"
 print _u(text), type(_u(text))
 print _u(utext), type(_u(utext))
 print _uu(text, utext), type(_uu(text, utext))
 print "==> with u8, uu8"
 print _u8(text), type(_u8(text))
 print _u8(utext), type(_u8(utext))
 print _uu8(text, utext), type(_uu8(text, utext))
 # with % formatting, always use _u() and _uu()
 print "Some unknown input %s" % _u(text)
 print "Multiple inputs %s, %s" % _uu(text, text)
 # but with .format be sure to always work with unicode strings
 print u"Also works with formats: {}".format(_u(text))
 print u"Also works with formats: {},{}".format(*_uu(text, text))
 # ... or use _u8 and _uu8, because string.format expects byte strings
 print "Also works with formats: {}".format(_u8(text))
 print "Also works with formats: {},{}".format(*_uu8(text, text))
diff --git a/stringext.py b/stringext.py
 # lambdas to convert into unicode and bytestring UTF8 from any basestring (unicode or str)
 #
 # Use as follows:
 #  text = u'Zürich'
 #  utext = 'Zürich'
 #  print "Text %s %s" % _uu(text,  utext)
 #  print u"Text %s" % _u(text)
 #  print "Text {},{}".format(*_uu8(text,  utext))
 #  print u"Text {},{}".format(*_uu(text,  utext))
 # 
 # 
 # guarantee unicode string
 _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
 _uu = lambda *tt: tuple(_u(t) for t in tt)
 # guarantee byte string in UTF8 encoding
 _u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
 _uu8 = lambda *tt: tuple(_u8(t) for t in tt)

 def main():
    text='Some string with codes > 127, like Zürich'
    utext=u'Some string with codes > 127, like Zürich'
    print "==> with _u, _uu"
    print _u(text), type(_u(text))
    print _u(utext), type(_u(utext))
    print _uu(text, utext), type(_uu(text, utext))
    print "==> with u8, uu8"
    print _u8(text), type(_u8(text))
    print _u8(utext), type(_u8(utext))
    print _uu8(text, utext), type(_uu8(text, utext))
    # with % formatting, always use _u() and _uu()
    print "Some unknown input %s" % _u(text)
    print "Multiple inputs %s, %s" % _uu(text, text)
    # but with .format be sure to always work with unicode strings
    print u"Also works with formats: {}".format(_u(text))
    print u"Also works with formats: {},{}".format(*_uu(text, text))
    # ... or use _u8 and _uu8, because string.format expects byte strings
    print "Also works with formats: {}".format(_u8(text))
    print "Also works with formats: {},{}".format(*_uu8(text, text))

 if __name__ == '__main__':
    main()
diff --git a/unicodemess.py b/unicodemess.py
 # that's the actual mess with unicode() in Python. It only works on either bytestring or unicode input
 # and you can't actually be sure what you get every single time
 def doit(*args):
    print "trying", args
    text = args[0]
    ttype = type(text)
    try:
        text = unicode(*args)
    except:
        print "==> DID NOT WORK: %s (type %s)"  % (text, ttype)
    else:
        print "==> %s worked (type %s)" % (text, ttype)
 # we want to have a uniform way to call unicode() for either byte string or unicode
 # doesn't work
 doit('zürich')
 doit(u'zürich', 'utf-8')
 # works
 doit(u'zürich')
 doit('zürich', 'utf-8')
 ## _u solves it by ensuring we have what we want
 doit(_u('Zürich'))
 doit(_u(u'Zürich'))
 doit(_u8(u'Zürich'), 'utf-8')
 doit(_u8('Zürich'), 'utf-8')
	# Python ascii codec can't decode and unicode mess
	#
	# check this out https://pythonhosted.org/kitchen/unicode-frustrations.html
	# and this http://www.joelonsoftware.com/articles/Unicode.html
	#
	# The short of it is this
	# 1. If you can, always set PYTHONIOENCODING=utf8 before you start your python programs.
	# 2. If you can't or you can't ensure this, always use the following lambda _u to get unicode text
	# whereever you convert to strings (str.format, str % etc.)
	#
	# So for 2. you always do this:
	# _u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
	# _uu = lambda *tt: tuple(_u(t) for t in tt)
	# # use like this
	# text='Some string with codes > 127, like Zürich'
	# print "Some unknown input %s" % _u(text)
	# print "Multiple inputs %s, %s" % _uu(text, text)
	# # or like this
	# print u"Some string with codecs > 127 {}".format(_u(text))
	# print u"Multiple inputs {}, {}".format(_uu(text, text))
	# ---
	# amazingly, you can assign UTF8 to strings, it works nicely
	print "==> We'll work with these strings"
	text = "Zürich"
	utext = u"Zürich"
	print text, type(text)
	print utext, type(utext)
	# will raise UnicodeDeocdeError
	try:
	print "==> Try as unicode(text) => error"
	print unicode(text)
	except UnicodeDecodeError as e:
	print e
	# try again
	print "==> Try as with replacing unknown characters"
	print unicode(text, errors="replace")
	# again
	print "==> Try as with ignoring unknown characters"
	print unicode(text, errors='ignore')
	# again
	print "==> Try as with UTF-8 encoding"
	print unicode(text, encoding="UTF-8")
	# but this doesn't work!!
	try:
	print "But amazingly you can't encode unicode ext into UTF8"
	print unicode(utext, encoding="UTF-8")
	except Exception as e:
	print "because %s" %e
	# be smart
	print "==> So you want unicode in string formats, huh?"
	try:
	print u"%s" % text
	except UnicodeDecodeError as e:
	print "Not so fast! %s" % e
	# let's try that again
	print "==> Now for real using string formats"
	print u"%s" % unicode(text, 'UTF-8')
	# so you think that's taking all the fun out of working with string formatting?
	print "==> Doing it like this works more naturally"
	print unicode("%s" % text, 'UTF-8')
	try:
	print "==> except if the string is already unicode:"
	print unicode("%s" % utext, 'UTF-8')
	except Exception as e:
	print "sorry, no can do, %s" % e
	# can we join, slice etc. such strings?
	otext = "ü".join(text)
	print otext
	# even better, use export PYTHONIOENCODING=utf8, if you can at start up time
	# see http://stackoverflow.com/a/27066059/890242
	#---------
	# let's deal with unicode itself
	print "===> Unicode text can just be printed like that, because they are of type unicode"
	print "%s" % utext, type("%s" % utext)
	# but if you combine the two, the fun starts
	print "===> Unicode + byte text cannot just be printed like that, because they are of type unicode"
	try:
	print "%s == %s" % (utext, text), type("%s" % utext)
	except UnicodeDecodeError as e:
	print "because, well, %s" % e
	# try again
	print "===> But you can covert them all to unicode or UTF8"
	print unicode("%s == %s" % (utext, unicode(text, 'UTF-8')))
	print "%s == %s" % (utext, unicode(text, 'UTF-8'))
	try:
	print "%s == %s" % (utext.decode('UTF-8'), text.decode('UTF-8'))
	except UnicodeDecodeError as e:
	print "no so fast... %s" % e
	except Exception as e:
	print "no so fast... %s" % e
	# again
	print "===> byte strings with codes > 127 are harder. You have to DECODE to utf8"
	print text.decode('UTF-8')
	try:
	print text.encode('UTF-8')
	except Exception as e:
	print "because encode won't work, %s" % e
	print "===> while you have to ENCODE unicodes. You can't decode unicode into UTF-8."
	print "works:", utext.encode('UTF-8')
	try:
	print "doesn't:", utext.decode('UTF-8')
	except Exception as e:
	print "%s" % e
	# in summary, if you know you want
	print "==> So in summary, always use unicode(text, 'UTF-8') to ensure you have unicode text from byte code"
	print unicode(text, 'UTF-8')
	print "==> or save-get a unicode object from your inputs, whether unicode or byte string"
	_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
	#else unicode(text.encode('UTF-8', 'replace'))
	print _u(text), type(_u(text))
	print _u(utext), type(_u(utext))
	print "==> then you can mix match safely"
	text = _u(text)
	print "%s == %s" % (text, utext)
	print u"%s == %s" % (text, utext)
	# Conclusion: Always use this:
	# guarantee unicode
	_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
	_uu = lambda *tt: tuple(_u(t) for t in tt)
	# guarantee byte string in UTF8
	_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
	_uu8 = lambda *tt: tuple(_u8(t) for t in tt)
	text='Some string with codes > 127, like Zürich'
	print "==> with _u, _uu"
	print _u(text), type(_u(text))
	print _u(utext), type(_u(utext))
	print _uu(text, utext), type(_uu(text, utext))
	print "==> with u8, uu8"
	print _u8(text), type(_u8(text))
	print _u8(utext), type(_u8(utext))
	print _uu8(text, utext), type(_uu8(text, utext))
	# with % formatting, always use _u() and _uu()
	print "Some unknown input %s" % _u(text)
	print "Multiple inputs %s, %s" % _uu(text, text)
	# but with .format be sure to always work with unicode strings
	print u"Also works with formats: {}".format(_u(text))
	print u"Also works with formats: {},{}".format(*_uu(text, text))
	# ... or use _u8 and _uu8, because string.format expects byte strings
	print "Also works with formats: {}".format(_u8(text))
	print "Also works with formats: {},{}".format(*_uu8(text, text))
	# lambdas to convert into unicode and bytestring UTF8 from any basestring (unicode or str)
	#
	# Use as follows:
	# text = u'Zürich'
	# utext = 'Zürich'
	# print "Text %s %s" % _uu(text, utext)
	# print u"Text %s" % _u(text)
	# print "Text {},{}".format(*_uu8(text, utext))
	# print u"Text {},{}".format(*_uu(text, utext))
	#
	#
	# guarantee unicode string
	_u = lambda t: t.decode('UTF-8', 'replace') if isinstance(t, str) else t
	_uu = lambda *tt: tuple(_u(t) for t in tt)
	# guarantee byte string in UTF8 encoding
	_u8 = lambda t: t.encode('UTF-8', 'replace') if isinstance(t, unicode) else t
	_uu8 = lambda *tt: tuple(_u8(t) for t in tt)

	def main():
	text='Some string with codes > 127, like Zürich'
	utext=u'Some string with codes > 127, like Zürich'
	print "==> with _u, _uu"
	print _u(text), type(_u(text))
	print _u(utext), type(_u(utext))
	print _uu(text, utext), type(_uu(text, utext))
	print "==> with u8, uu8"
	print _u8(text), type(_u8(text))
	print _u8(utext), type(_u8(utext))
	print _uu8(text, utext), type(_uu8(text, utext))
	# with % formatting, always use _u() and _uu()
	print "Some unknown input %s" % _u(text)
	print "Multiple inputs %s, %s" % _uu(text, text)
	# but with .format be sure to always work with unicode strings
	print u"Also works with formats: {}".format(_u(text))
	print u"Also works with formats: {},{}".format(*_uu(text, text))
	# ... or use _u8 and _uu8, because string.format expects byte strings
	print "Also works with formats: {}".format(_u8(text))
	print "Also works with formats: {},{}".format(*_uu8(text, text))

	if __name__ == '__main__':
	main()
	# that's the actual mess with unicode() in Python. It only works on either bytestring or unicode input
	# and you can't actually be sure what you get every single time
	def doit(*args):
	print "trying", args
	text = args[0]
	ttype = type(text)
	try:
	text = unicode(*args)
	except:
	print "==> DID NOT WORK: %s (type %s)" % (text, ttype)
	else:
	print "==> %s worked (type %s)" % (text, ttype)
	# we want to have a uniform way to call unicode() for either byte string or unicode
	# doesn't work
	doit('zürich')
	doit(u'zürich', 'utf-8')
	# works
	doit(u'zürich')
	doit('zürich', 'utf-8')
	## _u solves it by ensuring we have what we want
	doit(_u('Zürich'))
	doit(_u(u'Zürich'))
	doit(_u8(u'Zürich'), 'utf-8')
	doit(_u8('Zürich'), 'utf-8')