summary | refs | log | tree | commit | diff
path: root/django/utils/encoding.py
diff options
context:
space:
mode:
author: Malcolm Tredinnick <malcolm.tredinnick@gmail.com> 2007-07-04 12:11:04 +0000
committer: Malcolm Tredinnick <malcolm.tredinnick@gmail.com> 2007-07-04 12:11:04 +0000
commit: 953badbea5a04159adbfa970f5805c0232b6a401 (patch)
tree: 9569f74b5d382b222613a1085efd0de21937e95f /django/utils/encoding.py
parent: 4c958b15b250866b70ded7d82aa532f1e57f96ae (diff)
Merged Unicode branch into trunk (r4952:5608). This should be fully
backwards compatible for all practical purposes.

Fixed #2391, #2489, #2996, #3322, #3344, #3370, #3406, #3432, #3454, #3492, #3582, #3690, #3878, #3891, #3937, #4039, #4141, #4227, #4286, #4291, #4300, #4452, #4702

git-svn-id: http://code.djangoproject.com/svn/django/trunk@5609 bcc190cf-cafb-0310-a4f2-bffc1f526a37
Diffstat (limited to 'django/utils/encoding.py')
-rw-r--r-- django/utils/encoding.py 84
1 files changed, 68 insertions, 16 deletions
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index 4774fb0d26..7515d0c41b 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,32 +1,84 @@
+import types
+import urllib
from django.conf import settings
from django.utils.functional import Promise
-def smart_unicode(s):
+class StrAndUnicode(object):
+ """
+ A class whose __str__ returns its __unicode__ as a UTF-8 bytestring.
+
+ Useful as a mix-in.
+ """
+ def __str__(self):
+ return self.__unicode__().encode('utf-8')
+
+def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Returns a unicode object representing 's'. Treats bytestrings using the
+ 'encoding' codec.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
if isinstance(s, Promise):
- # The input is the result of a gettext_lazy() call, or similar. It will
- # already be encoded in DEFAULT_CHARSET on evaluation and we don't want
- # to evaluate it until render time.
- # FIXME: This isn't totally consistent, because it eventually returns a
- # bytestring rather than a unicode object. It works wherever we use
- # smart_unicode() at the moment. Fixing this requires work in the
- # i18n internals.
+ # The input is the result of a gettext_lazy() call.
+ return s
+ return force_unicode(s, encoding, strings_only, errors)
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+ """
+ Similar to smart_unicode, except that lazy instances are resolved to
+ strings, rather than kept as lazy objects.
+
+ If strings_only is True, don't convert (some) non-string-like objects.
+ """
+ if strings_only and isinstance(s, (types.NoneType, int)):
return s
if not isinstance(s, basestring,):
if hasattr(s, '__unicode__'):
s = unicode(s)
else:
- s = unicode(str(s), settings.DEFAULT_CHARSET)
+ s = unicode(str(s), encoding, errors)
elif not isinstance(s, unicode):
- s = unicode(s, settings.DEFAULT_CHARSET)
+ s = unicode(s, encoding, errors)
return s
-class StrAndUnicode(object):
+def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
"""
- A class whose __str__ returns its __unicode__ as a bytestring
- according to settings.DEFAULT_CHARSET.
+ Returns a bytestring version of 's', encoded as specified in 'encoding'.
- Useful as a mix-in.
+ If strings_only is True, don't convert (some) non-string-like objects.
"""
- def __str__(self):
- return self.__unicode__().encode(settings.DEFAULT_CHARSET)
+ if strings_only and isinstance(s, (types.NoneType, int)):
+ return s
+ if isinstance(s, Promise):
+ return unicode(s).encode(encoding, errors)
+ elif not isinstance(s, basestring):
+ try:
+ return str(s)
+ except UnicodeEncodeError:
+ return unicode(s).encode(encoding, errors)
+ elif isinstance(s, unicode):
+ return s.encode(encoding, errors)
+ elif s and encoding != 'utf-8':
+ return s.decode('utf-8', errors).encode(encoding, errors)
+ else:
+ return s
+
+def iri_to_uri(iri):
+ """
+ Convert an Internationalized Resource Identifier (IRI) portion to a URI
+ portion that is suitable for inclusion in a URL.
+
+ This is the algorithm from section 3.1 of RFC 3987. However, since we are
+ assuming input is either UTF-8 or unicode already, we can simplify things a
+ little from the full method.
+
+ Returns an ASCII string containing the encoded result.
+ """
+ # The list of safe characters here is constructed from the printable ASCII
+ # characters that are not explicitly excluded by the list at the end of
+ # section 3.1 of RFC 3987.
+ if iri is None:
+ return iri
+ return urllib.quote(smart_str(iri), safe='/#%[]=:;$&()+,!?')