Merged Unicode branch into trunk (r4952:5608). This should be fully
backwards compatible for all practical purposes.

Fixed #2391, #2489, #2996, #3322, #3344, #3370, #3406, #3432, #3454, #3492, #3582, #3690, #3878, #3891, #3937, #4039, #4141, #4227, #4286, #4291, #4300, #4452, #4702

git-svn-id: http://code.djangoproject.com/svn/django/trunk@5609 bcc190cf-cafb-0310-a4f2-bffc1f526a37
author: Malcolm Tredinnick <malcolm.tredinnick@gmail.com> 2007-07-04 12:11:04 +0000
committer: Malcolm Tredinnick <malcolm.tredinnick@gmail.com> 2007-07-04 12:11:04 +0000
commit: 953badbea5a04159adbfa970f5805c0232b6a401 (patch)
tree: 9569f74b5d382b222613a1085efd0de21937e95f /django/utils/encoding.py
parent: 4c958b15b250866b70ded7d82aa532f1e57f96ae (diff)
1 files changed, 68 insertions, 16 deletions
diff --git a/django/utils/encoding.py b/django/utils/encoding.py
index 4774fb0d26..7515d0c41b 100644
--- a/django/utils/encoding.py
+++ b/django/utils/encoding.py
@@ -1,32 +1,84 @@
+import types
+import urllib
 from django.conf import settings
 from django.utils.functional import Promise
 
-def smart_unicode(s):
+class StrAndUnicode(object):
+    """
+    A class whose __str__ returns its __unicode__ as a UTF-8 bytestring.
+
+    Useful as a mix-in.
+    """
+    def __str__(self):
+        return self.__unicode__().encode('utf-8')
+
+def smart_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Returns a unicode object representing 's'. Treats bytestrings using the
+    'encoding' codec.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
     if isinstance(s, Promise):
-        # The input is the result of a gettext_lazy() call, or similar. It will
-        # already be encoded in DEFAULT_CHARSET on evaluation and we don't want
-        # to evaluate it until render time.
-        # FIXME: This isn't totally consistent, because it eventually returns a
-        # bytestring rather than a unicode object. It works wherever we use
-        # smart_unicode() at the moment. Fixing this requires work in the
-        # i18n internals.
+        # The input is the result of a gettext_lazy() call.
+        return s
+    return force_unicode(s, encoding, strings_only, errors)
+
+def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
+    """
+    Similar to smart_unicode, except that lazy instances are resolved to
+    strings, rather than kept as lazy objects.
+
+    If strings_only is True, don't convert (some) non-string-like objects.
+    """
+    if strings_only and isinstance(s, (types.NoneType, int)):
         return s
     if not isinstance(s, basestring,):
         if hasattr(s, '__unicode__'):
             s = unicode(s)
         else:
-            s = unicode(str(s), settings.DEFAULT_CHARSET)
+            s = unicode(str(s), encoding, errors)
     elif not isinstance(s, unicode):
-        s = unicode(s, settings.DEFAULT_CHARSET)
+        s = unicode(s, encoding, errors)
     return s
 
-class StrAndUnicode(object):
+def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
     """
-    A class whose __str__ returns its __unicode__ as a bytestring
-    according to settings.DEFAULT_CHARSET.
+    Returns a bytestring version of 's', encoded as specified in 'encoding'.
 
-    Useful as a mix-in.
+    If strings_only is True, don't convert (some) non-string-like objects.
     """
-    def __str__(self):
-        return self.__unicode__().encode(settings.DEFAULT_CHARSET)
+    if strings_only and isinstance(s, (types.NoneType, int)):
+        return s
+    if isinstance(s, Promise):
+        return unicode(s).encode(encoding, errors)
+    elif not isinstance(s, basestring):
+        try:
+            return str(s)
+        except UnicodeEncodeError:
+            return unicode(s).encode(encoding, errors)
+    elif isinstance(s, unicode):
+        return s.encode(encoding, errors)
+    elif s and encoding != 'utf-8':
+        return s.decode('utf-8', errors).encode(encoding, errors)
+    else:
+        return s
+
+def iri_to_uri(iri):
+    """
+    Convert an Internationalized Resource Identifier (IRI) portion to a URI
+    portion that is suitable for inclusion in a URL.
+
+    This is the algorithm from section 3.1 of RFC 3987.  However, since we are
+    assuming input is either UTF-8 or unicode already, we can simplify things a
+    little from the full method.
+
+    Returns an ASCII string containing the encoded result.
+    """
+    # The list of safe characters here is constructed from the printable ASCII
+    # characters that are not explicitly excluded by the list at the end of
+    # section 3.1 of RFC 3987.
+    if iri is None:
+        return iri
+    return urllib.quote(smart_str(iri), safe='/#%[]=:;$&()+,!?')
author	Malcolm Tredinnick <malcolm.tredinnick@gmail.com>	2007-07-04 12:11:04 +0000
committer	Malcolm Tredinnick <malcolm.tredinnick@gmail.com>	2007-07-04 12:11:04 +0000
commit	953badbea5a04159adbfa970f5805c0232b6a401 (patch)
tree	9569f74b5d382b222613a1085efd0de21937e95f /django/utils/encoding.py
parent	4c958b15b250866b70ded7d82aa532f1e57f96ae (diff)