From 8b4b764f036d9ac945d36a1966f2eb284c4f44a3 Mon Sep 17 00:00:00 2001
From: Michael Vrable <vrable@cs.hmc.edu>
Date: Sun, 7 Sep 2014 16:37:43 -0700
Subject: [PATCH] Rework uri_encode/uri_decode to more cleanly work with
 bytes/strings.

More clearly specify the intended behavior of uri_encode and uri_decode,
and make them work both for Python 2 and Python 3.
---
 README                             |  2 +
 python/cumulus/__init__.py         | 24 ++------
 python/cumulus/rebuild_database.py |  5 +-
 python/cumulus/util.py             | 82 +++++++++++++++++++++++++++
 python/cumulus/util_test.py        | 89 ++++++++++++++++++++++++++++++
 5 files changed, 180 insertions(+), 22 deletions(-)
 create mode 100644 python/cumulus/util.py
 create mode 100644 python/cumulus/util_test.py

diff --git a/README b/README
index 543fed5..b6345cc 100644
--- a/README
+++ b/README
@@ -7,6 +7,8 @@ Dependencies:
   - libuuid (sometimes part of e2fsprogs)
   - sqlite3
   - Python (2.7 or later, or 3.2 or later)
+  - Python six, a Python 2/3 compatibility library
+    https://pypi.python.org/pypi/six
   - boto, the python interface to Amazon's Web Services (for S3 storage)
     http://code.google.com/p/boto
   - paramiko, SSH2 protocol for python (for sftp storage)
diff --git a/python/cumulus/__init__.py b/python/cumulus/__init__.py
index b351658..c53a78a 100644
--- a/python/cumulus/__init__.py
+++ b/python/cumulus/__init__.py
@@ -34,6 +34,7 @@ import itertools
 import os
 import posixpath
 import re
+import six
 import sqlite3
 import subprocess
 import sys
@@ -46,11 +47,7 @@ except ImportError:
 
 import cumulus.store
 import cumulus.store.file
-
-if sys.version < "3":
-    StringTypes = (str, unicode)
-else:
-    StringTypes = (str,)
+import cumulus.util
 
 # The largest supported snapshot format that can be understood.
 FORMAT_VERSION = (0, 11)        # Cumulus Snapshot v0.11
@@ -77,19 +74,6 @@ def to_lines(data):
     Newline markers are retained."""
     return list(codecs.iterdecode(data.splitlines(True), "utf-8"))
 
-def uri_decode(s):
-    """Decode a URI-encoded (%xx escapes) string."""
-    def hex_decode(m): return chr(int(m.group(1), 16))
-    return re.sub(r"%([0-9a-f]{2})", hex_decode, s)
-def uri_encode(s):
-    """Encode a string to URI-encoded (%xx escapes) form."""
-    def hex_encode(c):
-        if c > '+' and c < '\x7f' and c != '@':
-            return c
-        else:
-            return "%%%02x" % (ord(c),)
-    return ''.join(hex_encode(c) for c in s)
-
 class Struct:
     """A class which merely acts as a data container.
 
@@ -283,7 +267,7 @@ class BackendWrapper(object):
 
         store may either be a Store object or URL.
         """
-        if type(backend) in StringTypes:
+        if isinstance(backend, six.string_types):
             self._backend = cumulus.store.open(backend)
         else:
             self._backend = backend
@@ -567,7 +551,7 @@ class MetadataItem:
     @staticmethod
     def decode_str(s):
         """Decode a URI-encoded (%xx escapes) string."""
-        return uri_decode(s)
+        return cumulus.util.uri_decode_pathname(s)
 
     @staticmethod
     def raw_str(s):
diff --git a/python/cumulus/rebuild_database.py b/python/cumulus/rebuild_database.py
index 3f391d8..4ab580f 100755
--- a/python/cumulus/rebuild_database.py
+++ b/python/cumulus/rebuild_database.py
@@ -40,6 +40,7 @@ import tarfile
 import time
 
 import cumulus
+from cumulus import util
 
 CHECKSUM_ALGORITHM = "sha224"
 CHUNKER_PROGRAM = "cumulus-chunker-standalone"
@@ -419,8 +420,8 @@ class SegmentStateRebuilder(object):
                 data_size += tarinfo.size
                 object_count += 1
 
-        return {"segment": cumulus.uri_encode(segment_name),
-                "path": cumulus.uri_encode(relative_path),
+        return {"segment": util.uri_encode_pathname(segment_name),
+                "path": util.uri_encode_pathname(relative_path),
                 "checksum": checksum,
                 "data_size": data_size,
                 "disk_size": disk_size,
diff --git a/python/cumulus/util.py b/python/cumulus/util.py
new file mode 100644
index 0000000..d2ee68b
--- /dev/null
+++ b/python/cumulus/util.py
@@ -0,0 +1,82 @@
+# Cumulus: Efficient Filesystem Backup to the Cloud
+# Copyright (C) 2014 The Cumulus Developers
+# See the AUTHORS file for a list of contributors.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+"""Collected utility functions for use by Cumulus."""
+
+from __future__ import division, print_function, unicode_literals
+
+import re
+import six
+
+# The encoding assumed when interpreting path names.
+ENCODING="utf-8"
+
+# In both Python 2 and Python 3 pathnames are represented using the str type.
+# For Python 2, this means that converting from a bytestring to a pathname
+# is a no-op.  For Python 3, the conversion assumes a utf-8 encoding, but the
+# surrogateescape encoding error handler is used to allow other byte sequences
+# to be passed through.
+if six.PY2:
+    def bytes_to_pathname(b): return b  # Python 2 pathnames are already bytes.
+    def pathname_to_bytes(p):
+        if isinstance(p, unicode):  # Only unicode input needs encoding.
+            return p.encode(encoding=ENCODING, errors="replace")
+        else:
+            return p
+elif six.PY3:
+    def bytes_to_pathname(b):
+        """Decodes a byte string to a pathname (str).
+
+        The input is assumed to be encoded using ENCODING; undecodable bytes
+        are kept as surrogate escapes instead of raising an error."""
+        return b.decode(encoding=ENCODING, errors="surrogateescape")
+
+    def pathname_to_bytes(p):
+        """Converts a pathname (str) to bytes, reversing bytes_to_pathname.
+
+        Uses ENCODING; surrogate escapes are turned back into raw bytes."""
+        return p.encode(encoding=ENCODING, errors="surrogateescape")
+else:
+    raise AssertionError("Unsupported Python version")
+
+def uri_decode_raw(s):
+    """Decode a URI-encoded (%xx escapes) string to bytes.
+
+    The input should be a string, preferably only using ASCII characters; it
+    is converted with pathname_to_bytes before the escapes are replaced."""
+    def hex_decode(m): return six.int2byte(int(m.group(1), 16))
+    return re.sub(br"%([0-9a-fA-F]{2})", hex_decode, pathname_to_bytes(s))
+
+def uri_encode_raw(s):
+    """Encode a bytes array to a URI-encoded (%xx escapes) text string."""
+    def hex_encode(c):
+        # c is an int; kept literal when "+" < c < "\x7f" and c != "@".
+        if c > 0x2b and c < 0x7f and c != 0x40:
+            return chr(c)
+        else:
+            return "%%%02x" % c  # Any other byte becomes a %xx escape.
+
+    return "".join(hex_encode(c) for c in six.iterbytes(s))
+
+def uri_decode_pathname(s):
+    """Decodes a URI-encoded string to bytes, then to a pathname."""
+    return bytes_to_pathname(uri_decode_raw(s))
+
+def uri_encode_pathname(p):
+    """Converts a pathname to bytes, then to a URI-encoded string."""
+    return uri_encode_raw(pathname_to_bytes(p))
diff --git a/python/cumulus/util_test.py b/python/cumulus/util_test.py
new file mode 100644
index 0000000..e3c9059
--- /dev/null
+++ b/python/cumulus/util_test.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python
+# coding: utf-8
+#
+# Cumulus: Efficient Filesystem Backup to the Cloud
+# Copyright (C) 2014 The Cumulus Developers
+# See the AUTHORS file for a list of contributors.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+"""Unit tests for the cumulus.util module."""
+
+from __future__ import division, print_function, unicode_literals
+
+import six
+import unittest
+
+from cumulus import util
+
+class UtilCodecs(unittest.TestCase):
+    def test_pathnames(self):
+        self.assertEqual(util.ENCODING, "utf-8")
+        if six.PY2:  # Python 2: pathnames are plain byte strings.
+            self.assertEqual(util.bytes_to_pathname(b"ext\xc3\xa9nsion"),
+                             b"ext\xc3\xa9nsion")
+            self.assertEqual(util.pathname_to_bytes(b"ext\xc3\xa9nsion"),
+                             b"ext\xc3\xa9nsion")
+            self.assertEqual(util.pathname_to_bytes(u"exténsion"),
+                             b"ext\xc3\xa9nsion")
+        elif six.PY3:  # Python 3: pathnames are str with surrogate escapes.
+            self.assertEqual(util.bytes_to_pathname(b"ext\xc3\xa9nsion"),
+                             "exténsion")
+            self.assertEqual(util.pathname_to_bytes("exténsion"),
+                             b"ext\xc3\xa9nsion")
+            self.assertEqual(util.bytes_to_pathname(b"inv\xe1lid"),
+                             "inv\udce1lid")
+            self.assertEqual(util.pathname_to_bytes("inv\udce1lid"),
+                             b"inv\xe1lid")
+
+    def test_uri_encode_raw(self):
+        self.assertEqual(util.uri_encode_raw(b"sample ASCII"), "sample%20ASCII")
+        self.assertEqual(util.uri_encode_raw(b"sample ext\xc3\xa9nded"),
+                         "sample%20ext%c3%a9nded")
+
+    def test_uri_decode_raw(self):
+        self.assertEqual(util.uri_decode_raw("sample%20ASCII"), b"sample ASCII")
+        self.assertEqual(util.uri_decode_raw("sample%20ext%c3%a9nded"),
+                         b"sample ext\xc3\xa9nded")
+
+    def test_uri_decode_pathname(self):
+        if six.PY2:
+            self.assertEqual(util.uri_decode_pathname("sample%20ext%c3%a9nded"),
+                             b"sample ext\xc3\xa9nded")
+            self.assertEqual(util.uri_decode_pathname("sample%20exténded"),
+                             b"sample ext\xc3\xa9nded")
+            # In Python 2, non-UTF-8 sequences are just passed through as
+            # bytestrings.
+            self.assertEqual(util.uri_decode_pathname(b"inv%e1lid"),
+                             b"inv\xe1lid")
+            self.assertEqual(util.uri_decode_pathname(b"inv\xe1lid"),
+                             b"inv\xe1lid")
+        elif six.PY3:
+            self.assertEqual(util.uri_decode_pathname("sample%20ext%c3%a9nded"),
+                             "sample exténded")
+            self.assertEqual(util.uri_decode_pathname("sample%20exténded"),
+                             "sample exténded")
+            # In Python 3, non-UTF-8 sequences are represented using surrogate
+            # escapes to allow lossless conversion back to the appropriate
+            # bytestring.
+            self.assertEqual(util.uri_decode_pathname("inv%e1lid"),
+                             "inv\udce1lid")
+            self.assertEqual(
+                util.pathname_to_bytes(util.uri_decode_pathname("inv%e1lid")),
+                b"inv\xe1lid")
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
2.20.1