turicas · cuducos · Jul 5, 2017 · Sep 14, 2017 · Sep 14, 2017 · Sep 14, 2017
diff --git a/rows/utils.py b/rows/utils.py
@@ -17,7 +17,9 @@
 
 from __future__ import unicode_literals
 
+import bz2
 import cgi
+import gzip
 import mimetypes
 import os
 import tempfile
@@ -27,6 +29,11 @@
 except ImportError:
     from urllib.parse import urlparse  # Python 3
 
+try:
+    import lzma
+except ImportError:
+    lzma = None
+
 try:
     import magic
 except ImportError:
@@ -158,14 +165,7 @@ def plugin_name_by_mime_type(mime_type, mime_name, file_extension):
             None)
 
 
-def detect_local_source(path, content, mime_type=None, encoding=None):
-
-    # TODO: may add sample_size
-
-    filename = os.path.basename(path)
-    parts = filename.split('.')
-    extension = parts[-1] if len(parts) > 1 else None
-
+def describe_file_type(filename, content, mime_type=None, encoding=None):
     if magic is not None:
         detected = magic.detect_from_content(content)
         encoding = detected.encoding or encoding
@@ -177,6 +177,19 @@ def detect_local_source(path, content, mime_type=None, encoding=None):
         mime_name = None
         mime_type = mime_type or mimetypes.guess_type(filename)[0]
 
+    return mime_type, encoding, mime_name
+
+def detect_local_source(path, content, mime_type=None, encoding=None):
+    # TODO: may add sample_size
+
+    filename = os.path.basename(path)
+    parts = filename.split('.')
+    extension = parts[-1] if len(parts) > 1 else None
+
+    args = (filename, content)
+    kwargs = dict(mime_type=mime_type, encoding=encoding)
+    mime_type, encoding, mime_name = describe_file_type(*args, **kwargs)
+
     plugin_name = plugin_name_by_mime_type(mime_type, mime_name, extension)
     if encoding == 'binary':
         encoding = None
@@ -297,3 +310,53 @@ def export_to_uri(table, uri, *args, **kwargs):
         raise ValueError('Plugin (export) "{}" not found'.format(plugin_name))
 
     return export_function(table, uri, *args, **kwargs)
+
+
+def decompress(path, **kwargs):
+    """
+    Given a bz2, gzip or lzma file returns a decompressed file object. All
+    kwargs are passed to either `bz2.open`, `gzip.open` or `lzma.open`.
+    The returned file object is open: the caller is responsible for closing it.
+    :param path: (str) path to a bz2, gzip or lzma file
+    :raises RuntimeError: unknown format, or lzma module not available
+    """
+    filename = os.path.basename(path)
+    with open(path, 'rb') as handler:
+        sample = handler.read(4096)  # the header is enough for detection
+    mime_type = describe_file_type(filename, sample)[0]
+
+    bz2_mime_types = (
+        'application/bzip2',
+        'application/x-bz2',
+        'application/x-bzip',
+        'application/x-bzip2',
+        'application/x-stuffit',
+    )
+    gzip_mime_types = (
+        'application/gzip',
+        'application/x-gzip',
+        'application/x-gunzip',
+        'application/gzipped',
+        'application/gzip-compressed',
+        'application/x-compress',
+        'gzip/document',
+    )
+    lzma_mime_types = ('application/x-xz', 'application/x-lzma')
+    # These fit more than one codec (or none), so check the signature instead
+    generic = (None, 'application/octet-stream', 'application/x-compressed')
+    is_generic = mime_type in generic
+    open_compressed = None
+    if mime_type in bz2_mime_types or (is_generic and sample[:3] == b'BZh'):
+        open_compressed = getattr(bz2, 'open', bz2.BZ2File)  # for Python 2
+    elif mime_type in gzip_mime_types or (
+            is_generic and sample[:2] == b'\x1f\x8b'):
+        open_compressed = gzip.open
+    elif lzma and (mime_type in lzma_mime_types or (
+            is_generic and sample[:6] == b'\xfd7zXZ\x00')):
+        open_compressed = lzma.open
+
+    if not open_compressed:
+        msg = "Couldn't identify file mimetype, or lzma module isn't available"
+        raise RuntimeError(msg)
+
+    return open_compressed(path, **kwargs)  # left open for the caller
diff --git a/tests/tests_utils.py b/tests/tests_utils.py
@@ -17,9 +17,19 @@
 
 from __future__ import unicode_literals
 
+import bz2
+import gzip
+import os
 import tempfile
 import unittest
 
+try:
+    import lzma
+except ImportError:
+    lzma = None
+
+import six
+
 import rows.utils
 
 import tests.utils as utils
@@ -71,3 +81,48 @@ def test_local_file_sample_size(self):
 # TODO: test normalize_mime_type
 # TODO: test plugin_name_by_mime_type
 # TODO: test plugin_name_by_uri
+
+
+class UtilsDecompressTestCase(unittest.TestCase):
+    """Round-trip tests for `rows.utils.decompress`."""
+
+    def setUp(self):
+        self.contents = six.b('Ahoy')
+        # Fresh directory per test, removed again in `tearDown`
+        self.temp = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        self.temp.cleanup()
+
+    def assert_decompresses(self, open_compressed, filename):
+        """
+        Write `self.contents` to `filename` using `open_compressed` and check
+        that `rows.utils.decompress` reads the same bytes back.
+        """
+        compressed = os.path.join(self.temp.name, filename)
+        # Write mode must be explicit: these `open` functions default to 'rb'
+        with open_compressed(compressed, mode='wb') as compressed_handler:
+            compressed_handler.write(self.contents)
+        with rows.utils.decompress(compressed) as decompressed:
+            self.assertEqual(self.contents, decompressed.read())
+
+    def test_decompress_with_bz2(self):
+        self.assert_decompresses(bz2.open, 'test.bz2')
+
+    def test_decompress_with_gz(self):
+        self.assert_decompresses(gzip.open, 'test.gz')
+
+    @unittest.skipIf(not lzma, 'lzma module not available')
+    def test_decompress_with_lzma(self):
+        # NOTE: `lzma.open` writes the xz container unless `format` is given
+        self.assert_decompresses(lzma.open, 'test.lzma')
+
+    @unittest.skipIf(not lzma, 'lzma module not available')
+    def test_decompress_with_xz(self):
+        self.assert_decompresses(lzma.open, 'test.xz')
+
+    def test_decompress_with_incompatible_file(self):
+        # A freshly created temporary file is empty, so no format matches
+        with self.assertRaises(RuntimeError):
+            with tempfile.NamedTemporaryFile() as tmp:
+                rows.utils.decompress(tmp.name)