diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c9f4f4e..e571357f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## [2.1.0] - 2018-08-12 + +### Added + +- fs.glob support + ## [2.0.27] - 2018-08-05 ### Fixed @@ -159,6 +165,7 @@ No changes, pushed wrong branch to PyPi. ## [2.0.8] - 2017-08-13 ### Added + - Lstat info namespace - Link info namespace - FS.islink method diff --git a/docs/source/conf.py b/docs/source/conf.py index f3b85c68..97e58dac 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -302,3 +302,5 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False + +napoleon_include_special_with_doc = True \ No newline at end of file diff --git a/docs/source/globbing.rst b/docs/source/globbing.rst new file mode 100644 index 00000000..c72392fc --- /dev/null +++ b/docs/source/globbing.rst @@ -0,0 +1,72 @@ +.. _globbing: + +Globbing +======== + +Globbing is the process of matching paths according to the rules used +by the Unix shell. + +Generally speaking, you can think of a glob pattern as a path containing +one or more wildcard patterns, separated by forward slashes. + + +Matching Files and Directories +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In a glob pattern, a ``*`` matches any text in a filename. A ``?`` +matches any single character. A ``**`` matches any number of subdirectories, +making the glob *recursive*. If the glob pattern ends in a ``/``, it will +only match directory paths, otherwise it will match files and directories. + +.. note:: + A recursive glob requires that PyFilesystem scan a lot of files, + and can potentially be slow for large (or network based) filesystems. + +Here's a summary of glob patterns: + +``*`` + Matches all files in the current directory.
+``*.py`` + Matches all .py files in the current directory. +``*.py?`` + Matches all .pyc, .pyi, .pyo etc. files in the current directory. +``project/*.py`` + Matches all .py files in a directory called ``project``. +``*/*.py`` + Matches all .py files in any sub directory. +``**/*.py`` + Recursively matches all .py files. +``**/.git/`` + Recursively matches all the git directories. + + +Interface +~~~~~~~~~ + +PyFilesystem supports globbing via the ``glob`` attribute on every FS +instance, which is an instance of :class:`~fs.glob.BoundGlobber`. Here's +how you might use it to find all the Python files in your filesystem:: + + for match in my_fs.glob("**/*.py"): + print(f"{match.path} is {match.info.size} bytes long") + +Calling ``.glob`` with a pattern will return an iterator of +:class:`~fs.glob.GlobMatch` named tuples for each matching file or +directory. A glob match contains two attributes: ``path`` which is the +full path in the filesystem, and ``info`` which is an +:class:`fs.info.Info` info object for the matched resource. + + +Batch Methods +~~~~~~~~~~~~~ + +In addition to iterating over the results, you can also call methods on +the :class:`~fs.glob.Globber` which apply to every matched path. + +For instance, here is how you can use glob to remove all ``.pyc`` files +from a project directory:: + + >>> import fs + >>> fs.open_fs('~/projects/my_project').glob('**/*.pyc').remove() + 29 + diff --git a/docs/source/guide.rst b/docs/source/guide.rst index e6500a5c..a7c81b21 100644 --- a/docs/source/guide.rst +++ b/docs/source/guide.rst @@ -196,6 +196,20 @@ The ``walk`` attribute on FS objects is instance of a :class:`~fs.walk.BoundWalk See :ref:`walking` for more information on walking directories. +Globbing +~~~~~~~~ + +Closely related to walking a filesystem is *globbing*, which is a slightly higher level way of scanning filesystems.
Paths can be filtered by a *glob* pattern, which is similar to a wildcard (such as ``*.py``), but can match multiple levels of a directory structure. + +Here's an example of globbing, which removes all the ``.pyc`` files in your project directory:: + + >>> from fs import open_fs + >>> open_fs('~/project').glob('**/*.pyc').remove() + 62 + +See :ref:`globbing` for more information. + + Moving and Copying ~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/index.rst b/docs/source/index.rst index cd007d78..a393ff26 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,6 +17,7 @@ Contents: info.rst openers.rst walking.rst + globbing.rst builtin.rst implementers.rst extension.rst diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 634263f7..0cc860d7 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -9,6 +9,7 @@ Reference reference/copy.rst reference/enums.rst reference/errors.rst + reference/glob.rst reference/info_objects.rst reference/filesize.rst reference/mirror.rst diff --git a/docs/source/reference/glob.rst b/docs/source/reference/glob.rst new file mode 100644 index 00000000..172f2ea8 --- /dev/null +++ b/docs/source/reference/glob.rst @@ -0,0 +1,5 @@ +fs.glob +======= + +.. automodule:: fs.glob + :members: diff --git a/fs/_version.py b/fs/_version.py index 48aae0cc..1b36bd0a 100644 --- a/fs/_version.py +++ b/fs/_version.py @@ -1,3 +1,3 @@ """Version, used in module and setup.py. 
""" -__version__ = "2.0.27" +__version__ = "2.1.0" diff --git a/fs/base.py b/fs/base.py index ae35f1f0..ee5f4e17 100644 --- a/fs/base.py +++ b/fs/base.py @@ -6,34 +6,23 @@ """ -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals +from __future__ import absolute_import, print_function, unicode_literals import abc +import itertools import os import threading import time import typing -from functools import partial - from contextlib import closing -import itertools +from functools import partial import six -from . import copy -from . import errors -from . import fsencode -from . import iotools -from . import move -from . import tools -from . import walk -from . import wildcard +from . import copy, errors, fsencode, iotools, move, tools, walk, wildcard +from .glob import BoundGlobber from .mode import validate_open_mode -from .path import abspath -from .path import join -from .path import normpath +from .path import abspath, join, normpath from .time import datetime_to_epoch from .walk import Walker @@ -108,6 +97,12 @@ def __exit__( """ self.close() + @property + def glob(self): + """`~fs.glob.BoundGlobber`: a globber object.. + """ + return BoundGlobber(self) + @property def walk(self): # type: (_F) -> BoundWalker[_F] diff --git a/fs/copy.py b/fs/copy.py index b8eeae73..8f04a291 100644 --- a/fs/copy.py +++ b/fs/copy.py @@ -71,7 +71,7 @@ def copy_fs_if_newer( on_copy (callable):A function callback called after a single file copy is executed. Expected signature is ``(src_fs, src_path, dst_fs, dst_path)``. - workers (int): Use `worker` threads to copy data, or ``0`` (default) for + workers (int): Use ``worker`` threads to copy data, or ``0`` (default) for a single-threaded copy. """ @@ -269,7 +269,7 @@ def copy_dir( on_copy (callable, optional): A function callback called after a single file copy is executed. Expected signature is ``(src_fs, src_path, dst_fs, dst_path)``. 
- workers (int): Use `worker` threads to copy data, or ``0`` (default) for + workers (int): Use ``worker`` threads to copy data, or ``0`` (default) for a single-threaded copy. """ @@ -330,7 +330,7 @@ def copy_dir_if_newer( on_copy (callable, optional): A function callback called after a single file copy is executed. Expected signature is ``(src_fs, src_path, dst_fs, dst_path)``. - workers (int): Use `worker` threads to copy data, or ``0`` (default) for + workers (int): Use ``worker`` threads to copy data, or ``0`` (default) for a single-threaded copy. """ diff --git a/fs/glob.py b/fs/glob.py new file mode 100644 index 00000000..09927952 --- /dev/null +++ b/fs/glob.py @@ -0,0 +1,285 @@ +from __future__ import unicode_literals + +from collections import namedtuple +from typing import Iterator, List +import re + +from .lrucache import LRUCache +from ._repr import make_repr +from .path import iteratepath +from . import wildcard + + +_PATTERN_CACHE = LRUCache( + 1000 +) # type: LRUCache[Tuple[Text, bool], Tuple[int, bool, Pattern]] + +GlobMatch = namedtuple('GlobMatch', ["path", "info"]) +Counts = namedtuple("Counts", ["files", "directories", "data"]) +LineCounts = namedtuple("LineCounts", ["lines", "non_blank"]) + +if False: # typing.TYPE_CHECKING + from typing import Iterator, List, Optional, Tuple + from .base import FS + from .info import Info + + +def _translate_glob(pattern, case_sensitive=True): + levels = 0 + recursive = False + re_patterns = [""] + for component in iteratepath(pattern): + if component == "**": + re_patterns.append(".*/?") + recursive = True + else: + re_patterns.append( + "/" + wildcard._translate(component, case_sensitive=case_sensitive) + ) + levels += 1 + re_glob = "(?ms)^" + "".join(re_patterns) + ("/$" if pattern.endswith("/") else "$") + return ( + levels, + recursive, + re.compile(re_glob, 0 if case_sensitive else re.IGNORECASE), + ) + + +def match(pattern, path): + # type: (str, str) -> bool + """Compare a glob pattern with a path (case 
sensitive). + + Arguments: + pattern (str): A glob pattern. + path (str): A path. + + Returns: + bool: ``True`` if the path matches the pattern. + + Example: + + >>> from fs.glob import match + >>> match("**/*.py", "/fs/glob.py") + True + + """ + try: + levels, recursive, re_pattern = _PATTERN_CACHE[(pattern, True)] + except KeyError: + levels, recursive, re_pattern = _translate_glob(pattern, case_sensitive=True) + _PATTERN_CACHE[(pattern, True)] = (levels, recursive, re_pattern) + return bool(re_pattern.match(path)) + + +def imatch(pattern, path): + # type: (str, str) -> bool + """Compare a glob pattern with a path (case insensitive). + + Arguments: + pattern (str): A glob pattern. + path (str): A path. + + Returns: + bool: ``True`` if the path matches the pattern. + + """ + try: + levels, recursive, re_pattern = _PATTERN_CACHE[(pattern, False)] + except KeyError: + levels, recursive, re_pattern = _translate_glob(pattern, case_sensitive=False) + _PATTERN_CACHE[(pattern, False)] = (levels, recursive, re_pattern) + return bool(re_pattern.match(path)) + + +class Globber(object): + """A generator of glob results. + + Arguments: + fs (~fs.base.FS): A filesystem object + pattern (str): A glob pattern, e.g. ``"**/*.py"`` + path (str): A path to a directory in the filesystem. + namespaces (list): A list of additional info namespaces. + case_sensitive (bool): If ``True``, the path matching will be + case *sensitive* i.e. ``"FOO.py"`` and ``"foo.py"`` will + be different, otherwise path matching will be case *insensitive*. + exclude_dirs (list): A list of patterns to exclude when searching, + e.g. ``["*.git"]``.
+ + """ + + def __init__( + self, + fs, + pattern, + path="/", + namespaces=None, + case_sensitive=True, + exclude_dirs=None, + ): + # type: (FS, str, str, Optional[List[str]], bool, Optional[List[str]]) -> None + self.fs = fs + self.pattern = pattern + self.path = path + self.namespaces = namespaces + self.case_sensitive = case_sensitive + self.exclude_dirs = exclude_dirs + + def __repr__(self): + return make_repr( + self.__class__.__name__, + self.fs, + self.pattern, + path=(self.path, "/"), + namespaces=(self.namespaces, None), + case_sensitive=(self.case_sensitive, True), + exclude_dirs=(self.exclude_dirs, None), + ) + + def _make_iter(self, search="breadth", namespaces=None): + # type: (str, Optional[List[str]]) -> Iterator[GlobMatch] + key = (self.pattern, self.case_sensitive) + try: + levels, recursive, re_pattern = _PATTERN_CACHE[key] + except KeyError: + # Cache the translation so later iterations reuse the compiled + # regex, as match() and imatch() already do. + spec = _translate_glob(self.pattern, case_sensitive=self.case_sensitive) + levels, recursive, re_pattern = _PATTERN_CACHE[key] = spec + + for path, info in self.fs.walk.info( + path=self.path, + namespaces=namespaces or self.namespaces, + max_depth=None if recursive else levels, + search=search, + exclude_dirs=self.exclude_dirs, + ): + if info.is_dir: + path += "/" + if re_pattern.match(path): + yield GlobMatch(path, info) + + def __iter__(self): + # type: () -> Iterator[GlobMatch] + """An iterator of :class:`fs.glob.GlobMatch` objects.""" + return self._make_iter() + + def count(self): + # type: () -> Counts + """Count files / directories / data in matched paths. + + Example: + >>> import fs + >>> fs.open_fs('~/projects').glob('**/*.py').count() + Counts(files=18519, directories=0, data=206690458) + + Returns: + `~Counts`: A named tuple containing results.
+ + """ + directories = 0 + files = 0 + data = 0 + for path, info in self._make_iter(namespaces=["details"]): + if info.is_dir: + directories += 1 + else: + files += 1 + data += info.size + return Counts(directories=directories, files=files, data=data) + + def count_lines(self): + # type: () -> LineCounts + """Count the lines in the matched files. + + Returns: + `~LineCounts`: A named tuple containing line counts. + + Example: + >>> import fs + >>> fs.open_fs('~/projects').glob('**/*.py').count_lines() + LineCounts(lines=5767102, non_blank=4915110) + + """ + lines = 0 + non_blank = 0 + for path, info in self._make_iter(): + if info.is_file: + with self.fs.open(path, "rb") as binary_file: + for line in binary_file: + lines += 1 + if line.rstrip(): + non_blank += 1 + return LineCounts(lines=lines, non_blank=non_blank) + + def remove(self): + # type: () -> int + """Remove all matched paths. + + Returns: + int: Number of files and directories removed. + + Example: + >>> import fs + >>> fs.open_fs('~/projects/my_project').glob('**/*.pyc').remove() + 29 + + """ + removes = 0 + for path, info in self._make_iter(search="depth"): + if info.is_dir: + self.fs.removetree(path) + else: + self.fs.remove(path) + removes += 1 + return removes + + +class BoundGlobber(object): + """A :class:`~Globber` object bound to a filesystem. + + An instance of this object is available on every Filesystem object + as ``.glob``. + + Arguments: + fs (FS): A filesystem object. + + """ + + __slots__ = ["fs"] + + def __init__(self, fs): + # type: (FS) -> None + self.fs = fs + + def __repr__(self): + return make_repr(self.__class__.__name__, self.fs) + + def __call__( + self, pattern, path="/", namespaces=None, case_sensitive=True, exclude_dirs=None + ): + # type: (str, str, Optional[List[str]], bool, Optional[List[str]]) -> Globber + """Match resources on the bound filesystem against a glob pattern. + + Arguments: + pattern (str): A glob pattern, e.g. ``"**/*.py"`` + namespaces (list): A list of additional info namespaces.
+ case_sensitive (bool): If ``True``, the path matching will be + case *sensitive* i.e. ``"FOO.py"`` and ``"foo.py"`` will + be different, otherwise path matching will be case **insensitive**. + exclude_dirs (list): A list of patterns to exclude when searching, + e.g. ``["*.git"]``. + + Returns: + `~Globber`: + An object that may be queried for the glob matches. + + + """ + return Globber( + self.fs, + pattern, + path, + namespaces=namespaces, + case_sensitive=case_sensitive, + exclude_dirs=exclude_dirs, + ) diff --git a/fs/test.py b/fs/test.py index 6c4b413f..04bbbb46 100644 --- a/fs/test.py +++ b/fs/test.py @@ -22,6 +22,7 @@ from fs import ResourceType, Seek from fs import errors from fs import walk +from fs import glob from fs.opener import open_fs from fs.subfs import ClosingSubFS, SubFS @@ -1796,3 +1797,9 @@ def test_case_sensitive(self): self.assert_isdir("foo") self.assert_isdir("Foo") self.assert_isfile("fOO") + + def test_glob(self): + self.assertIsInstance( + self.fs.glob, + glob.BoundGlobber + ) diff --git a/fs/wildcard.py b/fs/wildcard.py index 1ddcb350..b9f58591 100644 --- a/fs/wildcard.py +++ b/fs/wildcard.py @@ -2,20 +2,20 @@ """ # Adapted from https://hg.python.org/cpython/file/2.7/Lib/fnmatch.py -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function import re import typing from functools import partial from .lrucache import LRUCache +from . 
import path if False: # typing.TYPE_CHECKING from typing import Callable, Iterable, MutableMapping, Text, Tuple, Pattern -_MAXCACHE = 1000 -_PATTERN_CACHE = LRUCache(_MAXCACHE) # type: LRUCache[Tuple[Text, bool], Pattern] +_PATTERN_CACHE = LRUCache(1000) # type: LRUCache[Tuple[Text, bool], Pattern] def match(pattern, name): @@ -33,7 +33,7 @@ def match(pattern, name): try: re_pat = _PATTERN_CACHE[(pattern, True)] except KeyError: - res = _translate(pattern) + res = "(?ms)" + _translate(pattern) + r'\Z' _PATTERN_CACHE[(pattern, True)] = re_pat = re.compile(res) return re_pat.match(name) is not None @@ -53,7 +53,7 @@ def imatch(pattern, name): try: re_pat = _PATTERN_CACHE[(pattern, False)] except KeyError: - res = _translate(pattern, case_sensitive=False) + res = "(?ms)" + _translate(pattern, case_sensitive=False) + r'\Z' _PATTERN_CACHE[(pattern, False)] = re_pat = re.compile(res, re.IGNORECASE) return re_pat.match(name) is not None @@ -105,7 +105,7 @@ def get_matcher(patterns, case_sensitive): Arguments: patterns (list): A list of wildcard pattern. e.g. ``["*.py", "*.pyc"]`` - case_sensitive (bool): If `True`, then the callable will be case + case_sensitive (bool): If ``True``, then the callable will be case sensitive, otherwise it will be case insensitive. Returns: @@ -152,7 +152,7 @@ def _translate(pattern, case_sensitive=True): c = pattern[i] i = i + 1 if c == "*": - res = res + ".*" + res = res + "[^/]*" elif c == "?": res = res + "." 
elif c == "[": @@ -175,4 +175,4 @@ def _translate(pattern, case_sensitive=True): res = "%s[%s]" % (res, stuff) else: res = res + re.escape(c) - return res + "\Z(?ms)" + return res diff --git a/requirements.txt b/requirements.txt index 668c6793..0b0438f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -appdirs==1.4.0 +appdirs~=1.4.3 backports.os==0.1.1; python_version == '2.7' enum34==1.1.6 ; python_version < '3.4' pytz diff --git a/tests/test_copy.py b/tests/test_copy.py index f0958bcc..17e6e0da 100644 --- a/tests/test_copy.py +++ b/tests/test_copy.py @@ -29,6 +29,12 @@ def test_copy_fs(self): self.assertTrue(dst_fs.isdir("foo/bar")) self.assertTrue(dst_fs.isfile("test.txt")) + def test_copy_value_error(self): + src_fs = open_fs("mem://") + dst_fs = open_fs("mem://") + with self.assertRaises(ValueError): + fs.copy.copy_fs(src_fs, dst_fs, workers=-1) + def test_copy_dir(self): src_fs = open_fs("mem://") src_fs.makedirs("foo/bar") diff --git a/tests/test_errors.py b/tests/test_errors.py index 8599fb85..0b78fd15 100644 --- a/tests/test_errors.py +++ b/tests/test_errors.py @@ -30,6 +30,7 @@ def test_raise_in_multiprocessing(self): [errors.NoURL, "some_path", "some_purpose"], [errors.Unsupported], [errors.IllegalBackReference, "path"], + [errors.MissingInfoNamespace, "path"] ] try: pool = multiprocessing.Pool(1) diff --git a/tests/test_glob.py b/tests/test_glob.py new file mode 100644 index 00000000..54719b3d --- /dev/null +++ b/tests/test_glob.py @@ -0,0 +1,99 @@ +from __future__ import unicode_literals + +import unittest + +from fs import glob +from fs import open_fs + + +class TestGlob(unittest.TestCase): + def setUp(self): + fs = self.fs = open_fs("mem://") + fs.settext("foo.py", "Hello, World") + fs.touch("bar.py") + fs.touch("baz.py") + fs.makedirs("egg") + fs.settext("egg/foo.py", "from fs import open_fs") + fs.touch("egg/foo.pyc") + fs.makedirs("a/b/c/").settext("foo.py", "import fs") + repr(fs.glob) + + def test_match(self): + tests = [ + 
("*.?y", "/test.py", True), + ("*.py", "/test.py", True), + ("*.py", "/test.pc", False), + ("*.py", "/foo/test.py", False), + ("foo/*.py", "/foo/test.py", True), + ("foo/*.py", "/bar/foo/test.py", False), + ("?oo/*.py", "/foo/test.py", True), + ("*/*.py", "/foo/test.py", True), + ("foo/*.py", "/bar/foo/test.py", False), + ("**/foo/*.py", "/bar/foo/test.py", True), + ("foo/**/bar/*.py", "/foo/bar/test.py", True), + ("foo/**/bar/*.py", "/foo/baz/egg/bar/test.py", True), + ("foo/**/bar/*.py", "/foo/baz/egg/bar/egg/test.py", False), + ("**", "/test.py", True), + ("**", "/test", True), + ("**", "/test/", True), + ("**/", "/test/", True), + ("**/", "/test.py", False), + ] + for pattern, path, expected in tests: + self.assertEqual(glob.match(pattern, path), expected) + # Run a second time to test cache + for pattern, path, expected in tests: + self.assertEqual(glob.match(pattern, path), expected) + + def test_count_1dir(self): + globber = glob.BoundGlobber(self.fs) + counts = globber("*.py").count() + self.assertEqual(counts, glob.Counts(files=3, directories=0, data=12)) + repr(globber("*.py")) + + def test_count_2dir(self): + globber = glob.BoundGlobber(self.fs) + counts = globber("*/*.py").count() + self.assertEqual(counts, glob.Counts(files=1, directories=0, data=22)) + + def test_count_recurse_dir(self): + globber = glob.BoundGlobber(self.fs) + counts = globber("**/*.py").count() + self.assertEqual(counts, glob.Counts(files=5, directories=0, data=43)) + + def test_count_lines(self): + globber = glob.BoundGlobber(self.fs) + line_counts = globber("**/*.py").count_lines() + self.assertEqual(line_counts, glob.LineCounts(lines=3, non_blank=3)) + + def test_count_dirs(self): + globber = glob.BoundGlobber(self.fs) + counts = globber("**/?/").count() + self.assertEqual(counts, glob.Counts(files=0, directories=3, data=0)) + + def test_count_all(self): + globber = glob.BoundGlobber(self.fs) + counts = globber("**").count() + self.assertEqual(counts, glob.Counts(files=6, 
directories=4, data=43)) + counts = globber("**/").count() + self.assertEqual(counts, glob.Counts(files=0, directories=4, data=0)) + + def test_remove(self): + globber = glob.BoundGlobber(self.fs) + self.assertTrue(self.fs.exists("egg/foo.pyc")) + removed_count = globber("**/*.pyc").remove() + self.assertEqual(removed_count, 1) + self.assertFalse(self.fs.exists("egg/foo.pyc")) + + def test_remove_dir(self): + globber = glob.BoundGlobber(self.fs) + self.assertTrue(self.fs.exists("egg/foo.pyc")) + removed_count = globber("**/?/").remove() + self.assertEqual(removed_count, 3) + self.assertFalse(self.fs.exists("a")) + self.assertTrue(self.fs.exists("egg")) + + def test_remove_all(self): + globber = glob.BoundGlobber(self.fs) + globber("**").remove() + self.assertEqual(sorted(self.fs.listdir("/")), [])