Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix path expander II #679

Merged
merged 12 commits into from
Dec 19, 2023
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Upcoming

### Bug fixes
* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #675](https://github.com/catalystneuro/neuroconv/pull/675)
* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #675](https://github.com/catalystneuro/neuroconv/pull/675) and [PR #679](https://github.com/catalystneuro/neuroconv/pull/679)
* Fixed depth consideration in partial chunking pattern for the ROI data buffer. [PR #677](https://github.com/catalystneuro/neuroconv/pull/677)
* Fix mapping between channel names and the electrode table when writing more than one `ElectricalSeries` to the NWBFile. This fixes an issue when the converter pipeline of `SpikeGLXConverterPipe` was writing the electrode table region of the NIDQ stream incorrectly [PR #678](https://github.com/catalystneuro/neuroconv/pull/678)

Expand Down
30 changes: 29 additions & 1 deletion src/neuroconv/tools/path_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,41 @@

class AbstractPathExpander(abc.ABC):
def extract_metadata(self, base_directory: DirectoryPath, format_: str):
    """
    Parse file paths under ``base_directory`` and yield their metadata.

    Every path returned by ``self.list_directory`` is matched against
    ``format_`` using the ``parse`` library. The format string may use either
    '/' or '\\' as a separator; both are normalized to the current OS's path
    separator before matching. Matches in which any captured field spans a
    directory boundary (i.e. contains the OS path separator) are discarded,
    which constrains each field to a single file or folder name.

    Parameters
    ----------
    base_directory : DirectoryPath
        The base directory from which to list files for metadata extraction.
        It should be a path-like object convertible to a ``pathlib.Path``.
    format_ : str
        The format string used for parsing the file paths. This string can
        represent a path in any OS format and is adjusted internally to match
        the current OS's path separator.

    Yields
    ------
    Tuple[Path, Dict[str, Any]]
        The matched file path as a ``Path`` object together with a dictionary
        of the named metadata fields extracted from it.
    """
    # Normalize separators: the first replace handles a literal backslash
    # (the doubled form is just the escape); the second maps the
    # OS-independent '/' onto the local separator.
    normalized_format = format_.replace("\\", os.sep).replace("/", os.sep)

    for filepath in self.list_directory(base_directory=Path(base_directory)):
        parsed = parse(normalized_format, filepath)
        if not parsed:
            continue
        metadata = parsed.named
        # Keep only matches whose fields stay within one path component.
        if all(os.sep not in str(value) for value in metadata.values()):
            yield filepath, metadata

@abc.abstractmethod
def list_directory(self, base_directory: DirectoryPath) -> Iterable[FilePath]:
Expand Down
145 changes: 93 additions & 52 deletions tests/test_minimal/test_tools/test_expand_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,95 +2,136 @@
import unittest
from datetime import datetime
from pathlib import Path

import pytest
from typing import List, Tuple

from neuroconv.tools import LocalPathExpander
from neuroconv.tools.testing import generate_path_expander_demo_ibl
from neuroconv.utils import NWBMetaDataEncoder


def test_only_folder_match(tmpdir):
base_directory = Path(tmpdir)

sub_directory1 = base_directory / "a_simple_pattern_1"
sub_directory2 = base_directory / "a_simple_pattern_2"
def create_test_directories_and_files(
    base_directory: Path, directories_and_files: List[Tuple[List[str], List[str]]]
) -> None:
    """
    Build a directory/file layout for tests in an OS-portable way.

    Parameters
    ----------
    base_directory : Path
        The base directory under which all subdirectories and files will be
        created.
    directories_and_files : List[Tuple[List[str], List[str]]]
        A list where each element is a tuple. The first element of the tuple
        is a list of directory components, and the second element is a list
        of file names to be created in that directory.
    """
    for path_components, file_names in directories_and_files:
        # joinpath keeps the construction OS-independent
        target_directory = base_directory.joinpath(*path_components)
        # parents/exist_ok make repeated or overlapping layouts safe
        target_directory.mkdir(parents=True, exist_ok=True)
        for file_name in file_names:
            (target_directory / file_name).touch()

def test_only_folder_match(tmpdir):
    """A folder_path spec must match directories only, and matched fields must not span nested folders."""
    base_directory = Path(tmpdir)

    # Define the directories and files to be created
    directories_and_files = [
        (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]),  # matches
        (["subject1"], ["a_simple_pattern_file.bin"]),  # matches query but is a file
        (["subject2", "a_simple_pattern_2", "nested_directory"], []),  # match should not contain nested folder
    ]

    # Create test directories and files
    create_test_directories_and_files(base_directory, directories_and_files)

    # Specify source data (note this assumes the files are arranged in the same way as in the example data)
    source_data_spec = {
        "a_source": {
            "base_directory": base_directory,
            "folder_path": "{subject_id}/a_simple_pattern_{session_id}",
        }
    }

    path_expander = LocalPathExpander()
    matches_list = path_expander.expand_paths(source_data_spec)

    folder_paths = [match["source_data"]["a_source"]["folder_path"] for match in matches_list]
    # Note that the nested directory is not included because it does not conform to the pattern
    expected = {
        str(base_directory.joinpath("subject1", "a_simple_pattern_1")),
        str(base_directory.joinpath("subject2", "a_simple_pattern_2")),
    }
    assert set(folder_paths) == expected

    metadata_list = [match["metadata"].to_dict() for match in matches_list]
    expected_metadata = [
        {"Subject": {"subject_id": "subject1"}, "NWBFile": {"session_id": "1"}},
        {"Subject": {"subject_id": "subject2"}, "NWBFile": {"session_id": "2"}},
    ]

    # Sort both lists by subject id to ensure order is the same
    metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"])
    expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"])
    assert metadata_list == expected_metadata

def test_only_file_match(tmpdir):
    """A file_path spec must match files only, and an intermediate nested folder must break the match."""
    base_directory = Path(tmpdir)

    # Define the directories and files to be created
    directories_and_files = [
        (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]),  # matches
        (["subject2", "a_simple_pattern_2"], ["a_simple_pattern_2.bin"]),  # matches
        (  # intermediate nested folder breaks match
            ["subject1", "intermediate_nested", "a_simple_pattern_3"],
            ["a_simple_pattern_3.bin"],
        ),
    ]

    # Create test directories and files
    create_test_directories_and_files(base_directory, directories_and_files)

    # Specify source data (note this assumes the files are arranged in the same way as in the example data)
    source_data_spec = {
        "a_source": {
            "base_directory": base_directory,
            "file_path": "{subject_id}/{a_parent_folder}/a_simple_pattern_{session_id}.bin",
        }
    }

    path_expander = LocalPathExpander()
    matches_list = path_expander.expand_paths(source_data_spec)
    file_paths = set(match["source_data"]["a_source"]["file_path"] for match in matches_list)

    # Note that the third file is not included because it does not conform to the pattern
    expected = {
        str(base_directory / "subject1" / "a_simple_pattern_1" / "a_simple_pattern_1.bin"),
        str(base_directory / "subject2" / "a_simple_pattern_2" / "a_simple_pattern_2.bin"),
    }
    assert file_paths == expected

    metadata_list = [match["metadata"].to_dict() for match in matches_list]
    expected_metadata = [
        {
            "Subject": {"subject_id": "subject1"},
            "NWBFile": {"session_id": "1"},
            "extras": {"a_parent_folder": "a_simple_pattern_1"},
        },
        {
            "Subject": {"subject_id": "subject2"},
            "NWBFile": {"session_id": "2"},
            "extras": {"a_parent_folder": "a_simple_pattern_2"},
        },
    ]

    # Sort both lists by subject id to ensure order is the same
    metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"])
    expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"])
    assert metadata_list == expected_metadata


def test_expand_paths(tmpdir):
Expand Down
Loading