Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix path expander II #679

Merged
merged 12 commits into from
Dec 19, 2023
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Upcoming

### Bug fixes
* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #675](https://github.com/catalystneuro/neuroconv/pull/675)
* LocalPathExpander matches only `folder_paths` or `file_paths` if that is indicated in the passed specification. [PR #675](https://github.com/catalystneuro/neuroconv/pull/675) and [PR #679](https://github.com/catalystneuro/neuroconv/pull/679)
* Fixed depth consideration in partial chunking pattern for the ROI data buffer. [PR #677](https://github.com/catalystneuro/neuroconv/pull/677)
* Fix mapping between channel names and the electrode table when writing more than one `ElectricalSeries` to the NWBFile. This fixes an issue when the converter pipeline of `SpikeGLXConverterPipe` was writing the electrode table region of the NIDQ stream incorrectly [PR #678](https://github.com/catalystneuro/neuroconv/pull/678)

Expand Down
30 changes: 29 additions & 1 deletion src/neuroconv/tools/path_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,41 @@

class AbstractPathExpander(abc.ABC):
def extract_metadata(self, base_directory: DirectoryPath, format_: str):
    """
    Parse file paths under ``base_directory`` and yield their metadata.

    Every path returned by ``self.list_directory`` is matched against
    ``format_`` using the ``parse`` library. The format string may use either
    '/' or '\\' as a separator; both are normalized to the current OS's path
    separator before matching. Matches in which any captured field spans a
    directory boundary (i.e. contains the OS path separator) are discarded,
    which constrains each field to a single file or folder name.

    Parameters
    ----------
    base_directory : DirectoryPath
        The base directory from which to list files for metadata extraction.
        It should be a path-like object convertible to a ``pathlib.Path``.
    format_ : str
        The format string used for parsing the file paths. This string can
        represent a path in any OS format and is adjusted internally to match
        the current OS's path separator.

    Yields
    ------
    Tuple[Path, Dict[str, Any]]
        The matched file path as a ``Path`` object together with a dictionary
        of the named metadata fields extracted from it.
    """
    # Normalize separators: the first replace handles a literal backslash
    # (the doubled form is just the escape); the second maps the
    # OS-independent '/' onto the local separator.
    normalized_format = format_.replace("\\", os.sep).replace("/", os.sep)

    for filepath in self.list_directory(base_directory=Path(base_directory)):
        parsed = parse(normalized_format, filepath)
        if not parsed:
            continue
        metadata = parsed.named
        # Keep only matches whose fields stay within one path component.
        if all(os.sep not in str(value) for value in metadata.values()):
            yield filepath, metadata

@abc.abstractmethod
def list_directory(self, base_directory: DirectoryPath) -> Iterable[FilePath]:
Expand Down
145 changes: 93 additions & 52 deletions tests/test_minimal/test_tools/test_expand_paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,95 +2,136 @@
import unittest
from datetime import datetime
from pathlib import Path

import pytest
from typing import List, Tuple

from neuroconv.tools import LocalPathExpander
from neuroconv.tools.testing import generate_path_expander_demo_ibl
from neuroconv.utils import NWBMetaDataEncoder


def test_only_folder_match(tmpdir):
base_directory = Path(tmpdir)

sub_directory1 = base_directory / "a_simple_pattern_1"
sub_directory2 = base_directory / "a_simple_pattern_2"
def create_test_directories_and_files(
    base_directory: Path, directories_and_files: List[Tuple[List[str], List[str]]]
) -> None:
    """
    Build a directory/file layout for tests in an OS-portable way.

    Parameters
    ----------
    base_directory : Path
        The base directory under which all subdirectories and files will be
        created.
    directories_and_files : List[Tuple[List[str], List[str]]]
        A list where each element is a tuple. The first element of the tuple
        is a list of directory components, and the second element is a list
        of file names to be created in that directory.
    """
    for path_components, file_names in directories_and_files:
        # joinpath keeps the construction OS-independent
        target_directory = base_directory.joinpath(*path_components)
        # parents/exist_ok make repeated or overlapping layouts safe
        target_directory.mkdir(parents=True, exist_ok=True)
        for file_name in file_names:
            (target_directory / file_name).touch()

def test_only_folder_match(tmpdir):
    """A folder_path spec must match directories only, and matched fields must not span nested folders."""
    base_directory = Path(tmpdir)

    # Define the directories and files to be created
    directories_and_files = [
        (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]),  # matches
        (["subject1"], ["a_simple_pattern_file.bin"]),  # matches query but is a file
        (["subject2", "a_simple_pattern_2", "nested_directory"], []),  # match should not contain nested folder
    ]

    # Create test directories and files
    create_test_directories_and_files(base_directory, directories_and_files)

    # Specify source data (note this assumes the files are arranged in the same way as in the example data)
    source_data_spec = {
        "a_source": {
            "base_directory": base_directory,
            "folder_path": "{subject_id}/a_simple_pattern_{session_id}",
        }
    }

    path_expander = LocalPathExpander()
    matches_list = path_expander.expand_paths(source_data_spec)

    folder_paths = [match["source_data"]["a_source"]["folder_path"] for match in matches_list]
    # Note that the nested directory is not included because it does not conform to the pattern
    expected = {
        str(base_directory.joinpath("subject1", "a_simple_pattern_1")),
        str(base_directory.joinpath("subject2", "a_simple_pattern_2")),
    }
    assert set(folder_paths) == expected

    metadata_list = [match["metadata"].to_dict() for match in matches_list]
    expected_metadata = [
        {"Subject": {"subject_id": "subject1"}, "NWBFile": {"session_id": "1"}},
        {"Subject": {"subject_id": "subject2"}, "NWBFile": {"session_id": "2"}},
    ]

    # Sort both lists by subject id to ensure order is the same
    metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"])
    expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"])
    assert metadata_list == expected_metadata

def test_only_file_match(tmpdir):
    """A file_path spec must match files only, and an intermediate nested folder must break the match."""
    base_directory = Path(tmpdir)

    # Define the directories and files to be created
    directories_and_files = [
        (["subject1", "a_simple_pattern_1"], ["a_simple_pattern_1.bin"]),  # matches
        (["subject2", "a_simple_pattern_2"], ["a_simple_pattern_2.bin"]),  # matches
        (  # intermediate nested folder breaks match
            ["subject1", "intermediate_nested", "a_simple_pattern_3"],
            ["a_simple_pattern_3.bin"],
        ),
    ]

    # Create test directories and files
    create_test_directories_and_files(base_directory, directories_and_files)

    # Specify source data (note this assumes the files are arranged in the same way as in the example data)
    source_data_spec = {
        "a_source": {
            "base_directory": base_directory,
            "file_path": "{subject_id}/{a_parent_folder}/a_simple_pattern_{session_id}.bin",
        }
    }

    path_expander = LocalPathExpander()
    matches_list = path_expander.expand_paths(source_data_spec)
    file_paths = set(match["source_data"]["a_source"]["file_path"] for match in matches_list)

    # Note that the third file is not included because it does not conform to the pattern
    expected = {
        str(base_directory / "subject1" / "a_simple_pattern_1" / "a_simple_pattern_1.bin"),
        str(base_directory / "subject2" / "a_simple_pattern_2" / "a_simple_pattern_2.bin"),
    }
    assert file_paths == expected

    metadata_list = [match["metadata"].to_dict() for match in matches_list]
    expected_metadata = [
        {
            "Subject": {"subject_id": "subject1"},
            "NWBFile": {"session_id": "1"},
            "extras": {"a_parent_folder": "a_simple_pattern_1"},
        },
        {
            "Subject": {"subject_id": "subject2"},
            "NWBFile": {"session_id": "2"},
            "extras": {"a_parent_folder": "a_simple_pattern_2"},
        },
    ]

    # Sort both lists by subject id to ensure order is the same
    metadata_list = sorted(metadata_list, key=lambda x: x["Subject"]["subject_id"])
    expected_metadata = sorted(expected_metadata, key=lambda x: x["Subject"]["subject_id"])
    assert metadata_list == expected_metadata


def test_expand_paths(tmpdir):
Expand Down
Loading