From 2955ee853a43fa16f760c6bb4fc9e26b06901df3 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 13:48:15 -0500 Subject: [PATCH 1/6] Implement file searching Fixes simonsobs/librarian#29 --- hera_librarian/client.py | 61 +++++++++++++++++++++- librarian_background/send_clone.py | 2 +- tests/integration_test/test_upload_file.py | 15 ++++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/hera_librarian/client.py b/hera_librarian/client.py index c2211c9..cb984b8 100644 --- a/hera_librarian/client.py +++ b/hera_librarian/client.py @@ -2,6 +2,7 @@ The public-facing LibrarianClient object. """ +from datetime import datetime from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -11,6 +12,8 @@ from .deletion import DeletionPolicy from .exceptions import LibrarianError, LibrarianHTTPError from .models.ping import PingRequest, PingResponse +from .models.search import (FileSearchRequest, FileSearchResponse, + FileSearchResponses) from .models.uploads import (UploadCompletionRequest, UploadInitiationRequest, UploadInitiationResponse) from .settings import ClientInfo @@ -206,9 +209,11 @@ def upload( local_path : Path Path of the file or directory to upload. dest_path : Path - The destination 'path' on the librarian store (often the same as your filename, but may be under some root directory). + The destination 'path' on the librarian store (often the same as your + filename, but may be under some root directory). deletion_policy : DeletionPolicy | str, optional - Whether or not this file may be deleted, by default DeletionPolicy.DISALLOWED + Whether or not this file may be deleted, by default + DeletionPolicy.DISALLOWED Returns ------- @@ -295,3 +300,55 @@ def upload( ) return + + def search_files( + self, + name: Optional[str] = None, + create_time_window: Optional[tuple[datetime, ...]] = None, + uploader: Optional[str] = None, + source: Optional[str] = None, + max_results: int = 64, + ) -> list[FileSearchResponse]: + """ + Search for files on this librarain. + + Parameters + ---------- + name : Optional[str], optional + The name o files to search for, by default None + create_time_window : Optional[tuple[datetime, ...]], optional + A time window to search files within (make sure these are UTC + times), by default None + uploader : Optional[str], optional + The person who uploaded this file, by default None + source : Optional[str], optional + The source of this file, could be another librarian, by default None + max_results : int, optional + The maximal number of results., by default 64. Note that this can be + lower as it is also set by the server. + + Returns + ------- + list[FileSearchResponse] + A list of files that match the query. + """ + + try: + response: FileSearchResponses = self.post( + endpoint="search/file", + request=FileSearchRequest( + name=name, + create_time_window=create_time_window, + uploader=uploader, + source=source, + max_results=max_results, + ), + response=FileSearchResponses, + ) + except LibrarianHTTPError as e: + if e.status_code == 404 and e.reason == "No files found.": + return [] + else: + raise e + + return response.root diff --git a/librarian_background/send_clone.py b/librarian_background/send_clone.py index 906f788..e30662c 100644 --- a/librarian_background/send_clone.py +++ b/librarian_background/send_clone.py @@ -12,7 +12,7 @@ from schedule import CancelJob from pathlib import Path -from librarian_server.database import get_session() +from librarian_server.database import get_session from librarian_server.orm import ( StoreMetadata, Instance, diff --git a/tests/integration_test/test_upload_file.py b/tests/integration_test/test_upload_file.py index 59f0423..353a579 100644 --- a/tests/integration_test/test_upload_file.py +++ b/tests/integration_test/test_upload_file.py @@ -7,6 +7,9 @@ def test_upload_simple(librarian_client, garbage_file, server): + """ + Also tests file searching. + """ # Perform the upload librarian_client.upload(garbage_file, Path("test_file")) @@ -29,6 +32,18 @@ def test_upload_simple(librarian_client, garbage_file, server): assert real_file_contents == garbage_file_contents + search_result = librarian_client.search_files(name="test_file") + + assert len(search_result) == 1 + + assert search_result[0].name == "test_file" + assert search_result[0].size == 1024 + + assert search_result[0].instances[0].path == real_file_path + + + + def test_upload_file_to_unique_directory(librarian_client, garbage_file, server): librarian_client.upload( From ce427b4e0982bf833f297e5fcad3afee467ef60b Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 17:28:46 -0500 Subject: [PATCH 2/6] First attempt at implementing file search on client --- hera_librarian/cli.py | 103 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 99 insertions(+), 4 deletions(-) diff --git a/hera_librarian/cli.py b/hera_librarian/cli.py index 7752e21..0bc51c1 100644 --- a/hera_librarian/cli.py +++ b/hera_librarian/cli.py @@ -7,12 +7,15 @@ """ import argparse +import datetime import json import os import sys import time from pathlib import Path +import dateutil.parser + from . import LibrarianClient from .exceptions import LibrarianClientRemovedFunctionality, LibrarianError from .settings import client_settings @@ -251,10 +254,75 @@ def search_files(args): Search for files in the librarian. """ - raise NotImplementedError( - "This needs to be implemented, but requires a change to the Librarian API." + if args.search is not None: + raise LibrarianClientRemovedFunctionality( + "search_files", "JSON search functionality is removed. See help." + ) + + # Create the search request + + # Start with the most complex part, parsing dates... + create_time_window = None + + if args.create_time_start is not None or args.create_time_end is not None: + create_time_window = [] + + if args.create_time_start is not None: + create_time_window.append(dateutil.parser.parse(args.create_time_start)) + else: + create_time_window.append(datetime.datetime.min) + + if args.create_time_end is not None: + create_time_window.append(dateutil.parser.parse(args.create_time_end)) + else: + create_time_window.append(datetime.datetime.max) + + create_time_window = tuple(create_time_window) + + # Perform the search + + client = LibrarianClient.from_info(client_settings.connections[args.conn_name]) + + search_response = client.search_files( + name=args.name, + create_time_window=create_time_window, + uploader=args.uploader, + source=args.source, + max_results=args.max_results, ) + if len(search_response) == 0: + print("No results found.") + return 1 + + # Print the results + for file in search_response: + print( + "\033[1m" + + f"{file.name} ({sizeof_fmt(file.size)}) - {file.create_time} - {file.uploader} - {file.source}" + + "\033[0m" + ) + + if len(file.instances) == 0: + print("No instances of this file found.") + else: + print("Instances:") + + for instance in file.instances: + print( + f" {instance.path} - {'AVAILABLE' if instance.available else 'NOT AVAILABLE'}" + ) + + if len(file.remote_instances) == 0: + print("No remote instances of this file found.") + else: + print("Remote instances:") + + for remote_instance in file.remote_instances: + print(f" {remote_instance.librarian_name}") + + return 0 + def set_file_deletion_policy(args): """ @@ -309,7 +377,6 @@ def upload(args): local_path=Path(args.local_path), dest_path=Path(args.dest_store_path), deletion_policy=args.deletion, - null_obsid=args.null_obsid, ) except ValueError as e: die("Upload failed, check paths: {}".format(e)) @@ -674,10 +741,38 @@ def config_search_files_subparser(sub_parsers): ) sp.add_argument("conn_name", metavar="CONNECTION-NAME", help=_conn_name_help) sp.add_argument( - "search", + "--search", metavar="JSON-SEARCH", help="A JSON search specification; files that match will be displayed.", + required=False, + ) + sp.add_argument( + "-n", + "--name", + default=None, + help="Only search for files with this name.", + ) + sp.add_argument( + "--create-time-start", + help="Search for files who were created after this date and time. Use a parseable date string, if no timezone is specified, UTC is assumed.", + ) + sp.add_argument( + "--create-time-end", + help="Search for files who were created before this date and time. Use a parseable date string, if no timezone is specified, UTC is assumed.", + ) + sp.add_argument( + "-u", "--uploader", help="Search for files uploaded by this uploader." ) + sp.add_argument( + "-s", "--source", help="Search for files uploaded from this source." + ) + sp.add_argument( + "--max-results", + type=int, + default=64, + help="Maximum number of results to return.", + ) + sp.set_defaults(func=search_files) return From affb6ce9f2e3264418cdebce0d0c8ea2db194e0c Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 20:28:37 -0500 Subject: [PATCH 3/6] Install dateutil --- pyproject.toml | 1 + tests/client_unit_test/test_cli.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 17fb4d1..dd062c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "requests", "schedule", "checksumdir", + "dateutil", ] authors = [ {name = "HERA Team", email = "hera@lists.berkeley.edu"}, diff --git a/tests/client_unit_test/test_cli.py b/tests/client_unit_test/test_cli.py index ea84d44..fb06339 100644 --- a/tests/client_unit_test/test_cli.py +++ b/tests/client_unit_test/test_cli.py @@ -6,12 +6,8 @@ """ - -import os - import pytest -import hera_librarian from hera_librarian import cli From 5ce5220969535637f6b2cc8719c52a3c9caec75f Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 20:31:37 -0500 Subject: [PATCH 4/6] python-dateutil not dateutil --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index dd062c0..5fff2b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "requests", "schedule", "checksumdir", - "dateutil", + "python-dateutil", ] authors = [ {name = "HERA Team", email = "hera@lists.berkeley.edu"}, From 5ceab6cd4ea2757c9a50c66364c76f7301a5a399 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 20:54:27 -0500 Subject: [PATCH 5/6] Add simple test for CLI --- hera_librarian/settings.py | 4 ++- tests/integration_test/conftest.py | 36 ++++++++++++++-------- tests/integration_test/test_search_file.py | 28 +++++++++++++++++ tests/integration_test/test_upload_file.py | 7 +---- 4 files changed, 55 insertions(+), 20 deletions(-) create mode 100644 tests/integration_test/test_search_file.py diff --git a/hera_librarian/settings.py b/hera_librarian/settings.py index 68bcbbd..1882081 100644 --- a/hera_librarian/settings.py +++ b/hera_librarian/settings.py @@ -6,7 +6,7 @@ from pathlib import Path from pydantic import BaseModel -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict from typing import TYPE_CHECKING @@ -30,6 +30,8 @@ class ClientInfo(BaseModel): class ClientSettings(BaseSettings): connections: dict[str, ClientInfo] = {} + model_config = SettingsConfigDict(env_prefix='librarian_client_') + @classmethod def from_file(cls, config_path: Path | str) -> "ClientSettings": """ diff --git a/tests/integration_test/conftest.py b/tests/integration_test/conftest.py index 02a0e96..71f545e 100644 --- a/tests/integration_test/conftest.py +++ b/tests/integration_test/conftest.py @@ -4,17 +4,10 @@ import json import os -import random import shutil -import socket -import subprocess import sys -from pathlib import Path -from socket import gethostname -from subprocess import run import pytest -from pydantic import BaseModel from xprocess import ProcessStarter from hera_librarian import LibrarianClient @@ -42,7 +35,7 @@ class Starter(ProcessStarter): for label, key in setup.env.items(): if key is None: raise ValueError(f"Environment variable {label} is None.") - + xprocess.ensure("server", Starter) setup.process = "server" @@ -76,12 +69,29 @@ def librarian_client(server) -> LibrarianClient: Returns a LibrarianClient connected to the server. """ - client = LibrarianClient( - host="http://localhost", - port=server.id, - user="test-A" - ) + client = LibrarianClient(host="http://localhost", port=server.id, user="test-A") yield client del client + + +@pytest.fixture +def librarian_client_command_line(server): + """ + Sets up the required environment variables for the command line client. + """ + + connections = json.dumps( + { + "test-A": { + "user": "test-B", + "port": server.id, + "host": "http://localhost", + } + } + ) + + os.environ["LIBRARIAN_CLIENT_CONNECTIONS"] = connections + + yield "test-A" diff --git a/tests/integration_test/test_search_file.py b/tests/integration_test/test_search_file.py new file mode 100644 index 0000000..ad42e95 --- /dev/null +++ b/tests/integration_test/test_search_file.py @@ -0,0 +1,28 @@ +""" +Tests for file searches. +""" + +import subprocess + + +def test_simple_name_search(librarian_client_command_line, garbage_file): + subprocess.call( + [ + "librarian", + "upload", + librarian_client_command_line, + garbage_file, + "test_file_for_searching", + ] + ) + + captured = subprocess.check_output( + [ + "librarian", + "search-files", + librarian_client_command_line, + "--name=test_file_for_searching", + ] + ) + + assert "test_file_for_searching" in str(captured) diff --git a/tests/integration_test/test_upload_file.py b/tests/integration_test/test_upload_file.py index 353a579..598f91a 100644 --- a/tests/integration_test/test_upload_file.py +++ b/tests/integration_test/test_upload_file.py @@ -42,13 +42,8 @@ def test_upload_simple(librarian_client, garbage_file, server): assert search_result[0].instances[0].path == real_file_path - - - def test_upload_file_to_unique_directory(librarian_client, garbage_file, server): - librarian_client.upload( - garbage_file, Path("test_directory/test_file") - ) + librarian_client.upload(garbage_file, Path("test_directory/test_file")) # Check we got it (by manually verifying) conn = sqlite3.connect(server.database) From 4be31c0ba4c7cf10ed7735b48a0b35c44b5100f3 Mon Sep 17 00:00:00 2001 From: Josh Borrow Date: Wed, 24 Jan 2024 21:20:47 -0500 Subject: [PATCH 6/6] Add tests for cli parser --- .../test_cli_parse_search_files.py | 51 +++++++++++++++++++ tests/integration_test/test_search_file.py | 3 +- 2 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/client_unit_test/test_cli_parse_search_files.py diff --git a/tests/client_unit_test/test_cli_parse_search_files.py b/tests/client_unit_test/test_cli_parse_search_files.py new file mode 100644 index 0000000..3428998 --- /dev/null +++ b/tests/client_unit_test/test_cli_parse_search_files.py @@ -0,0 +1,51 @@ +""" +Tests the search-files parser. +""" + +import datetime + +import dateutil.parser + +from hera_librarian import cli + + +def test_parser_simple_name(): + parser = cli.generate_parser() + + args = parser.parse_args( + [ + "search-files", + "fake_connection", + "--name=test_file", + ] + ) + + assert args.name == "test_file" + + +def test_parser_lots(): + parser = cli.generate_parser() + + args = parser.parse_args( + [ + "search-files", + "fake_connection", + "--name=test_file", + "--create-time-start=2020-01-01", + "--create-time-end=2020-01-02", + "--uploader=uploader", + "--source=source", + "--max-results=10", + ] + ) + + assert args.name == "test_file" + assert dateutil.parser.parse(args.create_time_start) == datetime.datetime( + year=2020, month=1, day=1 + ) + assert dateutil.parser.parse(args.create_time_end) == datetime.datetime( + year=2020, month=1, day=2 + ) + assert args.uploader == "uploader" + assert args.source == "source" + assert args.max_results == 10 diff --git a/tests/integration_test/test_search_file.py b/tests/integration_test/test_search_file.py index ad42e95..1ec7134 100644 --- a/tests/integration_test/test_search_file.py +++ b/tests/integration_test/test_search_file.py @@ -3,6 +3,7 @@ """ import subprocess +from hera_librarian import cli def test_simple_name_search(librarian_client_command_line, garbage_file): @@ -25,4 +26,4 @@ def test_simple_name_search(librarian_client_command_line, garbage_file): ] ) - assert "test_file_for_searching" in str(captured) + assert "test_file_for_searching" in str(captured) \ No newline at end of file