Clean up build of input index file #234

Open · wants to merge 3 commits into master
17 changes: 0 additions & 17 deletions gammacat/checks.py
@@ -17,20 +17,6 @@

log = logging.getLogger(__name__)


# TODO: put this in a better place?
def check_info_yaml():
"""Check the info.yaml files in input/data"""
from gammacat.utils import load_yaml, validate_schema

schema = load_yaml('input/schemas/dataset_info.schema.yaml')

for path in Path('input/data').glob('*/*/info.yaml'):
print(f'Checking: {path}')
data = load_yaml(path)
validate_schema(path=path, data=data, schema=schema)


class CheckerConfig:
"""Config for Checker"""

@@ -86,9 +72,6 @@ def check_all(self):
def check_input(self):
log.info('Run checks: input')
self.input_data.validate()
check_info_yaml()
print()
print(self.input_data)

def check_collection(self):
log.info('Run checks: collection')
70 changes: 40 additions & 30 deletions gammacat/collection.py
@@ -12,7 +12,7 @@
from .lightcurve import LightCurve
from .dataset import DataSet
from .info import gammacat_info, GammaCatStr
from .input import InputData
from .input import InputData, InputInfoCollection
from .utils import write_json, load_json, log_list_difference, load_yaml

__all__ = [
@@ -199,6 +199,45 @@ def validate_list_of_files(self):
expected_files = expected_files_extra + expected_files_sed
log_list_difference(actual, expected_files)

class InputCollection:

def __init__ (self, config):
self.config = config
self.data = InputData.read()
self.info_files = InputInfoCollection.read()
Collaborator Author: This line creates a list of all info files present in the input folder (step 1 in the comment).

def run(self):
self._validate_info_files()
Collaborator Author: Here, all info files present in the input folder are validated (step 2 in the comment).

self._make_index_file_for_input()
Collaborator Author: This line creates the input index file and stores it (step 3 in the comment).


def _validate_info_files(self):
self.info_files.validate()

def _make_index_file_for_input(self):
resources = []
for info_filename in self.info_files.to_list():
info_data = load_yaml(info_filename)
if info_data['data_entry']['status'] == 'missing':
continue
# TODO: Decide which datasets are copied to output collection by the keywords in
# 'status' and 'reviewed' in info.yaml
# e.g if info_data['data_entry']['status'] == 'complete':
for dataset in info_data['datasets']:
resource = GammaCatResource(0, 'empty')
if dataset.endswith('yaml'):
resource = DataSet.read(info_filename.parent / dataset).resource
elif dataset.endswith('ecsv'):
if 'lc' in dataset:
resource = LightCurve.read(info_filename.parent / dataset).resource
elif 'sed' in dataset:
resource = SED.read(info_filename.parent / dataset).resource
resource.location = str(info_filename.parent.relative_to(self.config.in_path) / dataset)
resources.append(resource)

self.index = GammaCatResourceIndex(resources).sort()
Collaborator Author: I am not sure whether defining self.index as a class instance variable here is Pythonic. Currently self.index is not used anywhere else, but I think we will handle a lot of things via this index in the future, so it is nice to have it as an instance variable.

Contributor: If it's not used from elsewhere at the moment, it's better to keep it as a local variable only.


path = self.config.index_input_json
write_json(self.index.to_list(), path)
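Picking up the review thread just above, here is a minimal sketch of the suggested variant, with the sorted index kept as a local variable and nothing stored on the instance until another caller actually needs it (names follow this diff; the resource-building loop is elided because it is identical to the one shown above):

    def _make_index_file_for_input(self):
        resources = []
        for info_filename in self.info_files.to_list():
            # ... append GammaCatResource objects exactly as in the loop above ...
            pass
        # Local variable instead of self.index, as suggested in the review.
        index = GammaCatResourceIndex(resources).sort()
        write_json(index.to_list(), self.config.index_input_json)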

class CollectionMaker:
"""Make gamma-cat data collection (from the input files)."""
@@ -212,8 +251,6 @@ def run(self):
step = self.config.step
if step == 'all':
self.process_all()
elif step == 'input-index':
Collaborator Author: This branch no longer works because the 'input-index' step is not handled in this class anymore.

self.make_index_file_for_input()
elif step == 'source-info':
self.process_src_info()
elif step == 'dataset':
@@ -233,7 +270,6 @@ def input_data(self):
return InputData.read()

def process_all(self):
self.make_index_file_for_input()
Collaborator Author: Same as above: this is not part of this class anymore.


self.process_src_info()
self.process_datasets()
@@ -290,32 +326,6 @@ def process_datasets(self):
path.parent.mkdir(parents=True, exist_ok=True)
dataset.write(path)

def make_index_file_for_input(self):
resources = []
for info_filename in self.input_data.info_yaml_list:
info_data = load_yaml(info_filename)
if info_data['data_entry']['status'] == 'missing':
continue
# TODO: Decide which datasets are copied to output collection by the keywords in
# 'status' and 'reviewed' in info.yaml
# e.g if info_data['data_entry']['status'] == 'complete':
for dataset in info_data['datasets']:
resource = GammaCatResource(0, 'empty')
if dataset.endswith('yaml'):
resource = DataSet.read(info_filename.parent / dataset).resource
elif dataset.endswith('ecsv'):
if 'lc' in dataset:
resource = LightCurve.read(info_filename.parent / dataset).resource
elif 'sed' in dataset:
resource = SED.read(info_filename.parent / dataset).resource
resource.location = str(info_filename.parent.relative_to(self.config.in_path) / dataset)
resources.append(resource)

ri = GammaCatResourceIndex(resources).sort()

path = self.config.index_input_json
write_json(ri.to_list(), path)

def make_index_file_for_output(self):

# input and output should be consistent, modulo known differences
69 changes: 61 additions & 8 deletions gammacat/input.py
@@ -216,6 +216,53 @@ def validate(self):
log.info('Validating YAML files in `input/sources`')
[_.validate() for _ in self.data]

class InputInfo:
"""All basic info about the data of a single publication.
"""

schema = load_yaml(gammacat_info.base_dir / 'input/schemas/dataset_info.schema.yaml')

def __init__(self, data, path):
self.data = data
self.path = path

@classmethod
def read(cls, path):
path = Path(path)
data = load_yaml(path)
return cls(data=data, path=path)

def validate(self):
validate_schema(path=self.path, data=self.data, schema=self.schema)

class InputInfoCollection:
"""List of InputInfo objects.
"""

def __init__(self, data):
self.data = data

@classmethod
def read(cls):
path = gammacat_info.base_dir / 'input/data'
paths = path.glob('*/*/info.yaml')

data = []
for path in paths:
info = InputInfo.read(path)
data.append(info)

return cls(data=data)

def to_list(self):
data = []
for infodata in self.data:
data.append(infodata.path)
return data

def validate(self):
log.info('Validating info.yaml files in `input/data`')
[_.validate() for _ in self.data]
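For reference, a short usage sketch of the two new classes, put together only from what this diff shows (the info.yaml path below is a placeholder that follows the input/data/*/*/info.yaml layout):

from gammacat.input import InputInfo, InputInfoCollection

# Read and validate a single info.yaml against the dataset_info schema.
info = InputInfo.read('input/data/2018/some-paper-id/info.yaml')  # placeholder path
info.validate()

# Or read every info.yaml under input/data and validate them all at once.
infos = InputInfoCollection.read()
infos.validate()
print(len(infos.to_list()), 'info.yaml files found')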

class InputDatasetCollection:
"""
@@ -306,6 +353,16 @@ class InputData:
Expose it as Python objects that can be validated and used.
"""

def __init__(self, schemas=None, sources=None, datasets=None,
ref_infos=None, gammacat_dataset_config=None):
self.path = gammacat_info.base_dir / 'input'
self.schemas = schemas
self.sources = sources
self.datasets = datasets
self.ref_infos = ref_infos
# TODO: self.lightcurves, self.seds
self.gammacat_dataset_config = gammacat_dataset_config

@property
def src_info_list(self):
"""List of all basic source info files in input/sources"""
@@ -339,14 +396,6 @@ def dataset_file_list(self):
paths = path.glob('*/*/tev*.yaml')
return sorted(paths)

def __init__(self, schemas=None, sources=None, datasets=None,
gammacat_dataset_config=None):
self.path = gammacat_info.base_dir / 'input'
self.schemas = schemas
self.sources = sources
self.datasets = datasets
self.gammacat_dataset_config = gammacat_dataset_config

@classmethod
def read(cls):
"""Read all data from disk."""
@@ -355,11 +404,14 @@
schemas = Schemas.read()
sources = BasicSourceList.read()
datasets = InputDatasetCollection.read()
ref_infos = InputInfoCollection.read()
#TODO: lightcurves, seds
gammacat_dataset_config = DatasetConfig.read()
return cls(
schemas=schemas,
sources=sources,
datasets=datasets,
ref_infos=ref_infos,
gammacat_dataset_config=gammacat_dataset_config,
)

@@ -385,6 +437,7 @@ def validate(self):
self.schemas.validate()
self.sources.validate()
self.datasets.validate()
self.ref_infos.validate()

for filename in self.sed_file_list:
SED.read(filename=filename).process()
7 changes: 5 additions & 2 deletions make.py
@@ -8,7 +8,7 @@
import subprocess
import click
from gammacat.info import gammacat_info
from gammacat.collection import CollectionConfig, CollectionMaker
from gammacat.collection import CollectionConfig, CollectionMaker, InputCollection
from gammacat.catalog import CatalogConfig, CatalogMaker
from gammacat.webpage import WebpageConfig, WebpageMaker
from gammacat.checks import CheckerConfig, Checker
@@ -45,7 +45,10 @@ def cli_collection(step):
out_path=gammacat_info.out_path,
step=step,
)
CollectionMaker(config).run()
if (step == 'input-index'):
Collaborator Author: I know this looks ugly and needs a follow-up change! But it works, so I would leave it as it is for now and change it later.

InputCollection(config).run()
else:
CollectionMaker(config).run()
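Regarding the if/else flagged in the comment above, one possible follow-up (purely a sketch, not part of this PR; the run_collection_step helper is hypothetical) is to dispatch on the step through a small mapping so that cli_collection no longer special-cases 'input-index':

# Hypothetical follow-up: map steps to maker classes (names as used in this PR).
MAKERS = {
    'input-index': InputCollection,
    # all other steps keep using the existing maker
}

def run_collection_step(config, step):
    maker_cls = MAKERS.get(step, CollectionMaker)
    maker_cls(config).run()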


@cli.command(name='catalog')