From cf9ab294584880d5f4575890a45707c8f8e37931 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Wed, 30 Oct 2024 13:19:22 +0100 Subject: [PATCH 01/13] Use auth/login endpoint to get token --- src/scitacean/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scitacean/client.py b/src/scitacean/client.py index c0bdcf26..700665e7 100644 --- a/src/scitacean/client.py +++ b/src/scitacean/client.py @@ -1224,7 +1224,7 @@ def _log_in_via_users_login( ) -> httpx.Response: # Currently only used for functional accounts. response = httpx.post( - _url_concat(url, "Users/login"), + _url_concat(url, "auth/login"), json={"username": username.get_str(), "password": password.get_str()}, timeout=timeout.seconds, ) From af1212f526eb2b75fd294f8ff4bf47e462671963 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Wed, 30 Oct 2024 13:19:54 +0100 Subject: [PATCH 02/13] Experiment: Get new models from SciCat --- src/scitacean/_dataset_fields.py | 170 +++++++++++++----- src/scitacean/model.py | 35 ++-- tests/client/dataset_client_test.py | 8 +- tools/model-generation/README.md | 2 +- tools/model-generation/spec/__init__.py | 4 +- tools/model-generation/spec/masked-fields.yml | 3 - tools/model-generation/spec/schema.py | 8 +- .../model-generation/templates/model.py.jinja | 2 +- 8 files changed, 154 insertions(+), 78 deletions(-) diff --git a/src/scitacean/_dataset_fields.py b/src/scitacean/_dataset_fields.py index 265bda42..51816f7d 100644 --- a/src/scitacean/_dataset_fields.py +++ b/src/scitacean/_dataset_fields.py @@ -109,7 +109,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="api_version", - description="Version of the API used in creation of the dataset.", + description="Version of the API used when the dataset was created or last updated. API version is defined in code for each release. Managed by the system.", read_only=True, required=False, scicat_name="version", @@ -119,7 +119,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="classification", - description="ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'", + description="ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'. Please check the following post for more info: https://en.wikipedia.org/wiki/Parkerian_Hexad", read_only=False, required=False, scicat_name="classification", @@ -129,7 +129,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="comment", - description="Comment the user has about a given dataset.", + description="Short comment provided by the user about a given dataset. This is additional to the description field.", read_only=False, required=False, scicat_name="comment", @@ -149,7 +149,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="created_at", - description="Date and time when this record was created. This property is added and maintained by mongoose.", + description="Date and time when this record was created. This field is managed by mongoose with through the timestamp settings. 
The field should be a string containing a date in ISO 8601 format (2024-02-27T12:26:57.313Z)", read_only=True, required=False, scicat_name="createdAt", @@ -169,7 +169,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="creation_location", - description="Unique location identifier where data was taken, usually in the form /Site-name/facility-name/instrumentOrBeamline-name. This field is required if the dataset is a Raw dataset.", + description="Unique location identifier where data was acquired. Usually in the form /Site-name/facility-name/instrumentOrBeamline-name.", read_only=False, required=True, scicat_name="creationLocation", @@ -179,7 +179,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="creation_time", - description="Time when dataset became fully available on disk, i.e. all containing files have been written. Format according to chapter 5.6 internet date/time format in RFC 3339. Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.", + description="Time when dataset became fully available on disk, i.e. all containing files have been written, or the dataset was created in SciCat.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.", read_only=False, required=True, scicat_name="creationTime", @@ -219,7 +219,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="end_time", - description="End time of data acquisition for this dataset, format according to chapter 5.6 internet date/time format in RFC 3339. Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.", + description="End time of data acquisition for the current dataset.
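[Editor's note: a minimal stdlib-only sketch of the timestamp behaviour described in these field specs; this is not scitacean's or SciCat's actual conversion code, and the client's local timezone stands in for the API server's timezone here.]

from datetime import datetime, timezone

# A timezone-aware value carries its own offset and needs no server-side guess.
aware = datetime(1995, 8, 3, 0, 0, tzinfo=timezone.utc)

# fromisoformat handles an explicit offset on all supported Python versions;
# the trailing 'Z' form shown in the descriptions requires Python 3.11+.
parsed = datetime.fromisoformat("2024-02-27T12:26:57.313+00:00")

# A naive value has no offset; the API server would attach its own timezone
# and convert to UTC.  astimezone() approximates that with the local zone.
naive = datetime(1995, 8, 3, 2, 0)
in_utc = naive.astimezone(timezone.utc)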
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.", read_only=False, required=False, scicat_name="endTime", @@ -229,13 +229,13 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="input_datasets", - description="Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs. This field is required if the dataset is a Derived dataset.", + description="Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs.", read_only=False, required=True, scicat_name="inputDatasets", type=list[PID], used_by_derived=True, - used_by_raw=False, + used_by_raw=True, ), Field( name="instrument_group", @@ -257,15 +257,25 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_derived=False, used_by_raw=True, ), + Field( + name="instrument_ids", + description="Id of the instrument or array of IDS of the instruments where the data contained in this dataset was created/acquired.", + read_only=True, + required=False, + scicat_name="instrumentIds", + type=list[str], + used_by_derived=True, + used_by_raw=True, + ), Field( name="investigator", - description="First name and last name of the person or people pursuing the data analysis. The string may contain a list of names, which should then be separated by semicolons.", + description="", read_only=False, required=True, scicat_name="investigator", type=str, used_by_derived=True, - used_by_raw=False, + used_by_raw=True, ), Field( name="is_published", @@ -285,7 +295,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: scicat_name="jobLogData", type=str, used_by_derived=True, - used_by_raw=False, + used_by_raw=True, ), Field( name="job_parameters", @@ -295,7 +305,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: scicat_name="jobParameters", type=dict[str, Any], used_by_derived=True, - used_by_raw=False, + used_by_raw=True, ), Field( name="keywords", @@ -329,7 +339,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="name", - description="A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid. Will be autofilled if missing using info from sourceFolder.", + description="A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid.", read_only=False, required=False, scicat_name="datasetName", @@ -404,12 +414,22 @@ def used_by(self, dataset_type: DatasetType) -> bool: required=False, scicat_name="proposalId", type=str, - used_by_derived=False, + used_by_derived=True, + used_by_raw=True, + ), + Field( + name="proposal_ids", + description="The ID of the proposal to which the dataset belongs to and it has been acquired under.", + read_only=True, + required=False, + scicat_name="proposalIds", + type=list[str], + used_by_derived=True, used_by_raw=True, ), Field( name="relationships", - description="Stores the relationships with other datasets.", + description="Array of relationships with other datasets. 
It contains relationship type and destination dataset", read_only=False, required=False, scicat_name="relationships", @@ -427,9 +447,19 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_derived=False, used_by_raw=True, ), + Field( + name="sample_ids", + description="Single ID or array of IDS of the samples used when collecting the data.", + read_only=True, + required=False, + scicat_name="sampleIds", + type=list[str], + used_by_derived=True, + used_by_raw=True, + ), Field( name="shared_with", - description="List of users that the dataset has been shared with.", + description="List of additional users that the dataset has been shared with.", read_only=False, required=False, scicat_name="sharedWith", @@ -457,9 +487,19 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_derived=True, used_by_raw=True, ), + Field( + name="start_time", + description="Start time of data acquisition for the current dataset.
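[Editor's note: per the field specs above, the writable singular sample_id is kept for uploads while the new sample_ids is read-only and filled in by the backend. A hedged sketch of the resulting user-facing behaviour — names are taken from this spec, and it assumes the public Dataset class forwards these keywords unchanged:]

from scitacean import Dataset

ds = Dataset(type="raw", sample_id="bac.a4")
assert ds.sample_id == "bac.a4"   # writable, sent on upload
assert ds.sample_ids is None      # populated by SciCat, seen on download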
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.", + read_only=False, + required=False, + scicat_name="startTime", + type=datetime, + used_by_derived=False, + used_by_raw=True, + ), Field( name="techniques", - description="Stores the metadata information for techniques.", + description="Array of techniques information, with technique name and pid.", read_only=False, required=False, scicat_name="techniques", @@ -469,7 +509,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="updated_at", - description="Date and time when this record was updated last. This property is added and maintained by mongoose.", + description="Date and time when this record was updated last. This field is managed by mongoose with through the timestamp settings. The field should be a string containing a date in ISO 8601 format (2024-02-27T12:26:57.313Z)", read_only=True, required=False, scicat_name="updatedAt", @@ -489,13 +529,13 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="used_software", - description="A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data. This field is required if the dataset is a Derived dataset.", + description="A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data.", read_only=False, required=True, scicat_name="usedSoftware", type=list[str], used_by_derived=True, - used_by_raw=False, + used_by_raw=True, ), Field( name="validation_status", @@ -526,6 +566,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: "_input_datasets", "_instrument_group", "_instrument_id", + "_instrument_ids", "_investigator", "_is_published", "_job_log_data", @@ -541,11 +582,14 @@ def used_by(self, dataset_type: DatasetType) -> bool: "_pid", "_principal_investigator", "_proposal_id", + "_proposal_ids", "_relationships", "_sample_id", + "_sample_ids", "_shared_with", "_source_folder", "_source_folder_host", + "_start_time", "_techniques", "_updated_at", "_updated_by", @@ -592,6 +636,7 @@ def __init__( shared_with: list[str] | None = None, source_folder: RemotePath | str | None = None, source_folder_host: str | None = None, + start_time: datetime | None = None, techniques: list[Technique] | None = None, used_software: list[str] | None = None, validation_status: str | None = None, @@ -630,14 +675,18 @@ def __init__( self._shared_with = shared_with self._source_folder = _parse_remote_path(source_folder) self._source_folder_host = source_folder_host + self._start_time = start_time self._techniques = techniques self._used_software = used_software self._validation_status = validation_status self._api_version = None self._created_at = None self._created_by = None + self._instrument_ids = None self._lifecycle = None self._pid = None + self._proposal_ids = None + self._sample_ids = None self._updated_at = None self._updated_by = None self._meta = meta or {} @@ -659,27 +708,27 @@ def access_groups(self, access_groups: list[str] | None) -> None: @property def api_version(self) -> str | None: - """Version of the API used in creation of the dataset.""" + """Version of the API used when the dataset was created or last updated. API version is defined in code for each release. 
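[Editor's note: a sketch of constructing a dataset with the arguments this patch adds to __init__ (start_time; input_datasets and used_software now also applying to raw datasets). It assumes the public Dataset class forwards these keywords unchanged; all values are illustrative.]

from datetime import datetime, timezone
from scitacean import Dataset

ds = Dataset(
    type="raw",
    creation_time=datetime(1995, 8, 3, tzinfo=timezone.utc),
    start_time=datetime(1995, 8, 2, 22, 0, tzinfo=timezone.utc),  # new field
    input_datasets=[],            # now used by raw datasets as well
    used_software=["scitacean"],  # likewise
    owner_group="faculty",
)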
Managed by the system.""" return self._api_version @property def classification(self) -> str | None: - """ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'""" + """ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'. Please check the following post for more info: https://en.wikipedia.org/wiki/Parkerian_Hexad""" return self._classification @classification.setter def classification(self, classification: str | None) -> None: - """ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'""" + """ACIA information about AUthenticity,COnfidentiality,INtegrity and AVailability requirements of dataset. E.g. AV(ailabilty)=medium could trigger the creation of a two tape copies. Format 'AV=medium,CO=low'. Please check the following post for more info: https://en.wikipedia.org/wiki/Parkerian_Hexad""" self._classification = classification @property def comment(self) -> str | None: - """Comment the user has about a given dataset.""" + """Short comment provided by the user about a given dataset. This is additional to the description field.""" return self._comment @comment.setter def comment(self, comment: str | None) -> None: - """Comment the user has about a given dataset.""" + """Short comment provided by the user about a given dataset. This is additional to the description field.""" self._comment = comment @property @@ -694,7 +743,7 @@ def contact_email(self, contact_email: str | None) -> None: @property def created_at(self) -> datetime | None: - """Date and time when this record was created. This property is added and maintained by mongoose.""" + """Date and time when this record was created. This field is managed by mongoose with through the timestamp settings. The field should be a string containing a date in ISO 8601 format (2024-02-27T12:26:57.313Z)""" return self._created_at @property @@ -704,22 +753,22 @@ def created_by(self) -> str | None: @property def creation_location(self) -> str | None: - """Unique location identifier where data was taken, usually in the form /Site-name/facility-name/instrumentOrBeamline-name. This field is required if the dataset is a Raw dataset.""" + """Unique location identifier where data was acquired. Usually in the form /Site-name/facility-name/instrumentOrBeamline-name.""" return self._creation_location @creation_location.setter def creation_location(self, creation_location: str | None) -> None: - """Unique location identifier where data was taken, usually in the form /Site-name/facility-name/instrumentOrBeamline-name. This field is required if the dataset is a Raw dataset.""" + """Unique location identifier where data was acquired. Usually in the form /Site-name/facility-name/instrumentOrBeamline-name.""" self._creation_location = creation_location @property def creation_time(self) -> datetime | None: - """Time when dataset became fully available on disk, i.e. all containing files have been written. Format according to chapter 5.6 internet date/time format in RFC 3339. 
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + """Time when dataset became fully available on disk, i.e. all containing files have been written, or the dataset was created in SciCat.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" return self._creation_time @creation_time.setter def creation_time(self, creation_time: str | datetime | None) -> None: - """Time when dataset became fully available on disk, i.e. all containing files have been written. Format according to chapter 5.6 internet date/time format in RFC 3339. Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + """Time when dataset became fully available on disk, i.e. all containing files have been written, or the dataset was created in SciCat.
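[Editor's note: the creation_time setter is typed str | datetime | None and routes both forms through _parse_datetime; a short usage sketch with illustrative values, assuming a Dataset instance ds:]

from datetime import datetime, timezone

ds.creation_time = "2024-02-27T12:26:57+00:00"  # parsed into a datetime
ds.creation_time = datetime(2024, 2, 27, 12, 26, 57, tzinfo=timezone.utc)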
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" self._creation_time = _parse_datetime(creation_time) @property @@ -754,22 +803,22 @@ def description(self, description: str | None) -> None: @property def end_time(self) -> datetime | None: - """End time of data acquisition for this dataset, format according to chapter 5.6 internet date/time format in RFC 3339. Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + """End time of data acquisition for the current dataset.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" return self._end_time @end_time.setter def end_time(self, end_time: datetime | None) -> None: - """End time of data acquisition for this dataset, format according to chapter 5.6 internet date/time format in RFC 3339. Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + """End time of data acquisition for the current dataset.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" self._end_time = end_time @property def input_datasets(self) -> list[PID] | None: - """Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs. This field is required if the dataset is a Derived dataset.""" + """Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs.""" return self._input_datasets @input_datasets.setter def input_datasets(self, input_datasets: list[PID] | None) -> None: - """Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs. This field is required if the dataset is a Derived dataset.""" + """Array of input dataset identifiers used in producing the derived dataset. Ideally these are the global identifier to existing datasets inside this or federated data catalogs.""" self._input_datasets = input_datasets @property @@ -792,14 +841,19 @@ def instrument_id(self, instrument_id: str | None) -> None: """ID of the instrument where the data was created.""" self._instrument_id = instrument_id + @property + def instrument_ids(self) -> list[str] | None: + """Id of the instrument or array of IDS of the instruments where the data contained in this dataset was created/acquired.""" + return self._instrument_ids + @property def investigator(self) -> str | None: - """First name and last name of the person or people pursuing the data analysis. The string may contain a list of names, which should then be separated by semicolons.""" + """""" return self._investigator @investigator.setter def investigator(self, investigator: str | None) -> None: - """First name and last name of the person or people pursuing the data analysis. The string may contain a list of names, which should then be separated by semicolons.""" + """""" self._investigator = investigator @property @@ -859,12 +913,12 @@ def lifecycle(self) -> Lifecycle | None: @property def name(self) -> str | None: - """A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid. Will be autofilled if missing using info from sourceFolder.""" + """A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid.""" return self._name @name.setter def name(self, name: str | None) -> None: - """A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid. Will be autofilled if missing using info from sourceFolder.""" + """A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. 
instead of displaying the pid.""" self._name = name @property @@ -932,14 +986,19 @@ def proposal_id(self, proposal_id: str | None) -> None: """The ID of the proposal to which the dataset belongs.""" self._proposal_id = proposal_id + @property + def proposal_ids(self) -> list[str] | None: + """The ID of the proposal to which the dataset belongs to and it has been acquired under.""" + return self._proposal_ids + @property def relationships(self) -> list[Relationship] | None: - """Stores the relationships with other datasets.""" + """Array of relationships with other datasets. It contains relationship type and destination dataset""" return self._relationships @relationships.setter def relationships(self, relationships: list[Relationship] | None) -> None: - """Stores the relationships with other datasets.""" + """Array of relationships with other datasets. It contains relationship type and destination dataset""" self._relationships = relationships @property @@ -952,14 +1011,19 @@ def sample_id(self, sample_id: str | None) -> None: """ID of the sample used when collecting the data.""" self._sample_id = sample_id + @property + def sample_ids(self) -> list[str] | None: + """Single ID or array of IDS of the samples used when collecting the data.""" + return self._sample_ids + @property def shared_with(self) -> list[str] | None: - """List of users that the dataset has been shared with.""" + """List of additional users that the dataset has been shared with.""" return self._shared_with @shared_with.setter def shared_with(self, shared_with: list[str] | None) -> None: - """List of users that the dataset has been shared with.""" + """List of additional users that the dataset has been shared with.""" self._shared_with = shared_with @property @@ -982,19 +1046,29 @@ def source_folder_host(self, source_folder_host: str | None) -> None: """DNS host name of file server hosting sourceFolder, optionally including a protocol e.g. [protocol://]fileserver1.example.com""" self._source_folder_host = source_folder_host + @property + def start_time(self) -> datetime | None: + """Start time of data acquisition for the current dataset.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + return self._start_time + + @start_time.setter + def start_time(self, start_time: datetime | None) -> None: + """Start time of data acquisition for the current dataset.
It is expected to be in ISO8601 format according to specifications for internet date/time format in RFC 3339, chapter 5.6 (https://www.rfc-editor.org/rfc/rfc3339#section-5).
Local times without timezone/offset info are automatically transformed to UTC using the timezone of the API server.""" + self._start_time = start_time + @property def techniques(self) -> list[Technique] | None: - """Stores the metadata information for techniques.""" + """Array of techniques information, with technique name and pid.""" return self._techniques @techniques.setter def techniques(self, techniques: list[Technique] | None) -> None: - """Stores the metadata information for techniques.""" + """Array of techniques information, with technique name and pid.""" self._techniques = techniques @property def updated_at(self) -> datetime | None: - """Date and time when this record was updated last. This property is added and maintained by mongoose.""" + """Date and time when this record was updated last. This field is managed by mongoose with through the timestamp settings. The field should be a string containing a date in ISO 8601 format (2024-02-27T12:26:57.313Z)""" return self._updated_at @property @@ -1004,12 +1078,12 @@ def updated_by(self) -> str | None: @property def used_software(self) -> list[str] | None: - """A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data. This field is required if the dataset is a Derived dataset.""" + """A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data.""" return self._used_software @used_software.setter def used_software(self, used_software: list[str] | None) -> None: - """A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data. This field is required if the dataset is a Derived dataset.""" + """A list of links to software repositories which uniquely identifies the pieces of software, including versions, used for yielding the derived data.""" self._used_software = used_software @property diff --git a/src/scitacean/model.py b/src/scitacean/model.py index cdc88510..de38c145 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -101,14 +101,11 @@ from .thumbnail import Thumbnail -class DownloadDataset( - BaseModel, masked=("attachments", "datablocks", "history", "origdatablocks") -): +class DownloadDataset(BaseModel, masked=("history",)): contactEmail: str | None = None creationLocation: str | None = None creationTime: datetime | None = None inputDatasets: list[PID] | None = None - investigator: str | None = None numberOfFilesArchived: NonNegativeInt | None = None owner: str | None = None ownerGroup: str | None = None @@ -127,7 +124,7 @@ class DownloadDataset( description: str | None = None endTime: datetime | None = None instrumentGroup: str | None = None - instrumentId: str | None = None + instrumentIds: list[str] | None = None isPublished: bool | None = None jobLogData: str | None = None jobParameters: dict[str, Any] | None = None @@ -141,12 +138,13 @@ class DownloadDataset( ownerEmail: str | None = None packedSize: NonNegativeInt | None = None pid: PID | None = None - proposalId: str | None = None + proposalIds: list[str] | None = None relationships: list[DownloadRelationship] | None = None - sampleId: str | None = None + sampleIds: list[str] | None = None sharedWith: list[str] | None = None size: NonNegativeInt | None = None sourceFolderHost: str | None = None + startTime: datetime | None = None techniques: list[DownloadTechnique] | None = None updatedAt: datetime 
| None = None updatedBy: str | None = None @@ -195,6 +193,7 @@ class UploadDerivedDataset(BaseModel): orcidOfOwner: str | None = None ownerEmail: str | None = None packedSize: NonNegativeInt | None = None + proposalId: str | None = None relationships: list[UploadRelationship] | None = None sharedWith: list[str] | None = None size: NonNegativeInt | None = None @@ -219,12 +218,15 @@ class UploadRawDataset(BaseModel): contactEmail: str creationLocation: str creationTime: datetime + inputDatasets: list[PID] + investigator: str numberOfFilesArchived: NonNegativeInt owner: str ownerGroup: str principalInvestigator: str sourceFolder: RemotePath type: DatasetType + usedSoftware: list[str] accessGroups: list[str] | None = None classification: str | None = None comment: str | None = None @@ -235,6 +237,8 @@ class UploadRawDataset(BaseModel): instrumentGroup: str | None = None instrumentId: str | None = None isPublished: bool | None = None + jobLogData: str | None = None + jobParameters: dict[str, Any] | None = None keywords: list[str] | None = None license: str | None = None scientificMetadata: dict[str, Any] | None = None @@ -249,6 +253,7 @@ class UploadRawDataset(BaseModel): sharedWith: list[str] | None = None size: NonNegativeInt | None = None sourceFolderHost: str | None = None + startTime: datetime | None = None techniques: list[UploadTechnique] | None = None validationStatus: str | None = None @@ -316,13 +321,13 @@ def download_model_type(cls) -> type[DownloadAttachment]: class DownloadOrigDatablock(BaseModel): dataFileList: list[DownloadDataFile] | None = None - datasetId: PID | None = None size: NonNegativeInt | None = None id: str | None = pydantic.Field(alias="_id", default=None) accessGroups: list[str] | None = None chkAlg: str | None = None createdAt: datetime | None = None createdBy: str | None = None + datasetId: PID | None = None instrumentGroup: str | None = None isPublished: bool | None = None ownerGroup: str | None = None @@ -472,9 +477,9 @@ def download_model_type(cls) -> type[DownloadRelationship]: class DownloadHistory(BaseModel): - id: str | None = pydantic.Field(alias="_id", default=None) + id: str | None = None updatedAt: datetime | None = None - updatedBy: datetime | None = None + updatedBy: str | None = None @pydantic.field_validator("updatedAt", mode="before") def _validate_datetime(cls, value: Any) -> Any: @@ -764,20 +769,20 @@ def download_model_type(cls) -> type[DownloadRelationship]: @dataclass(kw_only=True, slots=True) class History(BaseUserModel): - __id: str | None = None + _id: str | None = None _updated_at: datetime | None = None - _updated_by: datetime | None = None + _updated_by: str | None = None @property - def _id(self) -> str | None: - return self.__id + def id(self) -> str | None: + return self._id @property def updated_at(self) -> datetime | None: return self._updated_at @property - def updated_by(self) -> datetime | None: + def updated_by(self) -> str | None: return self._updated_by @classmethod diff --git a/tests/client/dataset_client_test.py b/tests/client/dataset_client_test.py index 73f2ab21..d8fb4287 100644 --- a/tests/client/dataset_client_test.py +++ b/tests/client/dataset_client_test.py @@ -43,7 +43,7 @@ def derived_dataset(scicat_access): @pytest.mark.parametrize("key", ["raw", "derived"]) def test_get_dataset_model(scicat_client, key): dset = INITIAL_DATASETS[key] - downloaded = scicat_client.get_dataset_model(dset.pid) + downloaded = scicat_client.get_dataset_model(dset.pid, strict_validation=True) # The backend may update the dataset after 
upload. # We cannot easily predict when that happens. downloaded.updatedAt = dset.updatedAt @@ -57,7 +57,7 @@ def test_get_dataset_model_bad_id(scicat_client): def test_create_dataset_model(scicat_client, derived_dataset): finalized = scicat_client.create_dataset_model(derived_dataset) - downloaded = scicat_client.get_dataset_model(finalized.pid) + downloaded = scicat_client.get_dataset_model(finalized.pid, strict_validation=True) for key, expected in finalized: # The database populates a number of fields that are None in dset. # But we don't want to test those here as we don't want to test the database. @@ -75,7 +75,7 @@ def test_validate_dataset_model(real_client, require_scicat_backend, derived_dat def test_get_dataset(client): dset = INITIAL_DATASETS["raw"] dblock = INITIAL_ORIG_DATABLOCKS["raw"][0] - downloaded = client.get_dataset(dset.pid) + downloaded = client.get_dataset(dset.pid, strict_validation=True) assert downloaded.source_folder == dset.sourceFolder assert downloaded.creation_time == dset.creationTime @@ -96,7 +96,7 @@ def test_can_get_public_dataset_without_login(require_scicat_backend, scicat_acc dset = INITIAL_DATASETS["public"] dblock = INITIAL_ORIG_DATABLOCKS["public"][0] - downloaded = client.get_dataset(dset.pid) + downloaded = client.get_dataset(dset.pid, strict_validation=True) assert downloaded.source_folder == dset.sourceFolder assert downloaded.creation_time == dset.creationTime diff --git a/tools/model-generation/README.md b/tools/model-generation/README.md index b59d4adb..4a2ecd87 100644 --- a/tools/model-generation/README.md +++ b/tools/model-generation/README.md @@ -24,7 +24,7 @@ python generate_models.py --launch-scicat ``` This overwrites the relevant files in the source directory. -If will clean up the docker resources afterwards. +It will clean up the docker resources afterward. See `generate_models.py` for options to configure the schema URL and output file paths. diff --git a/tools/model-generation/spec/__init__.py b/tools/model-generation/spec/__init__.py index 43b53ac9..e1b53779 100644 --- a/tools/model-generation/spec/__init__.py +++ b/tools/model-generation/spec/__init__.py @@ -160,8 +160,8 @@ def _collect_schemas( ) -> dict[str, _UpDownSchemas | _DatasetSchemas]: return { "Dataset": _DatasetSchemas( - upload_derived=schemas["CreateDerivedDatasetDto"], - upload_raw=schemas["CreateRawDatasetDto"], + upload_derived=schemas["CreateDerivedDatasetObsoleteDto"], + upload_raw=schemas["CreateRawDatasetObsoleteDto"], download=schemas["DatasetClass"], ), **{ diff --git a/tools/model-generation/spec/masked-fields.yml b/tools/model-generation/spec/masked-fields.yml index 1af45ac9..a58a3c11 100644 --- a/tools/model-generation/spec/masked-fields.yml +++ b/tools/model-generation/spec/masked-fields.yml @@ -4,8 +4,5 @@ # what model to mask it in. # Field names must be SciCat names (camelCase). 
Dataset: - - attachments - - datablocks - history # because history is dropped (see field-validations.yml) - - origdatablocks - datasetlifecycle: upload diff --git a/tools/model-generation/spec/schema.py b/tools/model-generation/spec/schema.py index 0905556a..43aea246 100644 --- a/tools/model-generation/spec/schema.py +++ b/tools/model-generation/spec/schema.py @@ -30,10 +30,10 @@ def parse_field_type(spec: dict[str, Any]): return parse_field_type(spec["allOf"][0]) if "$ref" in spec: return spec["$ref"].rsplit("/", 1)[1] - if "enum" in spec: - if spec["type"] != "string": - raise ValueError(f"Enum fields must have type 'string', got: {spec}") - return "Enum[" + ", ".join(spec["enum"]) + "]" + # if "enum" in spec: + # if spec["type"] != "string": + # raise ValueError(f"Enum fields must have type 'string', got: {spec}") + # return "Enum[" + ", ".join(spec["enum"]) + "]" if spec["type"] == "number": return "int" if spec["type"] == "string": diff --git a/tools/model-generation/templates/model.py.jinja b/tools/model-generation/templates/model.py.jinja index fc247ad9..c03414af 100644 --- a/tools/model-generation/templates/model.py.jinja +++ b/tools/model-generation/templates/model.py.jinja @@ -12,7 +12,7 @@ {% macro mask_keyword(spec, kind) %} {% if kind == "download" and spec.masked_fields_download %} -, masked=({{ spec.masked_fields_download|map("quote")|join(", ") }}) +, masked=({{ spec.masked_fields_download|map("quote")|join(", ") }},) {% endif %} {% endmacro %} From 70cd90cf32021a6449d3fd45cbb8476c9d3a14f6 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Mon, 11 Nov 2024 15:02:44 +0100 Subject: [PATCH 03/13] Attempt 2 --- src/scitacean/_base_model.py | 2 +- src/scitacean/_dataset_fields.py | 120 +++++++++++++------------ src/scitacean/client.py | 6 +- src/scitacean/dataset.py | 4 +- src/scitacean/model.py | 12 +-- src/scitacean/testing/backend/seed.py | 4 + tests/client/attachment_client_test.py | 2 +- tests/client/query_client_test.py | 28 +++--- tests/dataset_fields_test.py | 3 + tests/dataset_test.py | 51 +++-------- tests/html_repr/html_repr_test.py | 2 +- tests/model_test.py | 6 +- tools/model-generation/spec/schema.py | 4 - 13 files changed, 119 insertions(+), 125 deletions(-) diff --git a/src/scitacean/_base_model.py b/src/scitacean/_base_model.py index b2d20a1f..6b3f0575 100644 --- a/src/scitacean/_base_model.py +++ b/src/scitacean/_base_model.py @@ -313,7 +313,7 @@ def _model_field_name_of(cls_name: str, name: str) -> str: Converts snake_case to camelCase and strips leading underscores. E.g., - `proposal_id` -> `proposalId`, + `proposal_ids` -> `proposalIds`, `_created_at` -> `createdAt`, `_History__id` -> `id`. """ diff --git a/src/scitacean/_dataset_fields.py b/src/scitacean/_dataset_fields.py index 51816f7d..40fdf74d 100644 --- a/src/scitacean/_dataset_fields.py +++ b/src/scitacean/_dataset_fields.py @@ -99,7 +99,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="access_groups", - description="Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. 
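[Editor's note: the model.py.jinja change above appends a trailing comma inside masked=(...). In Python, a parenthesized single element without the comma is just that element, not a tuple, which would otherwise break single-entry masks such as the lone "history" field:]

masked = ("history")   # a plain str; the parentheses are grouping only
masked = ("history",)  # a 1-tuple, matching what the masked= keyword expects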
The special group 'public' makes data available to all users.", + description="List of groups which have access to this item.", read_only=False, required=False, scicat_name="accessGroups", @@ -199,7 +199,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="data_quality_metrics", - description="Data Quality Metrics given by the user to rate the dataset.", + description="Data Quality Metrics is a number given by the user to rate the dataset.", read_only=False, required=False, scicat_name="dataQualityMetrics", @@ -239,7 +239,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="instrument_group", - description="Optional additional groups which have read and write access to the data. Users which are members in one of the groups listed here are allowed to access this data.", + description="Group of the instrument which this item was acquired on.", read_only=False, required=False, scicat_name="instrumentGroup", @@ -248,12 +248,12 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_raw=True, ), Field( - name="instrument_id", + name="instrument_ids", description="ID of the instrument where the data was created.", read_only=False, required=False, - scicat_name="instrumentId", - type=str, + scicat_name="instrumentIds", + type=list[str], used_by_derived=False, used_by_raw=True, ), @@ -379,7 +379,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="owner_group", - description="Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151", + description="Name of the group owning this item.", read_only=False, required=True, scicat_name="ownerGroup", @@ -389,7 +389,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: ), Field( name="pid", - description="Persistent Identifier for datasets derived from UUIDv4 and prepended automatically by site specific PID prefix like 20.500.12345/", + description="Persistent identifier of the dataset.", read_only=True, required=False, scicat_name="pid", @@ -408,20 +408,10 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_raw=True, ), Field( - name="proposal_id", + name="proposal_ids", description="The ID of the proposal to which the dataset belongs.", read_only=False, required=False, - scicat_name="proposalId", - type=str, - used_by_derived=True, - used_by_raw=True, - ), - Field( - name="proposal_ids", - description="The ID of the proposal to which the dataset belongs to and it has been acquired under.", - read_only=True, - required=False, scicat_name="proposalIds", type=list[str], used_by_derived=True, @@ -438,12 +428,22 @@ def used_by(self, dataset_type: DatasetType) -> bool: used_by_raw=True, ), Field( - name="sample_id", - description="ID of the sample used when collecting the data.", + name="run_number", + description="Run number assigned by the system to the data acquisition for the current dataset.", read_only=False, required=False, - scicat_name="sampleId", + scicat_name="runNumber", type=str, + used_by_derived=True, + used_by_raw=True, + ), + Field( + name="sample_ids", + description="ID of the sample used when collecting the data.", + read_only=False, + required=False, + scicat_name="sampleIds", + type=list[str], used_by_derived=False, used_by_raw=True, ), @@ -565,7 +565,6 @@ def used_by(self, dataset_type: DatasetType) -> bool: "_end_time", "_input_datasets", "_instrument_group", - "_instrument_id", "_instrument_ids", "_investigator", "_is_published", @@ -581,10 +580,9 @@ def used_by(self, dataset_type: 
DatasetType) -> bool: "_owner_group", "_pid", "_principal_investigator", - "_proposal_id", "_proposal_ids", "_relationships", - "_sample_id", + "_run_number", "_sample_ids", "_shared_with", "_source_folder", @@ -617,7 +615,7 @@ def __init__( end_time: datetime | None = None, input_datasets: list[PID] | None = None, instrument_group: str | None = None, - instrument_id: str | None = None, + instrument_ids: list[str] | None = None, investigator: str | None = None, is_published: bool | None = None, job_log_data: str | None = None, @@ -630,9 +628,10 @@ def __init__( owner_email: str | None = None, owner_group: str | None = None, principal_investigator: str | None = None, - proposal_id: str | None = None, + proposal_ids: list[str] | None = None, relationships: list[Relationship] | None = None, - sample_id: str | None = None, + run_number: str | None = None, + sample_ids: list[str] | None = None, shared_with: list[str] | None = None, source_folder: RemotePath | str | None = None, source_folder_host: str | None = None, @@ -656,7 +655,7 @@ def __init__( self._end_time = end_time self._input_datasets = input_datasets self._instrument_group = instrument_group - self._instrument_id = instrument_id + self._instrument_ids = instrument_ids self._investigator = investigator self._is_published = is_published self._job_log_data = job_log_data @@ -669,9 +668,10 @@ def __init__( self._owner_email = owner_email self._owner_group = owner_group self._principal_investigator = principal_investigator - self._proposal_id = proposal_id + self._proposal_ids = proposal_ids self._relationships = relationships - self._sample_id = sample_id + self._run_number = run_number + self._sample_ids = sample_ids self._shared_with = shared_with self._source_folder = _parse_remote_path(source_folder) self._source_folder_host = source_folder_host @@ -698,12 +698,12 @@ def __init__( @property def access_groups(self) -> list[str] | None: - """Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. The special group 'public' makes data available to all users.""" + """List of groups which have access to this item.""" return self._access_groups @access_groups.setter def access_groups(self, access_groups: list[str] | None) -> None: - """Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. The special group 'public' makes data available to all users.""" + """List of groups which have access to this item.""" self._access_groups = access_groups @property @@ -783,12 +783,12 @@ def data_format(self, data_format: str | None) -> None: @property def data_quality_metrics(self) -> int | None: - """Data Quality Metrics given by the user to rate the dataset.""" + """Data Quality Metrics is a number given by the user to rate the dataset.""" return self._data_quality_metrics @data_quality_metrics.setter def data_quality_metrics(self, data_quality_metrics: int | None) -> None: - """Data Quality Metrics given by the user to rate the dataset.""" + """Data Quality Metrics is a number given by the user to rate the dataset.""" self._data_quality_metrics = data_quality_metrics @property @@ -823,23 +823,23 @@ def input_datasets(self, input_datasets: list[PID] | None) -> None: @property def instrument_group(self) -> str | None: - """Optional additional groups which have read and write access to the data. 
Users which are members in one of the groups listed here are allowed to access this data.""" + """Group of the instrument which this item was acquired on.""" return self._instrument_group @instrument_group.setter def instrument_group(self, instrument_group: str | None) -> None: - """Optional additional groups which have read and write access to the data. Users which are members in one of the groups listed here are allowed to access this data.""" + """Group of the instrument which this item was acquired on.""" self._instrument_group = instrument_group @property - def instrument_id(self) -> str | None: + def instrument_ids(self) -> list[str] | None: """ID of the instrument where the data was created.""" - return self._instrument_id + return self._instrument_ids - @instrument_id.setter - def instrument_id(self, instrument_id: str | None) -> None: + @instrument_ids.setter + def instrument_ids(self, instrument_ids: list[str] | None) -> None: """ID of the instrument where the data was created.""" - self._instrument_id = instrument_id + self._instrument_ids = instrument_ids @property def instrument_ids(self) -> list[str] | None: @@ -953,17 +953,17 @@ def owner_email(self, owner_email: str | None) -> None: @property def owner_group(self) -> str | None: - """Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151""" + """Name of the group owning this item.""" return self._owner_group @owner_group.setter def owner_group(self, owner_group: str | None) -> None: - """Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151""" + """Name of the group owning this item.""" self._owner_group = owner_group @property def pid(self) -> PID | None: - """Persistent Identifier for datasets derived from UUIDv4 and prepended automatically by site specific PID prefix like 20.500.12345/""" + """Persistent identifier of the dataset.""" return self._pid @property @@ -977,14 +977,14 @@ def principal_investigator(self, principal_investigator: str | None) -> None: self._principal_investigator = principal_investigator @property - def proposal_id(self) -> str | None: + def proposal_ids(self) -> list[str] | None: """The ID of the proposal to which the dataset belongs.""" - return self._proposal_id + return self._proposal_ids - @proposal_id.setter - def proposal_id(self, proposal_id: str | None) -> None: + @proposal_ids.setter + def proposal_ids(self, proposal_ids: list[str] | None) -> None: """The ID of the proposal to which the dataset belongs.""" - self._proposal_id = proposal_id + self._proposal_ids = proposal_ids @property def proposal_ids(self) -> list[str] | None: @@ -1002,14 +1002,24 @@ def relationships(self, relationships: list[Relationship] | None) -> None: self._relationships = relationships @property - def sample_id(self) -> str | None: + def run_number(self) -> str | None: + """Run number assigned by the system to the data acquisition for the current dataset.""" + return self._run_number + + @run_number.setter + def run_number(self, run_number: str | None) -> None: + """Run number assigned by the system to the data acquisition for the current dataset.""" + self._run_number = run_number + + @property + def sample_ids(self) -> list[str] | None: """ID of the sample used when collecting the data.""" - return self._sample_id + return self._sample_ids - @sample_id.setter - def sample_id(self, sample_id: str | None) -> None: + @sample_ids.setter + def sample_ids(self, sample_ids: str | None) -> 
None: """ID of the sample used when collecting the data.""" - self._sample_id = sample_id + self._sample_ids = sample_ids @property def sample_ids(self) -> list[str] | None: diff --git a/src/scitacean/client.py b/src/scitacean/client.py index 700665e7..06f96e74 100644 --- a/src/scitacean/client.py +++ b/src/scitacean/client.py @@ -761,7 +761,7 @@ def query_datasets( .. code-block:: python - scicat_client.query_datasets({'proposalId': 'abc.123'}) + scicat_client.query_datasets({'proposalIds': ['abc.123']}) Get all datasets that belong to proposal ``abc.123`` **and** have name ``"ds name"``: (The name and proposal must match exactly.) @@ -769,7 +769,7 @@ def query_datasets( .. code-block:: python scicat_client.query_datasets({ - 'proposalId': 'abc.123', + 'proposalIds': ['abc.123'], 'datasetName': 'ds name' }) @@ -778,7 +778,7 @@ def query_datasets( .. code-block:: python scicat_client.query_datasets( - {'proposalId': 'bc.123'}, + {'proposalIds': ['bc.123']}, limit=5, order="creationTime:desc", ) diff --git a/src/scitacean/dataset.py b/src/scitacean/dataset.py index 36430ba6..95bc6026 100644 --- a/src/scitacean/dataset.py +++ b/src/scitacean/dataset.py @@ -437,7 +437,7 @@ def make_upload_model(self) -> UploadDerivedDataset | UploadRawDataset: ) # Datablocks are not included here because they are handled separately # by make_datablock_upload_models and their own endpoints. - special = ("relationships", "techniques") + special = ("relationships", "techniques", "input_datasets", "used_software") return model( numberOfFiles=self.number_of_files, numberOfFilesArchived=self.number_of_files_archived, @@ -450,6 +450,8 @@ def make_upload_model(self) -> UploadDerivedDataset | UploadRawDataset: relationships=convert_user_to_upload_model( # type: ignore[arg-type] self.relationships ), + inputDatasets=self.input_datasets or [], + usedSoftware=self.used_software or [], **{ field.scicat_name: value for field in self.fields() diff --git a/src/scitacean/model.py b/src/scitacean/model.py index de38c145..57286018 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -140,6 +140,7 @@ class DownloadDataset(BaseModel, masked=("history",)): pid: PID | None = None proposalIds: list[str] | None = None relationships: list[DownloadRelationship] | None = None + runNumber: str | None = None sampleIds: list[str] | None = None sharedWith: list[str] | None = None size: NonNegativeInt | None = None @@ -193,8 +194,9 @@ class UploadDerivedDataset(BaseModel): orcidOfOwner: str | None = None ownerEmail: str | None = None packedSize: NonNegativeInt | None = None - proposalId: str | None = None + proposalIds: list[str] | None = None relationships: list[UploadRelationship] | None = None + runNumber: str | None = None sharedWith: list[str] | None = None size: NonNegativeInt | None = None sourceFolderHost: str | None = None @@ -219,7 +221,6 @@ class UploadRawDataset(BaseModel): creationLocation: str creationTime: datetime inputDatasets: list[PID] - investigator: str numberOfFilesArchived: NonNegativeInt owner: str ownerGroup: str @@ -235,7 +236,7 @@ class UploadRawDataset(BaseModel): description: str | None = None endTime: datetime | None = None instrumentGroup: str | None = None - instrumentId: str | None = None + instrumentIds: list[str] | None = None isPublished: bool | None = None jobLogData: str | None = None jobParameters: dict[str, Any] | None = None @@ -247,9 +248,10 @@ class UploadRawDataset(BaseModel): orcidOfOwner: str | None = None ownerEmail: str | None = None packedSize: NonNegativeInt | None = None - 
proposalId: str | None = None + proposalIds: list[str] | None = None relationships: list[UploadRelationship] | None = None - sampleId: str | None = None + runNumber: str | None = None + sampleIds: list[str] | None = None sharedWith: list[str] | None = None size: NonNegativeInt | None = None sourceFolderHost: str | None = None diff --git a/src/scitacean/testing/backend/seed.py b/src/scitacean/testing/backend/seed.py index 4d8f188e..eed7ceed 100644 --- a/src/scitacean/testing/backend/seed.py +++ b/src/scitacean/testing/backend/seed.py @@ -56,6 +56,8 @@ "temperature": {"value": "123", "unit": "K"}, "weight": {"value": "42", "unit": "mg"}, }, + usedSoftware=[], + inputDatasets=[], ), "derived": UploadDerivedDataset( ownerGroup="PLACEHOLDER", @@ -96,6 +98,8 @@ principalInvestigator="Mustrum Ridcully", creationLocation=SITE, techniques=[UploadTechnique(pid="S", name="shoes")], + inputDatasets=[], + usedSoftware=["scitacean"], ), "partially-broken": model.construct( UploadDerivedDataset, diff --git a/tests/client/attachment_client_test.py b/tests/client/attachment_client_test.py index c627c0d1..99f575e7 100644 --- a/tests/client/attachment_client_test.py +++ b/tests/client/attachment_client_test.py @@ -120,7 +120,7 @@ def test_create_attachment_for_dataset_for_dataset_populates_ids( assert finalized.id is not None assert finalized.datasetId is not None assert finalized.sampleId is None - assert finalized.proposalId is None + assert finalized.proposalIds is None def test_get_attachments_for_dataset(scicat_client): diff --git a/tests/client/query_client_test.py b/tests/client/query_client_test.py index 243b25e0..1a478351 100644 --- a/tests/client/query_client_test.py +++ b/tests/client/query_client_test.py @@ -22,7 +22,9 @@ type=DatasetType.RAW, principalInvestigator="investigator 1", creationLocation="UU", - proposalId="p0124", + proposalIds=["p0124"], + inputDatasets=[], + usedSoftware=["scitacean"], ), "raw2": model.UploadRawDataset( ownerGroup="PLACEHOLDER", @@ -37,7 +39,9 @@ type=DatasetType.RAW, principalInvestigator="investigator 2", creationLocation="UU", - proposalId="p0124", + proposalIds=["p0124"], + inputDatasets=[], + usedSoftware=[], ), "raw3": model.UploadRawDataset( ownerGroup="PLACEHOLDER", @@ -52,7 +56,9 @@ type=DatasetType.RAW, principalInvestigator="investigator 1", creationLocation="UU", - proposalId="p0124", + proposalIds=["p0124"], + inputDatasets=[], + usedSoftware=["scitacean"], ), "raw4": model.UploadRawDataset( ownerGroup="PLACEHOLDER", @@ -67,6 +73,8 @@ type=DatasetType.RAW, principalInvestigator="investigator X", creationLocation="UU", + inputDatasets=[], + usedSoftware=[], ), "derived1": model.UploadDerivedDataset( ownerGroup="PLACEHOLDER", @@ -118,7 +126,7 @@ def _seed_database(request: pytest.FixtureRequest, scicat_access: SciCatAccess) @pytest.mark.usefixtures("_seed_database") def test_query_dataset_multiple_by_single_field(real_client): - datasets = real_client.scicat.query_datasets({"proposalId": "p0124"}) + datasets = real_client.scicat.query_datasets({"proposalIds": ["p0124"]}) actual = {ds.pid: ds for ds in datasets} expected = {SEED[key].pid: SEED[key] for key in ("raw1", "raw2", "raw3")} assert actual == expected @@ -133,7 +141,7 @@ def test_query_dataset_no_match(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_multiple_by_multiple_fields(real_client): datasets = real_client.scicat.query_datasets( - {"proposalId": "p0124", "principalInvestigator": "investigator 1"}, + {"proposalIds": ["p0124"], "principalInvestigator": 
"investigator 1"}, ) actual = {ds.pid: ds for ds in datasets} expected = {SEED[key].pid: SEED[key] for key in ("raw1", "raw3")} @@ -153,7 +161,7 @@ def test_query_dataset_multiple_by_derived_field(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_uses_conjunction_of_fields(real_client): datasets = real_client.scicat.query_datasets( - {"proposalId": "p0124", "investigator": "investigator X"}, + {"proposalIds": ["p0124"], "investigator": "investigator X"}, ) assert not datasets @@ -170,7 +178,7 @@ def test_query_dataset_can_use_custom_type(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_set_order(real_client): datasets = real_client.scicat.query_datasets( - {"proposalId": "p0124"}, + {"proposalIds": ["p0124"]}, order="creationTime:desc", ) # This test uses a list to check the order @@ -181,7 +189,7 @@ def test_query_dataset_set_order(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_limit_ascending_creation_time(real_client): datasets = real_client.scicat.query_datasets( - {"proposalId": "p0124"}, + {"proposalIds": "p0124"}, limit=2, order="creationTime:asc", ) @@ -193,7 +201,7 @@ def test_query_dataset_limit_ascending_creation_time(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_limit_descending_creation_time(real_client): datasets = real_client.scicat.query_datasets( - {"proposalId": "p0124"}, + {"proposalIds": ["p0124"]}, limit=2, order="creationTime:desc", ) @@ -206,7 +214,7 @@ def test_query_dataset_limit_descending_creation_time(real_client): def test_query_dataset_limit_needs_order(real_client): with pytest.raises(ValueError, match="limit"): real_client.scicat.query_datasets( - {"proposalId": "p0124"}, + {"proposalIds": ["p0124"]}, limit=2, ) diff --git a/tests/dataset_fields_test.py b/tests/dataset_fields_test.py index 84e090bc..9a9b245b 100644 --- a/tests/dataset_fields_test.py +++ b/tests/dataset_fields_test.py @@ -344,6 +344,7 @@ def test_make_raw_model(): source_folder=RemotePath("/hex/source62"), creation_location="ANK/UU", shared_with=["librarian", "hicks"], + used_software=["scitacean"], ) expected = UploadRawDataset( contactEmail="p.stibbons@uu.am", @@ -360,6 +361,8 @@ def test_make_raw_model(): numberOfFilesArchived=0, packedSize=0, size=0, + inputDatasets=[], + usedSoftware=["scitacean"], ) assert dset.make_upload_model() == expected diff --git a/tests/dataset_test.py b/tests/dataset_test.py index bb649419..2a37f67c 100644 --- a/tests/dataset_test.py +++ b/tests/dataset_test.py @@ -43,7 +43,7 @@ def raw_download_model(): description="Some shady data", endTime=parse_datetime("1995-08-03T00:00:00Z"), instrumentGroup="professors", - instrumentId="0000-aa", + instrumentIds=["0000-aa"], isPublished=True, jobLogData=None, jobParameters=None, @@ -55,8 +55,8 @@ def raw_download_model(): ownerEmail="m.ridcully@uu.am", packedSize=0, pid=PID.parse("123.cc/948.f7.2a"), - proposalId="33.dc", - sampleId="bac.a4", + proposalIds=["33.dc"], + sampleIds=["bac.a4"], sharedWith=["librarian"], size=400, sourceFolderHost="ftp://uu.am/data", @@ -112,7 +112,7 @@ def derived_download_model(): description="Dubiously analyzed data", endTime=None, instrumentGroup="professors", - instrumentId=None, + instrumentIds=None, isPublished=True, jobLogData="process interrupted", jobParameters={"nodes": 4}, @@ -124,8 +124,8 @@ def derived_download_model(): ownerEmail="m.ridcully@uu.am", packedSize=0, pid=PID.parse("123.cc/948.f7.2a"), - proposalId=None, - sampleId=None, + proposalIds=None, 
+        sampleIds=None,
         sharedWith=["librarian"],
         size=400,
         sourceFolderHost="ftp://uu.am/data",
@@ -797,13 +797,8 @@ def test_derive_removes_attachments(initial, attachments):
     assert derived.attachments == []
 
 
-def invalid_field_example(my_type):
-    if my_type == DatasetType.DERIVED:
-        return "data_format", "sth_not_None"
-    elif my_type == DatasetType.RAW:
-        return "job_log_data", "sth_not_None"
-    else:
-        raise ValueError(my_type, " is not valid DatasetType.")
+def invalid_field_example() -> tuple[str, str]:
+    return "not_a_field", "sth_not_None"
 
 
 @given(initial=sst.datasets(for_upload=True))
 @settings(max_examples=10)
@@ -815,22 +810,6 @@ def test_dataset_dict_like_keys_per_type(initial: Dataset) -> None:
     assert set(initial.keys()) == my_names
 
 
-@given(initial=sst.datasets(for_upload=True))
-@settings(max_examples=10)
-def test_dataset_dict_like_keys_including_invalid_field(initial):
-    invalid_name, invalid_value = invalid_field_example(initial.type)
-
-    my_names = {
-        field.name for field in Dataset._FIELD_SPEC if field.used_by(initial.type)
-    }
-    assert invalid_name not in my_names
-    my_names.add(invalid_name)
-
-    setattr(initial, invalid_name, invalid_value)
-
-    assert set(initial.keys()) == my_names
-
-
 @given(initial=sst.datasets(for_upload=True))
 @settings(max_examples=10)
 def test_dataset_dict_like_values(initial: Dataset) -> None:
@@ -841,7 +820,7 @@ def test_dataset_dict_like_values(initial: Dataset) -> None:
 @given(initial=sst.datasets(for_upload=True))
 @settings(max_examples=10)
 def test_dataset_dict_like_values_with_invalid_field(initial: Dataset) -> None:
-    setattr(initial, *invalid_field_example(initial.type))
+    setattr(initial, *invalid_field_example())
     for key, value in zip(initial.keys(), initial.values(), strict=True):
         assert value == getattr(initial, key)
 
@@ -849,7 +828,7 @@ def test_dataset_dict_like_values_with_invalid_field(initial: Dataset) -> None:
 @given(initial=sst.datasets(for_upload=True))
 @settings(max_examples=10)
 def test_dataset_dict_like_items_with_invalid_field(initial: Dataset) -> None:
-    setattr(initial, *invalid_field_example(initial.type))
+    setattr(initial, *invalid_field_example())
     for key, value in initial.items():
         assert value == getattr(initial, key)
 
@@ -884,16 +863,6 @@ def test_dataset_dict_like_setitem(initial: Dataset) -> None:
     assert initial["comment"] == sample_comment
 
 
-@given(initial=sst.datasets(for_upload=True))
-@settings(max_examples=10)
-def test_dataset_dict_like_setitem_invalid_field(initial: Dataset) -> None:
-    # ``__setitem__`` doesn't check if the item is invalid for the current type or not.
-    invalid_field, invalid_value = invalid_field_example(initial.type)
-    assert initial[invalid_field] is None
-    initial[invalid_field] = invalid_value
-    assert initial[invalid_field] == invalid_value
-
-
 @pytest.mark.parametrize(
     ("is_attr", "wrong_field", "wrong_value"),
     [(True, "size", 10), (False, "OBVIOUSLYWRONGNAME", "OBVIOUSLYWRONGVALUE")],
diff --git a/tests/html_repr/html_repr_test.py b/tests/html_repr/html_repr_test.py
index da172ffc..433baa61 100644
--- a/tests/html_repr/html_repr_test.py
+++ b/tests/html_repr/html_repr_test.py
@@ -13,7 +13,7 @@ def test_dataset_html_repr():
         name="My dataset",
         contact_email="devsci.cat",
         owner="The People",
-        instrument_id="the-peoples-neutron-gun",
+        instrument_ids=["the-peoples-neutron-gun"],
         used_software=["scitacean"],
         source_folder=RemotePath("/remote/dir/"),
         meta={
diff --git a/tests/model_test.py b/tests/model_test.py
index 59b3903b..ee33206c 100644
--- a/tests/model_test.py
+++ b/tests/model_test.py
@@ -200,12 +200,12 @@ def test_raw_dataset_default_values(real_client, require_scicat_backend, scicat_
     assert finalized.dataFormat is None
     assert finalized.description is None
     assert finalized.endTime is None
-    assert finalized.instrumentId is None
+    assert finalized.instrumentIds is None
     assert finalized.license is None
     assert finalized.orcidOfOwner is None
    assert finalized.ownerEmail is None
-    assert finalized.proposalId is None
-    assert finalized.sampleId is None
+    assert finalized.proposalIds is None
+    assert finalized.sampleIds is None
     assert finalized.sourceFolderHost is None
     assert finalized.validationStatus is None
     assert finalized.version is None
diff --git a/tools/model-generation/spec/schema.py b/tools/model-generation/spec/schema.py
index 43aea246..9632c0a0 100644
--- a/tools/model-generation/spec/schema.py
+++ b/tools/model-generation/spec/schema.py
@@ -30,10 +30,6 @@ def parse_field_type(spec: dict[str, Any]):
         return parse_field_type(spec["allOf"][0])
     if "$ref" in spec:
         return spec["$ref"].rsplit("/", 1)[1]
-    # if "enum" in spec:
-    #     if spec["type"] != "string":
-    #         raise ValueError(f"Enum fields must have type 'string', got: {spec}")
-    #     return "Enum[" + ", ".join(spec["enum"]) + "]"
     if spec["type"] == "number":
         return "int"
     if spec["type"] == "string":

From 361f034cd57f8927c9200196ff90dd531bee937d Mon Sep 17 00:00:00 2001
From: Jan-Lukas Wynen
Date: Mon, 16 Dec 2024 11:38:46 +0100
Subject: [PATCH 04/13] Make investigator optional

---
 src/scitacean/model.py | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/scitacean/model.py b/src/scitacean/model.py
index 57286018..152b2862 100644
--- a/src/scitacean/model.py
+++ b/src/scitacean/model.py
@@ -140,7 +140,6 @@ class DownloadDataset(BaseModel, masked=("history",)):
     pid: PID | None = None
     proposalIds: list[str] | None = None
     relationships: list[DownloadRelationship] | None = None
-    runNumber: str | None = None
     sampleIds: list[str] | None = None
     sharedWith: list[str] | None = None
     size: NonNegativeInt | None = None
@@ -194,9 +193,8 @@ class UploadDerivedDataset(BaseModel):
     orcidOfOwner: str | None = None
     ownerEmail: str | None = None
     packedSize: NonNegativeInt | None = None
-    proposalIds: list[str] | None = None
+    proposalId: str | None = None
     relationships: list[UploadRelationship] | None = None
-    runNumber: str | None = None
     sharedWith: list[str] | None = None
     size: NonNegativeInt | None = None
     sourceFolderHost: str | None = None
@@ -221,6 +219,7 @@ class UploadRawDataset(BaseModel):
     creationLocation: str
     creationTime: datetime
     inputDatasets: list[PID]
+    investigator: str | None = None
     numberOfFilesArchived: NonNegativeInt
     owner: str
     ownerGroup: str
@@ -236,7 +235,7 @@ class UploadRawDataset(BaseModel):
     description: str | None = None
     endTime: datetime | None = None
     instrumentGroup: str | None = None
-    instrumentIds: list[str] | None = None
+    instrumentId: str | None = None
     isPublished: bool | None = None
     jobLogData: str | None = None
     jobParameters: dict[str, Any] | None = None
@@ -248,10 +247,9 @@ class UploadRawDataset(BaseModel):
     orcidOfOwner: str | None = None
     ownerEmail: str | None = None
     packedSize: NonNegativeInt | None = None
-    proposalIds: list[str] | None = None
+    proposalId: str | None = None
     relationships: list[UploadRelationship] | None = None
-    runNumber: str | None = None
-    sampleIds: list[str] | None = None
+    sampleId: str | None = None
     sharedWith: list[str] | None = None
     size: NonNegativeInt | None = None
     sourceFolderHost: str | None = None
@@ -259,6 +257,19 @@ class UploadRawDataset(BaseModel):
     techniques: list[UploadTechnique] | None = None
     validationStatus: str | None = None
 
+    @pydantic.model_validator(mode="before")
+    @classmethod
+    def _set_investigator(cls, data):
+        # The model currently has both `investigator` and `principalInvestigator`
+        # and both are mandatory. Eventually, `investigator` will be removed.
+        # So make sure we can construct the model if only one is given.
+        if isinstance(data, dict):
+            if (inv := data.get("investigator")) is not None:
+                data.setdefault("principalInvestigator", inv)
+            elif (pi := data.get("principalInvestigator")) is not None:
+                data["investigator"] = pi
+        return data
+
     @pydantic.field_validator("creationTime", "endTime", mode="before")
     def _validate_datetime(cls, value: Any) -> Any:
         return validate_datetime(value)

From 61efd9e0285457b0697b798081578bfbf98b73e2 Mon Sep 17 00:00:00 2001
From: Jan-Lukas Wynen
Date: Mon, 16 Dec 2024 14:11:21 +0100
Subject: [PATCH 05/13] Work around schema inconsistencies

---
 src/scitacean/_dataset_fields.py              | 124 ++++++++----------
 src/scitacean/model.py                        |   2 +-
 src/scitacean/testing/backend/seed.py         |   2 +
 src/scitacean/testing/client.py               |   4 +
 tests/client/attachment_client_test.py        |   2 +-
 tests/client/dataset_client_test.py           |   1 +
 tests/client/query_client_test.py             |   6 +-
 tests/dataset_test.py                         |  18 ++-
 tests/html_repr/html_repr_test.py             |   2 +-
 tests/model_test.py                           |   1 -
 .../templates/dataset_fields.py.jinja         |   4 +-
 11 files changed, 89 insertions(+), 77 deletions(-)

diff --git a/src/scitacean/_dataset_fields.py b/src/scitacean/_dataset_fields.py
index 40fdf74d..6b994aa4 100644
--- a/src/scitacean/_dataset_fields.py
+++ b/src/scitacean/_dataset_fields.py
@@ -99,7 +99,7 @@ def used_by(self, dataset_type: DatasetType) -> bool:
     ),
     Field(
         name="access_groups",
-        description="List of groups which have access to this item.",
+        description="Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. The special group 'public' makes data available to all users.",
         read_only=False,
         required=False,
         scicat_name="accessGroups",
@@ -199,7 +199,7 @@ def used_by(self, dataset_type: DatasetType) -> bool:
     ),
     Field(
         name="data_quality_metrics",
-        description="Data Quality Metrics is a number given by the user to rate the dataset.",
+        description="Data Quality Metrics given by the user to rate the dataset.",
         read_only=False,
         required=False,
         scicat_name="dataQualityMetrics",
@@ -239,7 +239,7 @@ def used_by(self, dataset_type: DatasetType) -> bool:
     ),
     Field(
         name="instrument_group",
-        description="Group of the instrument which this item was acquired on.",
+        description="Optional additional groups which have read and write access to the data. Users which are members in one of the groups listed here are allowed to access this data.",
         read_only=False,
         required=False,
         scicat_name="instrumentGroup",
@@ -248,12 +248,12 @@ def used_by(self, dataset_type: DatasetType) -> bool:
         used_by_raw=True,
     ),
     Field(
-        name="instrument_ids",
+        name="instrument_id",
         description="ID of the instrument where the data was created.",
         read_only=False,
         required=False,
-        scicat_name="instrumentIds",
-        type=list[str],
+        scicat_name="instrumentId",
+        type=str,
         used_by_derived=False,
         used_by_raw=True,
     ),
@@ -379,7 +379,7 @@ def used_by(self, dataset_type: DatasetType) -> bool:
     ),
     Field(
         name="owner_group",
-        description="Name of the group owning this item.",
+        description="Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151",
         read_only=False,
         required=True,
         scicat_name="ownerGroup",
@@ -389,7 +389,7 @@ def used_by(self, dataset_type: DatasetType) -> bool:
     ),
     Field(
         name="pid",
-        description="Persistent identifier of the dataset.",
+        description="Persistent Identifier for datasets derived from UUIDv4 and prepended automatically by site specific PID prefix like 20.500.12345/",
         read_only=True,
         required=False,
         scicat_name="pid",
@@ -408,10 +408,20 @@ def used_by(self, dataset_type: DatasetType) -> bool:
         used_by_raw=True,
     ),
     Field(
-        name="proposal_ids",
+        name="proposal_id",
         description="The ID of the proposal to which the dataset belongs.",
         read_only=False,
         required=False,
+        scicat_name="proposalId",
+        type=str,
+        used_by_derived=True,
+        used_by_raw=True,
+    ),
+    Field(
+        name="proposal_ids",
+        description="The ID of the proposal to which the dataset belongs to and it has been acquired under.",
+        read_only=True,
+        required=False,
         scicat_name="proposalIds",
         type=list[str],
         used_by_derived=True,
         used_by_raw=True,
     ),
@@ -428,22 +438,12 @@ def used_by(self, dataset_type: DatasetType) -> bool:
         used_by_raw=True,
     ),
     Field(
-        name="run_number",
-        description="Run number assigned by the system to the data acquisition for the current dataset.",
-        read_only=False,
-        required=False,
-        scicat_name="runNumber",
-        type=str,
-        used_by_derived=True,
-        used_by_raw=True,
-    ),
-    Field(
-        name="sample_ids",
+        name="sample_id",
         description="ID of the sample used when collecting the data.",
         read_only=False,
         required=False,
-        scicat_name="sampleIds",
-        type=list[str],
+        scicat_name="sampleId",
+        type=str,
         used_by_derived=False,
         used_by_raw=True,
     ),
@@ -565,24 +565,26 @@ class DatasetBase:
         "_end_time",
         "_input_datasets",
         "_instrument_group",
+        "_instrument_id",
         "_instrument_ids",
         "_investigator",
         "_is_published",
         "_job_log_data",
         "_job_parameters",
         "_keywords",
         "_license",
         "_lifecycle",
         "_name",
         "_orcid_of_owner",
         "_owner",
         "_owner_email",
         "_owner_group",
         "_pid",
         "_principal_investigator",
+        "_proposal_id",
         "_proposal_ids",
"_relationships", - "_run_number", + "_sample_id", "_sample_ids", "_shared_with", "_source_folder", @@ -615,7 +617,7 @@ def __init__( end_time: datetime | None = None, input_datasets: list[PID] | None = None, instrument_group: str | None = None, - instrument_ids: list[str] | None = None, + instrument_id: str | None = None, investigator: str | None = None, is_published: bool | None = None, job_log_data: str | None = None, @@ -628,10 +630,9 @@ def __init__( owner_email: str | None = None, owner_group: str | None = None, principal_investigator: str | None = None, - proposal_ids: list[str] | None = None, + proposal_id: str | None = None, relationships: list[Relationship] | None = None, - run_number: str | None = None, - sample_ids: list[str] | None = None, + sample_id: str | None = None, shared_with: list[str] | None = None, source_folder: RemotePath | str | None = None, source_folder_host: str | None = None, @@ -655,7 +656,7 @@ def __init__( self._end_time = end_time self._input_datasets = input_datasets self._instrument_group = instrument_group - self._instrument_ids = instrument_ids + self._instrument_id = instrument_id self._investigator = investigator self._is_published = is_published self._job_log_data = job_log_data @@ -668,10 +669,9 @@ def __init__( self._owner_email = owner_email self._owner_group = owner_group self._principal_investigator = principal_investigator - self._proposal_ids = proposal_ids + self._proposal_id = proposal_id self._relationships = relationships - self._run_number = run_number - self._sample_ids = sample_ids + self._sample_id = sample_id self._shared_with = shared_with self._source_folder = _parse_remote_path(source_folder) self._source_folder_host = source_folder_host @@ -698,12 +698,12 @@ def __init__( @property def access_groups(self) -> list[str] | None: - """List of groups which have access to this item.""" + """Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. The special group 'public' makes data available to all users.""" return self._access_groups @access_groups.setter def access_groups(self, access_groups: list[str] | None) -> None: - """List of groups which have access to this item.""" + """Optional additional groups which have read access to the data. Users which are members in one of the groups listed here are allowed to access this data. The special group 'public' makes data available to all users.""" self._access_groups = access_groups @property @@ -783,12 +783,12 @@ def data_format(self, data_format: str | None) -> None: @property def data_quality_metrics(self) -> int | None: - """Data Quality Metrics is a number given by the user to rate the dataset.""" + """Data Quality Metrics given by the user to rate the dataset.""" return self._data_quality_metrics @data_quality_metrics.setter def data_quality_metrics(self, data_quality_metrics: int | None) -> None: - """Data Quality Metrics is a number given by the user to rate the dataset.""" + """Data Quality Metrics given by the user to rate the dataset.""" self._data_quality_metrics = data_quality_metrics @property @@ -823,23 +823,23 @@ def input_datasets(self, input_datasets: list[PID] | None) -> None: @property def instrument_group(self) -> str | None: - """Group of the instrument which this item was acquired on.""" + """Optional additional groups which have read and write access to the data. 
Users which are members in one of the groups listed here are allowed to access this data.""" return self._instrument_group @instrument_group.setter def instrument_group(self, instrument_group: str | None) -> None: - """Group of the instrument which this item was acquired on.""" + """Optional additional groups which have read and write access to the data. Users which are members in one of the groups listed here are allowed to access this data.""" self._instrument_group = instrument_group @property - def instrument_ids(self) -> list[str] | None: + def instrument_id(self) -> str | None: """ID of the instrument where the data was created.""" - return self._instrument_ids + return self._instrument_id - @instrument_ids.setter - def instrument_ids(self, instrument_ids: list[str] | None) -> None: + @instrument_id.setter + def instrument_id(self, instrument_id: str | None) -> None: """ID of the instrument where the data was created.""" - self._instrument_ids = instrument_ids + self._instrument_id = instrument_id @property def instrument_ids(self) -> list[str] | None: @@ -953,17 +953,17 @@ def owner_email(self, owner_email: str | None) -> None: @property def owner_group(self) -> str | None: - """Name of the group owning this item.""" + """Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151""" return self._owner_group @owner_group.setter def owner_group(self, owner_group: str | None) -> None: - """Name of the group owning this item.""" + """Defines the group which owns the data, and therefore has unrestricted access to this data. Usually a pgroup like p12151""" self._owner_group = owner_group @property def pid(self) -> PID | None: - """Persistent identifier of the dataset.""" + """Persistent Identifier for datasets derived from UUIDv4 and prepended automatically by site specific PID prefix like 20.500.12345/""" return self._pid @property @@ -977,14 +977,14 @@ def principal_investigator(self, principal_investigator: str | None) -> None: self._principal_investigator = principal_investigator @property - def proposal_ids(self) -> list[str] | None: + def proposal_id(self) -> str | None: """The ID of the proposal to which the dataset belongs.""" - return self._proposal_ids + return self._proposal_id - @proposal_ids.setter - def proposal_ids(self, proposal_ids: list[str] | None) -> None: + @proposal_id.setter + def proposal_id(self, proposal_id: str | None) -> None: """The ID of the proposal to which the dataset belongs.""" - self._proposal_ids = proposal_ids + self._proposal_id = proposal_id @property def proposal_ids(self) -> list[str] | None: @@ -1002,24 +1002,14 @@ def relationships(self, relationships: list[Relationship] | None) -> None: self._relationships = relationships @property - def run_number(self) -> str | None: - """Run number assigned by the system to the data acquisition for the current dataset.""" - return self._run_number - - @run_number.setter - def run_number(self, run_number: str | None) -> None: - """Run number assigned by the system to the data acquisition for the current dataset.""" - self._run_number = run_number - - @property - def sample_ids(self) -> list[str] | None: + def sample_id(self) -> str | None: """ID of the sample used when collecting the data.""" - return self._sample_ids + return self._sample_id - @sample_ids.setter - def sample_ids(self, sample_ids: str | None) -> None: + @sample_id.setter + def sample_id(self, sample_id: str | None) -> None: """ID of the sample used when collecting the data.""" - 
self._sample_ids = sample_ids + self._sample_id = sample_id @property def sample_ids(self) -> list[str] | None: @@ -1130,7 +1120,9 @@ def _prepare_fields_from_download( for field in DatasetBase._FIELD_SPEC: if field.read_only: read_only["_" + field.name] = getattr(download_model, field.scicat_name) - else: + elif hasattr( + download_model, field.scicat_name + ): # TODO remove condition in API v4 init_args[field.name] = getattr(download_model, field.scicat_name) init_args["meta"] = download_model.scientificMetadata diff --git a/src/scitacean/model.py b/src/scitacean/model.py index 152b2862..99f66384 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -219,7 +219,6 @@ class UploadRawDataset(BaseModel): creationLocation: str creationTime: datetime inputDatasets: list[PID] - investigator: str | None = None numberOfFilesArchived: NonNegativeInt owner: str ownerGroup: str @@ -227,6 +226,7 @@ class UploadRawDataset(BaseModel): sourceFolder: RemotePath type: DatasetType usedSoftware: list[str] + investigator: str | None = None accessGroups: list[str] | None = None classification: str | None = None comment: str | None = None diff --git a/src/scitacean/testing/backend/seed.py b/src/scitacean/testing/backend/seed.py index eed7ceed..ff8c2bf0 100644 --- a/src/scitacean/testing/backend/seed.py +++ b/src/scitacean/testing/backend/seed.py @@ -48,6 +48,7 @@ ownerEmail="PLACE@HOLD.ER", sourceFolder=RemotePath("/hex/data/123"), type=DatasetType.RAW, + investigator="Ponder Stibbons", principalInvestigator="Ponder Stibbons", creationLocation=SITE, techniques=[UploadTechnique(pid="DM666", name="dark_magic")], @@ -95,6 +96,7 @@ ownerEmail="PLACE@HOLD.ER", sourceFolder=RemotePath("/hex/secret/stuff"), type=DatasetType.RAW, + investigator="Mustrum Ridcully", principalInvestigator="Mustrum Ridcully", creationLocation=SITE, techniques=[UploadTechnique(pid="S", name="shoes")], diff --git a/src/scitacean/testing/client.py b/src/scitacean/testing/client.py index 55e8bddb..ea084650 100644 --- a/src/scitacean/testing/client.py +++ b/src/scitacean/testing/client.py @@ -324,6 +324,10 @@ def _process_dataset( ) if "techniques" in fields: fields["techniques"] = list(map(_process_technique, fields["techniques"])) + # TODO remove in API v4 + for singular in ("proposalId", "sampleId", "instrumentId"): + if singular in fields: + fields[singular + "s"] = [fields[singular]] return model.construct( model.DownloadDataset, _strict_validation=False, diff --git a/tests/client/attachment_client_test.py b/tests/client/attachment_client_test.py index 99f575e7..c627c0d1 100644 --- a/tests/client/attachment_client_test.py +++ b/tests/client/attachment_client_test.py @@ -120,7 +120,7 @@ def test_create_attachment_for_dataset_for_dataset_populates_ids( assert finalized.id is not None assert finalized.datasetId is not None assert finalized.sampleId is None - assert finalized.proposalIds is None + assert finalized.proposalId is None def test_get_attachments_for_dataset(scicat_client): diff --git a/tests/client/dataset_client_test.py b/tests/client/dataset_client_test.py index d8fb4287..d39eea22 100644 --- a/tests/client/dataset_client_test.py +++ b/tests/client/dataset_client_test.py @@ -76,6 +76,7 @@ def test_get_dataset(client): dset = INITIAL_DATASETS["raw"] dblock = INITIAL_ORIG_DATABLOCKS["raw"][0] downloaded = client.get_dataset(dset.pid, strict_validation=True) + print(downloaded.source_folder) assert downloaded.source_folder == dset.sourceFolder assert downloaded.creation_time == dset.creationTime diff --git 
a/tests/client/query_client_test.py b/tests/client/query_client_test.py index 1a478351..4081ea42 100644 --- a/tests/client/query_client_test.py +++ b/tests/client/query_client_test.py @@ -22,7 +22,7 @@ type=DatasetType.RAW, principalInvestigator="investigator 1", creationLocation="UU", - proposalIds=["p0124"], + proposalId="p0124", inputDatasets=[], usedSoftware=["scitacean"], ), @@ -39,7 +39,7 @@ type=DatasetType.RAW, principalInvestigator="investigator 2", creationLocation="UU", - proposalIds=["p0124"], + proposalId="p0124", inputDatasets=[], usedSoftware=[], ), @@ -56,7 +56,7 @@ type=DatasetType.RAW, principalInvestigator="investigator 1", creationLocation="UU", - proposalIds=["p0124"], + proposalId="p0124", inputDatasets=[], usedSoftware=["scitacean"], ), diff --git a/tests/dataset_test.py b/tests/dataset_test.py index 2a37f67c..3804c2a3 100644 --- a/tests/dataset_test.py +++ b/tests/dataset_test.py @@ -24,7 +24,6 @@ def raw_download_model(): creationLocation="UnseenUniversity", creationTime=parse_datetime("1995-08-06T14:14:14Z"), inputDatasets=None, - investigator=None, numberOfFilesArchived=None, owner="pstibbons", ownerGroup="faculty", @@ -93,11 +92,10 @@ def derived_download_model(): creationLocation=None, creationTime=parse_datetime("1995-08-06T14:14:14Z"), inputDatasets=[PID.parse("123.cc/948.f7.2a")], - investigator="Ponder Stibbons", numberOfFilesArchived=None, owner="pstibbons", ownerGroup="faculty", - principalInvestigator=None, + principalInvestigator="Ponder Stibbons", sourceFolder=RemotePath("/uu/hex"), type=DatasetType.DERIVED, usedSoftware=["scitacean"], @@ -173,6 +171,8 @@ def get_model_field(name): dset = Dataset.from_download_models(dataset_download_model, []) for field in dset.fields(): + if field.name in ("instrument_id", "sample_id", "proposal_id", "investigator"): + continue # TODO remove when API v4 is released if field.used_by(dataset_download_model.type): assert getattr(dset, field.name) == get_model_field(field.scicat_name) @@ -180,6 +180,8 @@ def get_model_field(name): def test_from_download_models_does_not_initialize_wrong_fields(dataset_download_model): dset = Dataset.from_download_models(dataset_download_model, []) for field in dset.fields(): + if field.name == "principal_investigator": + continue # TODO remove when API v4 is released if not field.used_by(dataset_download_model.type): assert getattr(dset, field.name) is None @@ -318,6 +320,16 @@ def test_dataset_models_roundtrip(initial): orig_datablock_models=dblock_models, attachment_models=attachment_models, ) + + # TODO remove in API v4 + rebuilt.investigator = initial.investigator + rebuilt.proposal_id = initial.proposal_id + initial._proposal_ids = rebuilt.proposal_ids + rebuilt.sample_id = initial.sample_id + initial._sample_ids = rebuilt.sample_ids + rebuilt.instrument_id = initial.instrument_id + initial._instrument_ids = rebuilt.instrument_ids + assert initial == rebuilt diff --git a/tests/html_repr/html_repr_test.py b/tests/html_repr/html_repr_test.py index 433baa61..da172ffc 100644 --- a/tests/html_repr/html_repr_test.py +++ b/tests/html_repr/html_repr_test.py @@ -13,7 +13,7 @@ def test_dataset_html_repr(): name="My dataset", contact_email="devsci.cat", owner="The People", - instrument_ids=["the-peoples-neutron-gun"], + instrument_id="the-peoples-neutron-gun", used_software=["scitacean"], source_folder=RemotePath("/remote/dir/"), meta={ diff --git a/tests/model_test.py b/tests/model_test.py index ee33206c..24386e2c 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -226,7 +226,6 @@ 
def test_default_masked_fields_are_dropped(): def test_custom_masked_fields_are_dropped(): mod = DownloadDataset( # type: ignore[call-arg] - attachments=[{"id": "abc"}], id="abc", _id="def", _v="123", diff --git a/tools/model-generation/templates/dataset_fields.py.jinja b/tools/model-generation/templates/dataset_fields.py.jinja index 2e8d5a3b..a5eb10d5 100644 --- a/tools/model-generation/templates/dataset_fields.py.jinja +++ b/tools/model-generation/templates/dataset_fields.py.jinja @@ -202,7 +202,9 @@ class DatasetBase: for field in DatasetBase._FIELD_SPEC: if field.read_only: read_only["_"+field.name] = getattr(download_model, field.scicat_name) - else: + elif hasattr( + download_model, field.scicat_name + ): # TODO remove condition in API v4 init_args[field.name] = getattr(download_model, field.scicat_name) init_args["meta"] = download_model.scientificMetadata From 8e538fbb9fb49f20476c47c5d5ccf69a08644deb Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 09:58:24 +0100 Subject: [PATCH 06/13] Make datasetName mandatory --- src/scitacean/_dataset_fields.py | 2 +- src/scitacean/model.py | 4 ++-- tests/client/attachment_client_test.py | 1 + tests/client/datablock_client_test.py | 1 + tests/client/dataset_client_test.py | 1 + tests/dataset_fields_test.py | 5 +++++ 6 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/scitacean/_dataset_fields.py b/src/scitacean/_dataset_fields.py index 6b994aa4..7496ca0a 100644 --- a/src/scitacean/_dataset_fields.py +++ b/src/scitacean/_dataset_fields.py @@ -341,7 +341,7 @@ def used_by(self, dataset_type: DatasetType) -> bool: name="name", description="A name for the dataset, given by the creator to carry some semantic meaning. Useful for display purposes e.g. instead of displaying the pid.", read_only=False, - required=False, + required=True, scicat_name="datasetName", type=str, used_by_derived=True, diff --git a/src/scitacean/model.py b/src/scitacean/model.py index 99f66384..1f51a120 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -176,6 +176,7 @@ class UploadDerivedDataset(BaseModel): sourceFolder: RemotePath type: DatasetType usedSoftware: list[str] + datasetName: str accessGroups: list[str] | None = None classification: str | None = None comment: str | None = None @@ -188,7 +189,6 @@ class UploadDerivedDataset(BaseModel): keywords: list[str] | None = None license: str | None = None scientificMetadata: dict[str, Any] | None = None - datasetName: str | None = None numberOfFiles: NonNegativeInt | None = None orcidOfOwner: str | None = None ownerEmail: str | None = None @@ -226,6 +226,7 @@ class UploadRawDataset(BaseModel): sourceFolder: RemotePath type: DatasetType usedSoftware: list[str] + datasetName: str investigator: str | None = None accessGroups: list[str] | None = None classification: str | None = None @@ -242,7 +243,6 @@ class UploadRawDataset(BaseModel): keywords: list[str] | None = None license: str | None = None scientificMetadata: dict[str, Any] | None = None - datasetName: str | None = None numberOfFiles: NonNegativeInt | None = None orcidOfOwner: str | None = None ownerEmail: str | None = None diff --git a/tests/client/attachment_client_test.py b/tests/client/attachment_client_test.py index c627c0d1..89d90615 100644 --- a/tests/client/attachment_client_test.py +++ b/tests/client/attachment_client_test.py @@ -29,6 +29,7 @@ def scicat_client(client: Client) -> ScicatClient: @pytest.fixture def derived_dataset(scicat_access): return UploadDerivedDataset( + datasetName="Koelsche Lieder", 
contactEmail="black.foess@dom.koelle", creationTime=parse_date("1995-11-11T11:11:11.000Z"), owner="bfoess", diff --git a/tests/client/datablock_client_test.py b/tests/client/datablock_client_test.py index 27f5f07f..bcfd753b 100644 --- a/tests/client/datablock_client_test.py +++ b/tests/client/datablock_client_test.py @@ -26,6 +26,7 @@ def scicat_client(client: Client) -> ScicatClient: @pytest.fixture def derived_dataset(scicat_access): return UploadDerivedDataset( + datasetName="Koelsche Lieder", contactEmail="black.foess@dom.koelle", creationTime=parse_date("1995-11-11T11:11:11.000Z"), owner="bfoess", diff --git a/tests/client/dataset_client_test.py b/tests/client/dataset_client_test.py index d39eea22..77d8195f 100644 --- a/tests/client/dataset_client_test.py +++ b/tests/client/dataset_client_test.py @@ -26,6 +26,7 @@ def scicat_client(client: Client) -> ScicatClient: @pytest.fixture def derived_dataset(scicat_access): return UploadDerivedDataset( + datasetName="Koelsche Lieder", contactEmail="black.foess@dom.koelle", creationTime=parse_date("1995-11-11T11:11:11.000Z"), owner="bfoess", diff --git a/tests/dataset_fields_test.py b/tests/dataset_fields_test.py index 9a9b245b..126fe6ff 100644 --- a/tests/dataset_fields_test.py +++ b/tests/dataset_fields_test.py @@ -335,6 +335,7 @@ def test_fields_read_only__and_type_filter(): def test_make_raw_model(): dset = Dataset( + name="raw-dataset-62", type="raw", contact_email="p.stibbons@uu.am", creation_time="2142-04-02T16:44:56", @@ -349,6 +350,7 @@ def test_make_raw_model(): expected = UploadRawDataset( contactEmail="p.stibbons@uu.am", creationTime=dateutil.parser.parse("2142-04-02T16:44:56"), + datasetName="raw-dataset-62", owner="Ponder Stibbons;Mustrum Ridcully", ownerGroup="faculty", principalInvestigator="my principal investigator", @@ -370,6 +372,7 @@ def test_make_raw_model(): def test_make_derived_model(): dset = Dataset( type="derived", + name="derived-dataset", contact_email="p.stibbons@uu.am;m.ridcully@uu.am", creation_time="2142-04-02T16:44:56", owner="Ponder Stibbons;Mustrum Ridcully", @@ -381,6 +384,7 @@ def test_make_derived_model(): used_software=["scitacean", "magick"], ) expected = UploadDerivedDataset( + datasetName="derived-dataset", contactEmail="p.stibbons@uu.am;m.ridcully@uu.am", creationTime=dateutil.parser.parse("2142-04-02T16:44:56"), owner="Ponder Stibbons;Mustrum Ridcully", @@ -484,6 +488,7 @@ def test_email_validation(field): def test_orcid_validation_valid(good_orcid): dset = Dataset( type="raw", + name="test ORCID", contact_email="jan-lukas.wynen@ess.eu", creation_location="scitacean/tests", creation_time="2142-04-02T16:44:56", From d6cdf36e984da0ef9d8a7b24e96f588d817c9a86 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 10:16:47 +0100 Subject: [PATCH 07/13] Be lenient about plural fields --- src/scitacean/model.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/scitacean/model.py b/src/scitacean/model.py index 1f51a120..9d3465d1 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -101,7 +101,10 @@ from .thumbnail import Thumbnail -class DownloadDataset(BaseModel, masked=("history",)): +# TODO remove extra masks after API v4 +class DownloadDataset( + BaseModel, masked=("history", "proposalId", "sampleId", "instrumentId") +): contactEmail: str | None = None creationLocation: str | None = None creationTime: datetime | None = None @@ -164,6 +167,25 @@ def _validate_emails(cls, value: Any) -> Any: def _validate_orcids(cls, value: Any) -> 
Any: return validate_orcids(value) + # TODO remove after API v4 + @pydantic.field_validator("sampleIds", mode="before") + def _validate_sample_ids(cls, value: Any) -> Any: + if value == [None]: + return [] + return value + + @pydantic.field_validator("proposalIds", mode="before") + def _validate_proposal_ids(cls, value: Any) -> Any: + if value == [None]: + return [] + return value + + @pydantic.field_validator("instrumentIds", mode="before") + def _validate_instrument_ids(cls, value: Any) -> Any: + if value == [None]: + return [] + return value + class UploadDerivedDataset(BaseModel): contactEmail: str From 81dc0c1b0df2642ccf644c0c187c2ca35680acd4 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 10:25:46 +0100 Subject: [PATCH 08/13] Fix more tests --- tests/model_test.py | 22 ++++++++++++++-------- tests/transfer/sftp_test.py | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/model_test.py b/tests/model_test.py index 24386e2c..841f7214 100644 --- a/tests/model_test.py +++ b/tests/model_test.py @@ -101,6 +101,7 @@ def test_derived_dataset_default_values( accessGroups=["access1"], contactEmail="contact@email.com", creationTime=parse_date("2000-01-01T01:01:01.000Z"), + datasetName="Test derived dataset", inputDatasets=[PID(prefix="PID.prefix.a0b1", pid="abcd")], investigator="inv@esti.gator", numberOfFilesArchived=0, @@ -117,8 +118,9 @@ def test_derived_dataset_default_values( assert finalized.accessGroups == ["access1"] assert finalized.contactEmail == "contact@email.com" assert finalized.creationTime == parse_date("2000-01-01T01:01:01.000Z") + assert finalized.datasetName == "Test derived dataset" assert finalized.inputDatasets == [PID(prefix="PID.prefix.a0b1", pid="abcd")] - assert finalized.investigator == "inv@esti.gator" + assert finalized.principalInvestigator == "inv@esti.gator" assert finalized.owner == scicat_access.user.username assert finalized.ownerGroup == scicat_access.user.group assert finalized.sourceFolder == "/source/folder" @@ -128,7 +130,6 @@ def test_derived_dataset_default_values( assert finalized.createdAt # some non-empty str assert finalized.createdBy # some non-empty str assert finalized.classification # some non-empty str - assert finalized.datasetName # some non-empty str assert finalized.isPublished is False assert finalized.keywords == [] assert finalized.numberOfFiles == 0 @@ -140,6 +141,7 @@ def test_derived_dataset_default_values( assert finalized.size == 0 assert finalized.techniques == [] assert finalized.updatedAt # some non-empty str + assert finalized.version == "v3" # Left empty assert finalized.description is None is None @@ -150,7 +152,6 @@ def test_derived_dataset_default_values( assert finalized.ownerEmail is None assert finalized.sourceFolderHost is None assert finalized.validationStatus is None - assert finalized.version is None def test_raw_dataset_default_values(real_client, require_scicat_backend, scicat_access): @@ -159,56 +160,61 @@ def test_raw_dataset_default_values(real_client, require_scicat_backend, scicat_ contactEmail="contact@email.com", creationTime=parse_date("2000-01-01T01:01:01.000Z"), creationLocation="site", + datasetName="Test raw dataset", + inputDatasets=[], numberOfFilesArchived=0, owner=scicat_access.user.username, ownerGroup=scicat_access.user.group, principalInvestigator="inv@esti.gator", sourceFolder=RemotePath("/source/folder"), type=DatasetType.RAW, + usedSoftware=["software1"], ) pid = real_client.scicat.create_dataset_model(dset).pid finalized = 
real_client.scicat.get_dataset_model(pid) # Inputs + assert finalized.datasetName == "Test raw dataset" assert finalized.accessGroups == ["access1"] assert finalized.contactEmail == "contact@email.com" assert finalized.creationLocation == "site" assert finalized.creationTime == parse_date("2000-01-01T01:01:01.000Z") + assert finalized.inputDatasets == [] assert finalized.owner == scicat_access.user.username assert finalized.ownerGroup == scicat_access.user.group assert finalized.principalInvestigator == "inv@esti.gator" assert finalized.sourceFolder == "/source/folder" + assert finalized.usedSoftware == ["software1"] # Default values assert finalized.createdAt # some non-empty str assert finalized.createdBy # some non-empty str assert finalized.classification # some non-empty str - assert finalized.datasetName # some non-empty str + assert finalized.instrumentIds == [] assert finalized.isPublished is False assert finalized.keywords == [] assert finalized.numberOfFiles == 0 assert finalized.numberOfFilesArchived == 0 assert finalized.packedSize == 0 assert finalized.pid # some non-empty str + assert finalized.proposalIds == [] + assert finalized.sampleIds == [] assert finalized.scientificMetadata == {} assert finalized.sharedWith == [] assert finalized.size == 0 assert finalized.techniques == [] assert finalized.updatedAt # some non-empty str + assert finalized.version == "v3" # Left empty assert finalized.dataFormat is None assert finalized.description is None assert finalized.endTime is None - assert finalized.instrumentIds is None assert finalized.license is None assert finalized.orcidOfOwner is None assert finalized.ownerEmail is None - assert finalized.proposalIds is None - assert finalized.sampleIds is None assert finalized.sourceFolderHost is None assert finalized.validationStatus is None - assert finalized.version is None def test_default_masked_fields_are_dropped(): diff --git a/tests/transfer/sftp_test.py b/tests/transfer/sftp_test.py index 8c461798..d6676630 100644 --- a/tests/transfer/sftp_test.py +++ b/tests/transfer/sftp_test.py @@ -402,6 +402,7 @@ def test_client_with_sftp( contact_email="p.stibbons@uu.am", creation_location="UU", creation_time=datetime(2023, 6, 23, 10, 0, 0, tzinfo=timezone.utc), + name="Secret Thaum Storage", owner="PonderStibbons", owner_group="uu", principal_investigator="MustrumRidcully", From 470afce5b3286d5093eb6912b4b4ffad9d4f7660 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 11:50:45 +0100 Subject: [PATCH 09/13] Remove print --- tests/client/dataset_client_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/client/dataset_client_test.py b/tests/client/dataset_client_test.py index 77d8195f..e882865c 100644 --- a/tests/client/dataset_client_test.py +++ b/tests/client/dataset_client_test.py @@ -77,7 +77,6 @@ def test_get_dataset(client): dset = INITIAL_DATASETS["raw"] dblock = INITIAL_ORIG_DATABLOCKS["raw"][0] downloaded = client.get_dataset(dset.pid, strict_validation=True) - print(downloaded.source_folder) assert downloaded.source_folder == dset.sourceFolder assert downloaded.creation_time == dset.creationTime From e381c530fb0b1c0957e180ccfa80b1d55bb6a8d5 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 14:55:30 +0100 Subject: [PATCH 10/13] Fix query --- tests/client/query_client_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/client/query_client_test.py b/tests/client/query_client_test.py index 4081ea42..2ab5db4f 100644 --- 
a/tests/client/query_client_test.py +++ b/tests/client/query_client_test.py @@ -151,17 +151,19 @@ def test_query_dataset_multiple_by_multiple_fields(real_client): @pytest.mark.usefixtures("_seed_database") def test_query_dataset_multiple_by_derived_field(real_client): datasets = real_client.scicat.query_datasets( - {"investigator": "investigator 1"}, + {"principalInvestigator": "investigator 1"} ) actual = {ds.pid: ds for ds in datasets} - expected = {SEED[key].pid: SEED[key] for key in ("derived1", "derived2")} + expected = { + SEED[key].pid: SEED[key] for key in ("derived1", "derived2", "raw1", "raw3") + } assert actual == expected @pytest.mark.usefixtures("_seed_database") def test_query_dataset_uses_conjunction_of_fields(real_client): datasets = real_client.scicat.query_datasets( - {"proposalIds": ["p0124"], "investigator": "investigator X"}, + {"proposalIds": ["p0124"], "principalInvestigator": "investigator X"}, ) assert not datasets From 5419c9367ca8fe8786e7720e1c2c020a2bc15773 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 14:59:26 +0100 Subject: [PATCH 11/13] Appease mypy --- src/scitacean/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scitacean/model.py b/src/scitacean/model.py index 9d3465d1..43f9715b 100644 --- a/src/scitacean/model.py +++ b/src/scitacean/model.py @@ -281,7 +281,7 @@ class UploadRawDataset(BaseModel): @pydantic.model_validator(mode="before") @classmethod - def _set_investigator(cls, data): + def _set_investigator(cls, data: Any) -> Any: # The model currently has both `investigator` and `principalInvestigator` # and both are mandatory. Eventually, `investigator` will be removed. # So make sure we can construct the model if only one is given. From aab22d3b53bd0c29c069c420ef94a1a826b4a1fd Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 15:49:14 +0100 Subject: [PATCH 12/13] Remove investigator in fake upload --- src/scitacean/testing/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/scitacean/testing/client.py b/src/scitacean/testing/client.py index ea084650..97ca5ce6 100644 --- a/src/scitacean/testing/client.py +++ b/src/scitacean/testing/client.py @@ -324,10 +324,13 @@ def _process_dataset( ) if "techniques" in fields: fields["techniques"] = list(map(_process_technique, fields["techniques"])) + # TODO remove in API v4 for singular in ("proposalId", "sampleId", "instrumentId"): if singular in fields: fields[singular + "s"] = [fields[singular]] + fields.pop("investigator") + return model.construct( model.DownloadDataset, _strict_validation=False, From 00aa09d7990333d55c9024d3a0f39c9be06ccc21 Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 17 Dec 2024 15:50:31 +0100 Subject: [PATCH 13/13] Add missing dataset name in docs --- docs/user-guide/testing.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/user-guide/testing.ipynb b/docs/user-guide/testing.ipynb index 278d019b..79e1e273 100644 --- a/docs/user-guide/testing.ipynb +++ b/docs/user-guide/testing.ipynb @@ -36,6 +36,7 @@ "\n", "dataset = Dataset(\n", " type=\"raw\",\n", + " name=\"Important data\",\n", " owner_group=\"faculty\",\n", " owner=\"ridcully\",\n", " principal_investigator=\"Ridcully\",\n",
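
--
Usage sketch (illustrative only, not part of the patch series): the net effect of patches 04 and 06 for code that builds upload models directly is that datasetName must now be given, while only one of the two investigator fields is needed because the _set_investigator validator mirrors it into the other. The field values below are placeholders taken from the tests above; the import paths are assumptions based on how those tests import these names.

    from datetime import datetime, timezone

    from scitacean import RemotePath
    from scitacean.model import DatasetType, UploadRawDataset

    dset = UploadRawDataset(
        contactEmail="contact@email.com",
        creationLocation="site",
        creationTime=datetime(2000, 1, 1, 1, 1, 1, tzinfo=timezone.utc),
        datasetName="Test raw dataset",  # mandatory as of PATCH 06
        inputDatasets=[],
        numberOfFilesArchived=0,
        owner="owner",
        ownerGroup="group",
        principalInvestigator="inv@esti.gator",
        sourceFolder=RemotePath("/source/folder"),
        type=DatasetType.RAW,
        usedSoftware=["software1"],
    )
    # _set_investigator (PATCH 04) copies principalInvestigator into the
    # legacy investigator field when only one of the two is provided:
    assert dset.investigator == dset.principalInvestigator

Queries change accordingly (patches 03 and 10): filters use the plural key with a list value, e.g. real_client.scicat.query_datasets({"proposalIds": ["p0124"]}, limit=2, order="creationTime:asc"), and "principalInvestigator" replaces the old "investigator" key.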