From b6d75650dd52356ad29c48ab498b1894cf85298a Mon Sep 17 00:00:00 2001 From: Steve Wirt Date: Wed, 20 Nov 2024 17:19:05 -0500 Subject: [PATCH 1/4] DKAN-4287 Make harvest_run id not a primary key. --- modules/harvest/harvest.install | 160 +++++++++++++++++- .../harvest/src/Commands/HarvestCommands.php | 5 - modules/harvest/src/Entity/HarvestRun.php | 29 +++- .../src/Entity/HarvestRunRepository.php | 91 +++++++--- .../harvest/src/HarvestPlanListBuilder.php | 4 +- modules/harvest/src/HarvestService.php | 42 ++--- modules/harvest/src/HarvestUtility.php | 6 +- .../harvest/src/OrphanDatasetsProcessor.php | 2 +- .../Entity/HarvestRunRepositoryTest.php | 39 ++--- .../src/Kernel/HarvestPlanListBuilderTest.php | 4 +- .../tests/src/Kernel/HarvestServiceTest.php | 7 +- 11 files changed, 292 insertions(+), 97 deletions(-) diff --git a/modules/harvest/harvest.install b/modules/harvest/harvest.install index 64c25a4134..42defc8699 100644 --- a/modules/harvest/harvest.install +++ b/modules/harvest/harvest.install @@ -1,5 +1,10 @@ select($table_name_temp, 'hrt') + ->fields('hrt', ['id']); + $query->orderBy('id', 'ASC'); + $result = $query->execute()->fetchCol(0); + // Can't do orderBy as the sort end up natural, not numeric. + asort($result, SORT_NUMERIC); + + return $result ?? []; +} + +/** + * Reads a single harvest row from the temp table. + * + * @param string $table_name_temp + * Name of the table to read from. + * + * @param string $time_id + * The id to read from, which was also the timestamp. + * + * @return array + * Elements from the row ['id', 'harvest_plan_id', 'data', 'extract_status']. + */ +function harvest_read_harvest_run(string $table_name_temp, string $time_id): array { + $connection = Database::getConnection(); + $query = $connection->select($table_name_temp, 'hrt') + ->fields('hrt', ['id', 'harvest_plan_id', 'data', 'extract_status']) + ->condition('id', $time_id, '=') + ->orderBy('id', 'ASC'); + $result = $query->execute()->fetchAll(PDO::FETCH_ASSOC); + return reset($result); +} + +function harvest_write_harvest_run(string $id, string $harvest_plan_id, string $data, string $extract_status) { + /** @var \Drupal\Core\Database\Connection $connection */ + $connection = \Drupal::service('database'); + $result = $connection->insert('harvest_runs') + ->fields([ + 'timestamp' => (int) $id, + 'harvest_plan_id' => $harvest_plan_id, + 'data' => $data, + 'extract_status' => $extract_status, + ]) + ->execute(); +} + /** * Uninstall obsolete submodule harvest_dashboard. */ @@ -95,8 +157,100 @@ function harvest_update_8007(&$sandbox) { * This finishes the process started by harvest_update_8007. */ function harvest_update_8008(&$sandbox) { + // Moved and repeated to 8010. +} + +/** + * Update harvest_run schema to add timestamp, uuid, and true id. + * + * @see https://github.com/GetDKAN/dkan/issues/4287 + */ +function harvest_update_8009(&$sandbox) { + $table_name = 'harvest_runs'; + $table_name_temp = "{$table_name}_temp"; + $entity_type_name = 'harvest_run'; + + $definition_update_manager = \Drupal::entityDefinitionUpdateManager(); + $entity_type_manager = \Drupal::entityTypeManager(); + $schema = \Drupal::database()->schema(); + + // Move the table so we can rebuild from it. + $schema->renameTable($table_name, $table_name_temp); + $messages = "Table {$table_name} moved to {$table_name_temp}. " . PHP_EOL; + // Uninstall the the original entity. + $original_type = $definition_update_manager->getEntityType($entity_type_name); + $definition_update_manager->uninstallEntityType($original_type); + $messages .= "Old harvest_run entity removed. " . PHP_EOL; + $entity_type_manager->clearCachedDefinitions(); + // Install the new entity. + //$entity_type = $entity_type_manager->get($entity_type_name); + $entity_type_manager->clearCachedDefinitions(); + $entity_type_def = $entity_type_manager->getDefinition($entity_type_name); + $definition_update_manager->installEntityType($entity_type_def); + $messages .= "New harvest_run entity installed. " . PHP_EOL; + + return $messages; +} + +/** + * Move data from temp table back into harvest_run. + * + * @see https://github.com/GetDKAN/dkan/issues/4287 + */ +function harvest_update_8010(&$sandbox) { + $table_name = 'harvest_runs'; + $table_name_temp = "{$table_name}_temp"; + $messages = ''; + $schema = \Drupal::database()->schema(); + + if (!isset($sandbox['total'])) { + // Sandbox has not been initiated, so initiate it. + $sandbox['items_to_process'] = harvest_get_temp_run_ids($table_name_temp); + $sandbox['total'] = count($sandbox['items_to_process']); + $sandbox['current'] = 0; + } + // Process them in batches of 25. + $harvest_runs_batch = array_slice($sandbox['items_to_process'], 0, 25, TRUE); + // Loop through all the entries in temp table and save them new. + foreach ($harvest_runs_batch as $key => $time_id) { + // Load the old row. + $row = harvest_read_harvest_run($table_name_temp, $time_id); + // Write the new harvest run. + harvest_write_harvest_run($row['id'], $row['harvest_plan_id'], $row['data'], $row['extract_status']); + // The item has been processed, remove it from the array. + unset($sandbox['items_to_process'][$key]); + } + + // Determine when to stop batching. + $sandbox['current'] = ($sandbox['total'] - count($sandbox['items_to_process'])); + $sandbox['#finished'] = (empty($sandbox['total'])) ? 1 : ($sandbox['current'] / $sandbox['total']); + $vars = [ + '@completed' => $sandbox['current'], + '@total' => $sandbox['total'], + ]; + + $messages = t('Processed: @completed/@total.', $vars) . PHP_EOL; + // Log the all finished notice. + if ($sandbox['#finished'] === 1) { + // The update of the harvest_runs is complete. + $messages .= t('Data in harvest_runs updated to new schema:') . PHP_EOL; + $dropped = $schema->dropTable($table_name_temp); + if ($dropped) { + $messages .= t('Temporary table dropped.') . PHP_EOL; + } + } + + return $messages; +} + +/** + * Move entries from harvest_[ID]_runs to harvest_runs. + * + * This finishes the process started by harvest_update_8007 and re-runs 8008. +*/ +function harvest_update_8011(&$sandbox) { /** @var \Drupal\harvest\HarvestUtility $harvest_utility */ - $harvest_utility = \Drupal::service('dkan.harvest.utility'); - $harvest_utility->harvestRunsUpdate(); - return 'Harvest runs coalesced into table harvest_runs.'; + $harvest_utility = \Drupal::service('dkan.harvest.utility'); + $harvest_utility->harvestRunsUpdate(); + return 'Harvest plan specific run tables coalesced into table harvest_runs.'; } diff --git a/modules/harvest/src/Commands/HarvestCommands.php b/modules/harvest/src/Commands/HarvestCommands.php index 9308c98ee8..f28ed9d89b 100644 --- a/modules/harvest/src/Commands/HarvestCommands.php +++ b/modules/harvest/src/Commands/HarvestCommands.php @@ -221,11 +221,6 @@ public function runAll($options = ['new' => FALSE]) { foreach ($plan_ids as $plan_id) { $result = $this->harvestService->runHarvest($plan_id); $runs_info[] = $result; - // Since run IDs are also one-second-resolution timestamps, we must wait - // one second before running the next harvest. - // @todo Remove this sleep when we've switched to a better system for - // timestamps. - sleep(1); } $this->renderHarvestRunsInfo($runs_info); } diff --git a/modules/harvest/src/Entity/HarvestRun.php b/modules/harvest/src/Entity/HarvestRun.php index 66de79c20d..42c5f3f96d 100644 --- a/modules/harvest/src/Entity/HarvestRun.php +++ b/modules/harvest/src/Entity/HarvestRun.php @@ -46,15 +46,12 @@ * admin_permission = "administer harvest_run", * entity_keys = { * "id" = "id", - * "label" = "id", + * "label" = "ID", * }, * links = { * "canonical" = "/harvest-run/{harvest_run}", * }, * ) - * - * @todo Convert to using microtime() or other better system for the timestamp/ - * id. */ final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface { @@ -64,11 +61,25 @@ final class HarvestRun extends HarvestEntityBase implements HarvestRunInterface public static function baseFieldDefinitions(EntityTypeInterface $entity_type) { $base_fields = parent::baseFieldDefinitions($entity_type); - // The id is the unique ID for the harvest run, and also the timestamp at - // which the run occurred, generated by time(). - $base_fields['id'] = static::getBaseFieldIdentifier( - new TranslatableMarkup('Harvest Run') - ); + $base_fields['id'] = BaseFieldDefinition::create('integer') + ->setLabel(t('ID')) + ->setDescription(t('The ID of the Harvest Run entity.')) + ->setDisplayOptions('view', [ + 'label' => 'inline', + 'weight' => 0, + ]) + ->setReadOnly(TRUE); + + $base_fields['timestamp'] = BaseFieldDefinition::create('timestamp') + ->setLabel(t('timestamp')) + ->setDescription(t('The timestamp of when this harvest was run.')) + ->setRequired(TRUE); + + $base_fields['uuid'] = BaseFieldDefinition::create('uuid') + ->setLabel(t('UUID')) + ->setDescription(t('The unique identifier for this harvest_run')) + ->setRequired(TRUE) + ->setReadOnly(TRUE); // Harvest plan id. This is the name of the harvest plan as seen in the UI. $base_fields['harvest_plan_id'] = BaseFieldDefinition::create('string') diff --git a/modules/harvest/src/Entity/HarvestRunRepository.php b/modules/harvest/src/Entity/HarvestRunRepository.php index 04af86439b..d5765c2776 100644 --- a/modules/harvest/src/Entity/HarvestRunRepository.php +++ b/modules/harvest/src/Entity/HarvestRunRepository.php @@ -24,7 +24,7 @@ class HarvestRunRepository { * * @var \Drupal\Core\Entity\EntityStorageInterface */ - private EntityStorageInterface $runStorage; + public EntityStorageInterface $runStorage; /** * Database connection service. @@ -89,8 +89,8 @@ public function destructForPlanId(string $plan_id) { * Run data. Usually the result returned by Harvester::harvest(). * @param string $plan_id * The plan identifier. - * @param string $run_id - * The run identifier, which is also a timestamp. + * @param string $timestamp + * The run timestamp. * * @return string * The run identifier. @@ -100,9 +100,9 @@ public function destructForPlanId(string $plan_id) { * @todo Eventually all the subsystems will be able to understand the entity * rather than needing conversion to and from the array format. */ - public function storeRun(array $run_data, string $plan_id, string $run_id): string { + public function storeRun(array $run_data, string $plan_id, string $timestamp): string { $field_values = [ - 'id' => $run_id, + 'timestamp' => (int) $timestamp, 'harvest_plan_id' => $plan_id, ]; $field_values['extract_status'] = $run_data['status']['extract'] ?? 'FAILURE'; @@ -139,7 +139,7 @@ public function storeRun(array $run_data, string $plan_id, string $run_id): stri // JSON encode remaining run data. $field_values['data'] = json_encode($run_data); - return $this->writeEntity($field_values, $plan_id, $run_id); + return $this->writeEntity($field_values, $plan_id, $timestamp); } /** @@ -147,15 +147,18 @@ public function storeRun(array $run_data, string $plan_id, string $run_id): stri * * @param string $plan_id * The harvest plan identifier. - * @param string $run_id - * The harvest run identifier. + * @param string|null $timestamp + * The harvest run timestamp. No longer used. Retained for BC. * * @return string|null * JSON-encoded run result data, or NULL if none could be found. */ - public function retrieveRunJson(string $plan_id, string $run_id): ?string { - if ($entity = $this->loadEntity($plan_id, $run_id)) { - return json_encode($entity->toResult()); + public function retrieveRunJson(string $plan_id, $timestamp = NULL): ?string { + $run_ids = $this->retrieveAllRunIds($plan_id); + $run_id = reset($run_ids); + if ($run_id) { + $run_entity = $this->runStorage->load($run_id); + return json_encode($run_entity->toResult()); } return NULL; } @@ -198,7 +201,7 @@ public function retrieveAllRunsJson(string $plan_id): array { } /** - * Get all the harvet plan ids available in the harvest runs table. + * Get all the harvest plan ids available in the harvest runs table. * * @return array * All the harvest plan ids present in the harvest runs table, as both key @@ -218,18 +221,18 @@ public function getUniqueHarvestPlanIds(): array { /** * Get the extracted UUIDs from the given harvest run. * - * @param string $plan_id - * The harvest plan ID. - * @param string $run_id - * The harvest run ID. + * @param string $planId + * The harvest plan ID. deprecated: no longer needed but kept for BC. + * @param string $runId + * The harvest_run entity id. * * @return string[] * Array of UUIDs, keyed by UUID. Note that these are UUIDs by convention; * they could be any string value. */ - public function getExtractedUuids(string $plan_id, string $run_id): array { + public function getExtractedUuids(string $planId, string $runId): array { $extracted = []; - if ($entity = $this->loadEntity($plan_id, $run_id)) { + if ($entity = $this->runStorage->load($runId)) { foreach ($entity->get('extracted_uuid')->getValue() as $field) { $uuid = $field['value']; $extracted[$uuid] = $uuid; @@ -243,16 +246,17 @@ public function getExtractedUuids(string $plan_id, string $run_id): array { * * @param string $plan_id * Plan ID. - * @param string $run_id - * Run ID, which is a timestamp. + * @param string $timestamp + * The timestamp for the run. Formerly the id. * * @return \Drupal\harvest\HarvestRunInterface|\Drupal\Core\Entity\EntityInterface|null * The loaded entity or NULL if none could be loaded. */ - public function loadEntity(string $plan_id, string $run_id): ?HarvestRunInterface { + public function loadEntity(string $plan_id, string $timestamp): ?HarvestRunInterface { if ($ids = $this->runStorage->getQuery() - ->condition('id', $run_id) + ->condition('timestamp', $timestamp) ->condition('harvest_plan_id', $plan_id) + ->sort('id', 'DESC') ->range(0, 1) ->accessCheck(FALSE) ->execute() @@ -262,6 +266,36 @@ public function loadEntity(string $plan_id, string $run_id): ?HarvestRunInterfac return NULL; } + /** + * Helper method to load the most recent harvest_run entity given a plan ID. + * + * @param string $harvest_plan_id + * Plan ID. + * + * @return \Drupal\harvest\HarvestRunInterface|null + * The loaded harvest_run entity or NULL if none could be loaded. + */ + public function loadRunByPlan($harvest_plan_id): ?HarvestRunInterface { + $run_id = $this->getLastHarvestRunId($harvest_plan_id); + return ($run_id) ? $this->runStorage->load($run_id) : NULL; + } + + /** + * Get a harvest's most recent run identifier. + * + * Since the run record id is a timestamp, we can sort on the id. + * + * @param string $plan_id + * The harvest plan identifier. + * + * @return string + * The entity id of the most recent harvest run. + */ + public function getLastHarvestRunId(string $plan_id): string { + $run_ids = $this->retrieveAllRunIds($plan_id); + return reset($run_ids); + } + /** * Write a harvest_run entity, updating or saving as needed. * @@ -269,21 +303,22 @@ public function loadEntity(string $plan_id, string $run_id): ?HarvestRunInterfac * Structured data ready to send to entity_storage->create(). * @param string $plan_id * Harvest plan identifier. - * @param string $run_id - * Harvest run identifier. + * @param mixed $timestamp + * Harvest run timestamp. * * @return string - * Harvest plan identifier for the entity that was written. + * Harvest run id. */ - public function writeEntity(array $field_values, string $plan_id, string $run_id) { + public function writeEntity(array $field_values, string $plan_id, mixed $timestamp) { + $timestamp = (int) $timestamp; /** @var \Drupal\harvest\HarvestRunInterface $entity */ - $entity = $this->loadEntity($plan_id, $run_id); + $entity = $this->loadEntity($plan_id, $timestamp); if ($entity) { // Modify entity. - unset($field_values['id']); foreach ($field_values as $key => $value) { $entity->set($key, $value); } + $field_values['id'] = $entity->id(); } else { // Create new entity. diff --git a/modules/harvest/src/HarvestPlanListBuilder.php b/modules/harvest/src/HarvestPlanListBuilder.php index 7b457a5e13..b3d6a8c44b 100644 --- a/modules/harvest/src/HarvestPlanListBuilder.php +++ b/modules/harvest/src/HarvestPlanListBuilder.php @@ -74,9 +74,9 @@ public function buildRow(EntityInterface $entity) { $harvest_plan_id = $entity->get('id')->getString(); $run_entity = NULL; - if ($run_id = $this->harvestService->getLastHarvestRunId($harvest_plan_id)) { + if ($run_id = $this->harvestService->runRepository->getLastHarvestRunId($harvest_plan_id)) { // There is a run identifier, so we should get that info. - $run_entity = $this->harvestRunRepository->loadEntity($harvest_plan_id, $run_id); + $run_entity = $this->harvestRunRepository->runStorage->load($run_id); } // Default values for a row if there's no info. diff --git a/modules/harvest/src/HarvestService.php b/modules/harvest/src/HarvestService.php index 6d3ba1ac19..04d93312d4 100644 --- a/modules/harvest/src/HarvestService.php +++ b/modules/harvest/src/HarvestService.php @@ -58,7 +58,7 @@ class HarvestService implements ContainerInjectionInterface { * * @var \Drupal\harvest\Entity\HarvestRunRepository */ - private HarvestRunRepository $runRepository; + public HarvestRunRepository $runRepository; /** * DKAN logger channel. @@ -208,7 +208,7 @@ public function revertHarvest($id) { public function runHarvest($plan_id) { $harvester = $this->getHarvester($plan_id); - $run_id = (string) time(); + $timestamp = (string) time(); $result = $harvester->harvest(); if (empty($result['status']['extracted_items_ids'])) { @@ -218,8 +218,8 @@ public function runHarvest($plan_id) { $this->getOrphanIdsFromResult($plan_id, $result['status']['extracted_items_ids']); $this->processOrphanIds($result['status']['orphan_ids']); - $result['identifier'] = $run_id; - $this->runRepository->storeRun($result, $plan_id, $run_id); + $result['identifier'] = $timestamp; + $this->runRepository->storeRun($result, $plan_id, $timestamp); return $result; } @@ -229,15 +229,15 @@ public function runHarvest($plan_id) { * * @param string $plan_id * The harvest plan ID. - * @param string $run_id - * The harvest run ID. + * @param string $timestamp + * The timestamp of the harvest_run. * * @return bool|string * JSON-encoded run information for the given run, or FALSE if no matching * runID is found. */ - public function getHarvestRunInfo(string $plan_id, string $run_id): bool|string { - if ($info = $this->runRepository->retrieveRunJson($plan_id, $run_id)) { + public function getHarvestRunInfo(string $plan_id, string $timestamp): bool|string { + if ($info = $this->runRepository->retrieveRunJson($plan_id, $timestamp)) { return $info; } return FALSE; @@ -248,14 +248,16 @@ public function getHarvestRunInfo(string $plan_id, string $run_id): bool|string * * @param string $plan_id * Harvest plan ID. - * @param string $run_id - * Harvest run ID. + * @param string $timestamp + * Harvest run timestamp. * * @return array * Array of status info from the run. */ - public function getHarvestRunResult(string $plan_id, string $run_id): array { - if ($entity = $this->runRepository->loadEntity($plan_id, $run_id)) { + public function getHarvestRunResult(string $plan_id, string $timestamp): array { + // This one has to keep using the loadEntity method as it may be looking up + // more than the most recent run. + if ($entity = $this->runRepository->loadEntity($plan_id, $timestamp)) { return $entity->toResult(); } else { @@ -296,20 +298,18 @@ public function getRunIdsForHarvest(string $plan_id): array { } /** - * Get a harvest's most recent run identifier. - * - * Since the run record id is a timestamp, we can sort on the id. + * Get a harvest's most recent run id. Passthrough for HarvestRunRepository. * * @param string $plan_id - * The harvest identifier. + * The harvest plan identifier. * * @return string - * The most recent harvest run record identifier. + * The entity id of the most recent harvest run. + * + * @deprecated in dkan:2.19.11 and is removed from dkan:3.0.0 Use runStorage::load(). */ public function getLastHarvestRunId(string $plan_id): string { - $run_ids = $this->runRepository->retrieveAllRunIds($plan_id); - rsort($run_ids); - return reset($run_ids); + return $this->runRepository->getLastHarvestRunId($plan_id); } /** @@ -354,7 +354,7 @@ protected function bulkUpdateStatus(string $harvestId, string $method): array { throw new \OutOfRangeException("Method {$method} does not exist"); } - $lastRunId = $this->getLastHarvestRunId($harvestId); + $lastRunId = $this->runRepository->getLastHarvestRunId($harvestId); $lastRunInfo = json_decode($this->getHarvestRunInfo($harvestId, $lastRunId)); $status = $lastRunInfo->status ?? NULL; if (!isset($status->extracted_items_ids)) { diff --git a/modules/harvest/src/HarvestUtility.php b/modules/harvest/src/HarvestUtility.php index 54cee32996..5a8e03b5cb 100644 --- a/modules/harvest/src/HarvestUtility.php +++ b/modules/harvest/src/HarvestUtility.php @@ -217,10 +217,10 @@ public function harvestHashUpdate() { */ public function convertRunTable(string $plan_id) { $old_runs_table = $this->storeFactory->getInstance('harvest_' . $plan_id . '_runs'); - foreach ($old_runs_table->retrieveAll() as $id) { - if ($data = $old_runs_table->retrieve($id)) { + foreach ($old_runs_table->retrieveAll() as $timestamp) { + if ($data = $old_runs_table->retrieve($timestamp)) { // Explicitly decode the data as an array. - $this->runRepository->storeRun(json_decode($data, TRUE), $plan_id, $id); + $this->runRepository->storeRun(json_decode($data, TRUE), $plan_id, $timestamp); } } } diff --git a/modules/harvest/src/OrphanDatasetsProcessor.php b/modules/harvest/src/OrphanDatasetsProcessor.php index 48d5f35bfd..a688df20de 100644 --- a/modules/harvest/src/OrphanDatasetsProcessor.php +++ b/modules/harvest/src/OrphanDatasetsProcessor.php @@ -32,7 +32,7 @@ public function setEntityTypeManager(EntityTypeManagerInterface $entityTypeManag * Get the dataset identifiers orphaned by the harvest currently in progress. */ private function getOrphanIdsFromResult(string $harvestId, array $extractedIds) : array { - if ($lastRunId = $this->getLastHarvestRunId($harvestId)) { + if ($lastRunId = $this->runRepository->getLastHarvestRunId($harvestId)) { $previouslyExtractedIds = $this->getExtractedIds($harvestId, $lastRunId); return array_values(array_diff($previouslyExtractedIds, $extractedIds)); } diff --git a/modules/harvest/tests/src/Kernel/Entity/HarvestRunRepositoryTest.php b/modules/harvest/tests/src/Kernel/Entity/HarvestRunRepositoryTest.php index e941cdbded..7fd20087ae 100644 --- a/modules/harvest/tests/src/Kernel/Entity/HarvestRunRepositoryTest.php +++ b/modules/harvest/tests/src/Kernel/Entity/HarvestRunRepositoryTest.php @@ -32,8 +32,8 @@ protected function setUp() : void { * @covers ::destructForPlanId */ public function testDestructForPlanId() { - $destruct_id = '1711038292'; - $keep_id = '1711038293'; + $destruct_timestamp = '1711038292'; + $keep_timestamp = '1711038293'; $destruct_this_plan_id = 'DESTRUCTTHISPLAN'; $keep_this_plan_id = 'KEEPTHISPLAN'; $run_data = [ @@ -44,10 +44,10 @@ public function testDestructForPlanId() { ]; /** @var \Drupal\harvest\Entity\HarvestRunRepository $harvest_run_repo */ $harvest_run_repo = $this->container->get('dkan.harvest.storage.harvest_run_repository'); - $run_data['identifier'] = $keep_id; - $harvest_run_repo->storeRun($run_data, $keep_this_plan_id, $keep_id); - $run_data['identifier'] = $destruct_id; - $harvest_run_repo->storeRun($run_data, $destruct_this_plan_id, $destruct_id); + $run_data['identifier'] = $keep_timestamp; + $harvest_run_repo->storeRun($run_data, $keep_this_plan_id, $keep_timestamp); + $run_data['identifier'] = $destruct_timestamp; + $harvest_run_repo->storeRun($run_data, $destruct_this_plan_id, $destruct_timestamp); $harvest_run_repo->destructForPlanId($destruct_this_plan_id); @@ -56,9 +56,9 @@ public function testDestructForPlanId() { /** @var \Drupal\harvest\HarvestRunInterface $run */ $this->assertInstanceOf( HarvestRunInterface::class, - $run = $harvest_run_repo->loadEntity($keep_this_plan_id, $keep_id) + $run = $harvest_run_repo->loadEntity($keep_this_plan_id, $keep_timestamp) ); - $this->assertEquals($keep_id, $run->id()); + $this->assertEquals($keep_timestamp, $run->get('timestamp')->value); } /** @@ -75,23 +75,23 @@ public function testRetrieveJson() { // Set up an entity. $plan_id = 'plan'; - $run_id = '1711038293'; + $run_timestamp = '1711038293'; $run_data = [ 'plan' => '{"plan": "json"}', 'status' => [ 'extract' => 'AWESOME', ], - 'identifier' => $run_id, + 'identifier' => $run_timestamp, ]; - $harvest_run_repo->storeRun($run_data, $plan_id, $run_id); + $run_id = $harvest_run_repo->storeRun($run_data, $plan_id, $run_timestamp); // Retrieve one run as JSON. $this->assertIsString( - $run_json = $harvest_run_repo->retrieveRunJson($plan_id, $run_id) + $run_json = $harvest_run_repo->retrieveRunJson($plan_id, $run_timestamp) ); $this->assertIsObject($run = json_decode($run_json)); foreach (array_keys($run_data) as $key) { - $this->assertObjectHasProperty($key, $run); + $this->assertObjectHasProperty($key, $run, "The key $key was not found in the harvest_run."); } // Retrieve all runs as JSON. @@ -99,10 +99,10 @@ public function testRetrieveJson() { $runs_json = $harvest_run_repo->retrieveAllRunsJson($plan_id) ); // Do some assertions. - $this->assertArrayHasKey($run_id, $runs_json); + $this->assertArrayHasKey($run_id, $runs_json, 'The harvest_run id of the test save was not found.'); $this->assertIsString($runs_json[$run_id]); $this->assertIsObject($run_decoded = json_decode($runs_json[$run_id])); - $this->assertEquals($run_id, $run_decoded->identifier); + $this->assertEquals($run_timestamp, $run_decoded->identifier, 'The harvest_run timestamps do not match.'); } /** @@ -112,11 +112,12 @@ public function testGetExtractedUuids() { /** @var \Drupal\harvest\Entity\HarvestRunRepository $harvest_run_repo */ $harvest_run_repo = $this->container->get('dkan.harvest.storage.harvest_run_repository'); $plan_id = 'plan'; - $run_id = '1711038293'; + $non_existent_run_id = '123456777'; + $timestamp = '1711038293'; // There are no entities at this point, so trying to get UUIDs should // result in an empty array. - $this->assertIsArray($uuids = $harvest_run_repo->getExtractedUuids($plan_id, $run_id)); + $this->assertIsArray($uuids = $harvest_run_repo->getExtractedUuids($plan_id, $non_existent_run_id)); $this->assertEquals([], $uuids); $uuids = [ @@ -131,9 +132,9 @@ public function testGetExtractedUuids() { 'extract' => 'AWESOME', 'extracted_items_ids' => $uuids, ], - 'identifier' => $run_id, + 'identifier' => $timestamp, ]; - $harvest_run_repo->storeRun($run_data, $plan_id, $run_id); + $run_id = $harvest_run_repo->storeRun($run_data, $plan_id, $timestamp); // Get the extracted UUIDs. $this->assertIsArray($extracted_uuids = $harvest_run_repo->getExtractedUuids($plan_id, $run_id)); diff --git a/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php b/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php index ef64c31371..d2cddc790d 100644 --- a/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php +++ b/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php @@ -101,9 +101,7 @@ public function testGoodHarvestRun() { $this->registerHarvestPlan($harvest_service, $plan_id); $run_result = $harvest_service->runHarvest($plan_id); $this->assertEquals('SUCCESS', $run_result['status']['extract'] ?? 'not_a_successful_run'); - // Get the actual run ID because it happens to be a timestamp for the last - // run, and it could be the next second by the time we assert against it. - $run_id = $harvest_service->getLastHarvestRunId($plan_id); + $run_id = $harvest_service->runRepository->getLastHarvestRunId($plan_id); $response = $list_builder->render(); diff --git a/modules/harvest/tests/src/Kernel/HarvestServiceTest.php b/modules/harvest/tests/src/Kernel/HarvestServiceTest.php index b1333db3c6..0d16ceb415 100644 --- a/modules/harvest/tests/src/Kernel/HarvestServiceTest.php +++ b/modules/harvest/tests/src/Kernel/HarvestServiceTest.php @@ -196,7 +196,8 @@ public function testGetHarvestRunResult() { // getHarvestRunResult should return an empty array. /** @var \Drupal\harvest\HarvestService $harvest_service */ $harvest_service = $this->container->get('dkan.harvest.service'); - $this->assertEquals([], $harvest_service->getHarvestRunResult('any_plan', 'any_id')); + $any_harvest_run_id = 111; + $this->assertEquals([], $harvest_service->getHarvestRunResult('any_plan', $any_harvest_run_id)); // Register a harvest and run it. $plan_identifier = 'test_plan'; @@ -214,10 +215,10 @@ public function testGetHarvestRunResult() { $this->assertEquals($plan_identifier, $harvest_service->registerHarvest($plan)); $run_result = $harvest_service->runHarvest($plan_identifier); - $this->assertNotEmpty($run_id = $run_result['identifier']); + $this->assertNotEmpty($timestamp = $run_result['identifier']); // Compare the reloaded results to the ones from the original run. - $this->assertEquals($run_result, $harvest_service->getHarvestRunResult($plan_identifier, $run_id)); + $this->assertEquals($run_result, $harvest_service->getHarvestRunResult($plan_identifier, $timestamp)); } } From 918a569143c12367d17b9f63b0540d457ee33a14 Mon Sep 17 00:00:00 2001 From: Steve Wirt Date: Thu, 12 Dec 2024 13:22:47 -0500 Subject: [PATCH 2/4] Fix harvest_run entity label. --- modules/harvest/src/Entity/HarvestRun.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/harvest/src/Entity/HarvestRun.php b/modules/harvest/src/Entity/HarvestRun.php index 42c5f3f96d..f2c097234e 100644 --- a/modules/harvest/src/Entity/HarvestRun.php +++ b/modules/harvest/src/Entity/HarvestRun.php @@ -46,7 +46,7 @@ * admin_permission = "administer harvest_run", * entity_keys = { * "id" = "id", - * "label" = "ID", + * "label" = "id", * }, * links = { * "canonical" = "/harvest-run/{harvest_run}", From 1a382f67d98bafe9db52258f2aba9b5bd438b91d Mon Sep 17 00:00:00 2001 From: Steve Wirt Date: Thu, 12 Dec 2024 13:09:13 -0500 Subject: [PATCH 3/4] Move update support functions to HarvestUtility. --- modules/harvest/harvest.install | 68 ++------------------ modules/harvest/harvest.services.yml | 1 + modules/harvest/src/HarvestUtility.php | 86 +++++++++++++++++++++++++- 3 files changed, 89 insertions(+), 66 deletions(-) diff --git a/modules/harvest/harvest.install b/modules/harvest/harvest.install index 42defc8699..b09b7d2cb6 100644 --- a/modules/harvest/harvest.install +++ b/modules/harvest/harvest.install @@ -1,8 +1,5 @@ select($table_name_temp, 'hrt') - ->fields('hrt', ['id']); - $query->orderBy('id', 'ASC'); - $result = $query->execute()->fetchCol(0); - // Can't do orderBy as the sort end up natural, not numeric. - asort($result, SORT_NUMERIC); - - return $result ?? []; -} - -/** - * Reads a single harvest row from the temp table. - * - * @param string $table_name_temp - * Name of the table to read from. - * - * @param string $time_id - * The id to read from, which was also the timestamp. - * - * @return array - * Elements from the row ['id', 'harvest_plan_id', 'data', 'extract_status']. - */ -function harvest_read_harvest_run(string $table_name_temp, string $time_id): array { - $connection = Database::getConnection(); - $query = $connection->select($table_name_temp, 'hrt') - ->fields('hrt', ['id', 'harvest_plan_id', 'data', 'extract_status']) - ->condition('id', $time_id, '=') - ->orderBy('id', 'ASC'); - $result = $query->execute()->fetchAll(PDO::FETCH_ASSOC); - return reset($result); -} - -function harvest_write_harvest_run(string $id, string $harvest_plan_id, string $data, string $extract_status) { - /** @var \Drupal\Core\Database\Connection $connection */ - $connection = \Drupal::service('database'); - $result = $connection->insert('harvest_runs') - ->fields([ - 'timestamp' => (int) $id, - 'harvest_plan_id' => $harvest_plan_id, - 'data' => $data, - 'extract_status' => $extract_status, - ]) - ->execute(); -} - /** * Uninstall obsolete submodule harvest_dashboard. */ @@ -202,10 +142,12 @@ function harvest_update_8010(&$sandbox) { $table_name_temp = "{$table_name}_temp"; $messages = ''; $schema = \Drupal::database()->schema(); + /** @var \Drupal\harvest\HarvestUtility $harvest_utility */ + $harvest_utility = \Drupal::service('dkan.harvest.utility'); if (!isset($sandbox['total'])) { // Sandbox has not been initiated, so initiate it. - $sandbox['items_to_process'] = harvest_get_temp_run_ids($table_name_temp); + $sandbox['items_to_process'] = $harvest_utility->getTempRunIdsForUpdate($table_name_temp); $sandbox['total'] = count($sandbox['items_to_process']); $sandbox['current'] = 0; } @@ -214,9 +156,9 @@ function harvest_update_8010(&$sandbox) { // Loop through all the entries in temp table and save them new. foreach ($harvest_runs_batch as $key => $time_id) { // Load the old row. - $row = harvest_read_harvest_run($table_name_temp, $time_id); + $row = $harvest_utility->readTempHarvestRunForUpdate($table_name_temp, $time_id); // Write the new harvest run. - harvest_write_harvest_run($row['id'], $row['harvest_plan_id'], $row['data'], $row['extract_status']); + $harvest_utility->writeHarvestRunFromUpdate($row['id'], $row['harvest_plan_id'], $row['data'], $row['extract_status']); // The item has been processed, remove it from the array. unset($sandbox['items_to_process'][$key]); } diff --git a/modules/harvest/harvest.services.yml b/modules/harvest/harvest.services.yml index 8d9cb604eb..10dd0569ce 100644 --- a/modules/harvest/harvest.services.yml +++ b/modules/harvest/harvest.services.yml @@ -19,6 +19,7 @@ services: - '@dkan.harvest.storage.harvest_run_repository' - '@database' - '@dkan.harvest.logger_channel' + - '@uuid' dkan.harvest.harvest_plan_repository: class: Drupal\harvest\Entity\HarvestPlanRepository arguments: diff --git a/modules/harvest/src/HarvestUtility.php b/modules/harvest/src/HarvestUtility.php index 5a8e03b5cb..6c2aad0002 100644 --- a/modules/harvest/src/HarvestUtility.php +++ b/modules/harvest/src/HarvestUtility.php @@ -2,6 +2,7 @@ namespace Drupal\harvest; +use Drupal\Component\Uuid\UuidInterface; use Drupal\Core\Database\Connection; use Drupal\harvest\Entity\HarvestRunRepository; use Drupal\harvest\Storage\DatabaseTableFactory; @@ -11,8 +12,8 @@ /** * DKAN Harvest utility service for maintenance tasks. * - * These methods generally exist to support a thin Drush layer. These are - * methods that we don't need in the HarvestService object. + * These methods generally exist to support a thin Drush layer or hook_update_n. + * These are methods that we don't need in the HarvestService object. */ class HarvestUtility { @@ -58,6 +59,13 @@ class HarvestUtility { */ private LoggerInterface $logger; + /** + * Uuid service. + * + * @var \Drupal\Component\Uuid\UuidInterface + */ + private UuidInterface $uuidService; + /** * Constructor. */ @@ -67,7 +75,8 @@ public function __construct( HarvestHashesDatabaseTableFactory $hashesFactory, HarvestRunRepository $runRepository, Connection $connection, - LoggerInterface $loggerChannel + LoggerInterface $loggerChannel, + UuidInterface $uuid_service ) { $this->harvestService = $harvestService; $this->storeFactory = $storeFactory; @@ -75,6 +84,7 @@ public function __construct( $this->runRepository = $runRepository; $this->connection = $connection; $this->logger = $loggerChannel; + $this->uuidService = $uuid_service; } /** @@ -243,4 +253,74 @@ public function harvestRunsUpdate() { } } + /** + * Get the ids from the temp harvest run table. + * + * Only needed for harvest_update_8010. + * + * @param mixed $table_name_temp + * The name of the temp table. + * + * @return array + * The ids of all the harvest runs in the table sorted oldest to newest. + */ + public function getTempRunIdsForUpdate($table_name_temp) : array { + $query = $this->connection->select($table_name_temp, 'hrt') + ->fields('hrt', ['id']) + ->orderBy('id', 'ASC'); + $result = $query->execute()->fetchCol(0); + // Can't rely on orderBy as the sort ends up natural, not numeric. + asort($result, SORT_NUMERIC); + + return $result ?? []; + } + + /** + * Reads a single harvest row from the harvest run temp table. + * + * Only needed for harvest_update_8010. + * + * @param string $table_name_temp + * Name of the table to read from. + * @param string $timestamp + * The id to read from, which was also the timestamp. + * + * @return array + * Elements from the row['id', 'harvest_plan_id', 'data', 'extract_status']. + */ + public function readTempHarvestRunForUpdate(string $table_name_temp, string $timestamp): array { + $query = $this->connection->select($table_name_temp, 'hrt') + ->fields('hrt', ['id', 'harvest_plan_id', 'data', 'extract_status']) + ->condition('id', $timestamp, '=') + ->orderBy('id', 'ASC'); + $result = $query->execute()->fetchAll(\PDO::FETCH_ASSOC); + return reset($result); + } + + /** + * Creates a new entry in harvest_run based on data from harvest run temp. + * + * Only needed for harvest_update_8010. + * + * @param string $timestamp + * The id from the old harvest run, which was a timestamp. + * @param string $harvest_plan_id + * The harvest plan id. + * @param string $data + * Data about the harvest. + * @param string $extract_status + * The status of the harvest. + */ + public function writeHarvestRunFromUpdate(string $timestamp, string $harvest_plan_id, string $data, string $extract_status): void { + $this->connection->insert('harvest_runs') + ->fields([ + 'timestamp' => (int) $timestamp, + 'harvest_plan_id' => $harvest_plan_id, + 'uuid' => $this->uuidService->generate(), + 'data' => $data, + 'extract_status' => $extract_status, + ]) + ->execute(); + } + } From e6d855fb483a0a3ff09d5d966c76d1410947ea78 Mon Sep 17 00:00:00 2001 From: Steve Wirt Date: Thu, 12 Dec 2024 17:52:27 -0500 Subject: [PATCH 4/4] Add proper injection of HarvestRunRepository to HarvestPlanListBuilder. --- .../src/Entity/HarvestRunRepository.php | 2 +- .../harvest/src/HarvestPlanListBuilder.php | 22 ++++++++++--------- modules/harvest/src/HarvestService.php | 2 +- .../src/Kernel/HarvestPlanListBuilderTest.php | 3 +-- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/modules/harvest/src/Entity/HarvestRunRepository.php b/modules/harvest/src/Entity/HarvestRunRepository.php index d5765c2776..165ea349f5 100644 --- a/modules/harvest/src/Entity/HarvestRunRepository.php +++ b/modules/harvest/src/Entity/HarvestRunRepository.php @@ -24,7 +24,7 @@ class HarvestRunRepository { * * @var \Drupal\Core\Entity\EntityStorageInterface */ - public EntityStorageInterface $runStorage; + protected EntityStorageInterface $runStorage; /** * Database connection service. diff --git a/modules/harvest/src/HarvestPlanListBuilder.php b/modules/harvest/src/HarvestPlanListBuilder.php index b3d6a8c44b..7096f7c2d0 100644 --- a/modules/harvest/src/HarvestPlanListBuilder.php +++ b/modules/harvest/src/HarvestPlanListBuilder.php @@ -4,6 +4,7 @@ use Drupal\Core\Entity\EntityInterface; use Drupal\Core\Entity\EntityListBuilder; +use Drupal\Core\Entity\EntityStorageInterface; use Drupal\Core\Entity\EntityTypeInterface; use Drupal\Core\Link; use Drupal\Core\Url; @@ -19,26 +20,26 @@ class HarvestPlanListBuilder extends EntityListBuilder { /** - * Harvest service. + * Harvest run repository service. * - * @var \Drupal\harvest\HarvestService + * @var \Drupal\harvest\Entity\HarvestRunRepository */ - protected HarvestService $harvestService; + protected HarvestRunRepository $harvestRunRepository; /** - * Harvest run repository service. + * Entity storage service for the harvest_run entity type. * - * @var \Drupal\harvest\Entity\HarvestRunRepository + * @var \Drupal\Core\Entity\EntityStorageInterface */ - protected HarvestRunRepository $harvestRunRepository; + protected EntityStorageInterface $harvestRunStorage; /** * {@inheritDoc} */ public static function createInstance(ContainerInterface $container, EntityTypeInterface $entity_type) { $builder = parent::createInstance($container, $entity_type); - $builder->harvestService = $container->get('dkan.harvest.service'); $builder->harvestRunRepository = $container->get('dkan.harvest.storage.harvest_run_repository'); + $builder->harvestRunStorage = $container->get('entity_type.manager')->getStorage('harvest_run'); return $builder; } @@ -74,9 +75,10 @@ public function buildRow(EntityInterface $entity) { $harvest_plan_id = $entity->get('id')->getString(); $run_entity = NULL; - if ($run_id = $this->harvestService->runRepository->getLastHarvestRunId($harvest_plan_id)) { + if ($run_id = $this->harvestRunRepository->getLastHarvestRunId($harvest_plan_id)) { // There is a run identifier, so we should get that info. - $run_entity = $this->harvestRunRepository->runStorage->load($run_id); + /** @var \Drupal\harvest\HarvestRunInterface $run_entity */ + $run_entity = $this->harvestRunStorage->load($run_id); } // Default values for a row if there's no info. @@ -100,7 +102,7 @@ public function buildRow(EntityInterface $entity) { 'data' => $extract_status, 'class' => strtolower($extract_status), ]; - $row['last_run'] = date('m/d/y H:m:s T', $run_id); + $row['last_run'] = date('m/d/y H:m:s T', $run_entity->get('timestamp')->value); $row['dataset_count'] = $interpreter->countProcessed(); } // Don't call parent::buildRow() because we don't want operations (yet). diff --git a/modules/harvest/src/HarvestService.php b/modules/harvest/src/HarvestService.php index 04d93312d4..7fc1f98827 100644 --- a/modules/harvest/src/HarvestService.php +++ b/modules/harvest/src/HarvestService.php @@ -217,7 +217,7 @@ public function runHarvest($plan_id) { $result['status']['orphan_ids'] = $this->getOrphanIdsFromResult($plan_id, $result['status']['extracted_items_ids']); $this->processOrphanIds($result['status']['orphan_ids']); - + // For legacy reasons, the identifier is the timestamp. $result['identifier'] = $timestamp; $this->runRepository->storeRun($result, $plan_id, $timestamp); diff --git a/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php b/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php index d2cddc790d..e3a96e30be 100644 --- a/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php +++ b/modules/harvest/tests/src/Kernel/HarvestPlanListBuilderTest.php @@ -101,7 +101,6 @@ public function testGoodHarvestRun() { $this->registerHarvestPlan($harvest_service, $plan_id); $run_result = $harvest_service->runHarvest($plan_id); $this->assertEquals('SUCCESS', $run_result['status']['extract'] ?? 'not_a_successful_run'); - $run_id = $harvest_service->runRepository->getLastHarvestRunId($plan_id); $response = $list_builder->render(); @@ -109,7 +108,7 @@ public function testGoodHarvestRun() { $strings = array_merge(self::HARVEST_HEADERS, [ 'harvest_link', 'SUCCESS', - json_encode(date('m/d/y H:m:s T', $run_id)), + json_encode(date('m/d/y H:m:s T', $run_result['identifier'])), '2', ]); foreach ($strings as $string) {