From 6a9c4299d9d86b5a153e13591e89dcda73bad944 Mon Sep 17 00:00:00 2001 From: Nicola Girardi <122285152+nicolagi@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:14:20 +0100 Subject: [PATCH] FDN-2460: Patch partman5.run_maintenance to avoid table scans (#29) The real diff this introduces is https://github.com/pgpartman/pg_partman/pull/662/files. Avoiding table scans helps making `run_maintenance` fast, which is important because it takes an exclusive lock which prevents other uses of the table that it's processing. (The exclusive lock is acquired when dropping partitions to enforce the retention policy.) More background in [FDN-2442]. --- scripts/20240606-132415.sql | 450 ++++++++++++++++++++++++++++++++++++ 1 file changed, 450 insertions(+) create mode 100644 scripts/20240606-132415.sql diff --git a/scripts/20240606-132415.sql b/scripts/20240606-132415.sql new file mode 100644 index 0000000..137f3e4 --- /dev/null +++ b/scripts/20240606-132415.sql @@ -0,0 +1,450 @@ +CREATE OR REPLACE FUNCTION partman5.run_maintenance( + p_parent_table text DEFAULT NULL + -- If these defaults change reflect them in `run_maintenance_proc`! + , p_analyze boolean DEFAULT false + , p_jobmon boolean DEFAULT true +) + RETURNS void + LANGUAGE plpgsql + AS $$ +DECLARE + +ex_context text; +ex_detail text; +ex_hint text; +ex_message text; +v_adv_lock boolean; +v_analyze boolean := FALSE; +v_check_subpart int; +v_control_type text; +v_create_count int := 0; +v_current_partition_id bigint; +v_current_partition_timestamp timestamptz; +v_default_tablename text; +v_drop_count int := 0; +v_exact_control_type text; +v_is_default text; +v_job_id bigint; +v_jobmon_schema text; +v_last_partition text; +v_last_partition_created boolean; +v_last_partition_id bigint; +v_last_partition_timestamp timestamptz; +v_max_id bigint; +v_max_id_default bigint; +v_max_time_default timestamptz; +v_child_timestamp timestamptz; +v_new_search_path text; +v_next_partition_id bigint; +v_next_partition_timestamp timestamptz; +v_old_search_path text; +v_parent_exists text; +v_parent_oid oid; +v_parent_schema text; +v_parent_tablename text; +v_partition_expression text; +v_premade_count int; +v_row record; +v_row_max_id record; +v_row_max_time record; +v_sql text; +v_step_id bigint; +v_step_overflow_id bigint; +v_sub_id_max bigint; +v_sub_id_max_suffix bigint; +v_sub_id_min bigint; +v_sub_parent text; +v_sub_timestamp_max timestamptz; +v_sub_timestamp_max_suffix timestamptz; +v_sub_timestamp_min timestamptz; +v_tables_list_sql text; + +BEGIN +/* + * Function to manage pre-creation of the next partitions in a set. + * Also manages dropping old partitions if the retention option is set. + * If p_parent_table is passed, will only run run_maintenance() on that one table (no matter what the configuration table may have set for it) + * Otherwise, will run on all tables in the config table with p_automatic_maintenance() set to true. + * For large partition sets, running analyze can cause maintenance to take longer than expected so is not done by default. Can set p_analyze to true to force analyze. Be aware that constraint exclusion may not work properly until an analyze on the partition set is run. + */ + +v_adv_lock := pg_try_advisory_xact_lock(hashtext('pg_partman run_maintenance')); +IF v_adv_lock = 'false' THEN + RAISE NOTICE 'Partman maintenance already running.'; + RETURN; +END IF; + +IF pg_is_in_recovery() THEN + RAISE DEBUG 'pg_partmain maintenance called on replica. Doing nothing.'; + RETURN; +END IF; + +SELECT current_setting('search_path') INTO v_old_search_path; +IF length(v_old_search_path) > 0 THEN + v_new_search_path := 'partman5,pg_temp,'||v_old_search_path; +ELSE + v_new_search_path := 'partman5,pg_temp'; +END IF; +IF p_jobmon THEN + SELECT nspname INTO v_jobmon_schema FROM pg_catalog.pg_namespace n, pg_catalog.pg_extension e WHERE e.extname = 'pg_jobmon'::name AND e.extnamespace = n.oid; + IF v_jobmon_schema IS NOT NULL THEN + v_new_search_path := format('%s,%s',v_jobmon_schema, v_new_search_path); + END IF; +END IF; +EXECUTE format('SELECT set_config(%L, %L, %L)', 'search_path', v_new_search_path, 'false'); + +IF v_jobmon_schema IS NOT NULL THEN + v_job_id := add_job('PARTMAN RUN MAINTENANCE'); + v_step_id := add_step(v_job_id, 'Running maintenance loop'); +END IF; + +v_tables_list_sql := 'SELECT parent_table + , partition_type + , partition_interval + , control + , premake + , undo_in_progress + , sub_partition_set_full + , epoch + , infinite_time_partitions + , retention + , ignore_default_data + , datetime_string + , maintenance_order + FROM partman5.part_config + WHERE undo_in_progress = false'; + +IF p_parent_table IS NULL THEN + v_tables_list_sql := v_tables_list_sql || format(' AND automatic_maintenance = %L ', 'on'); +ELSE + v_tables_list_sql := v_tables_list_sql || format(' AND parent_table = %L ', p_parent_table); +END IF; + +v_tables_list_sql := v_tables_list_sql || format(' ORDER BY maintenance_order ASC NULLS LAST, parent_table ASC NULLS LAST '); + +RAISE DEBUG 'run_maint: v_tables_list_sql: %', v_tables_list_sql; + +FOR v_row IN EXECUTE v_tables_list_sql +LOOP + + CONTINUE WHEN v_row.undo_in_progress; + + -- When sub-partitioning, retention may drop tables that were already put into the query loop values. + -- Check if they still exist in part_config before continuing + v_parent_exists := NULL; + SELECT parent_table INTO v_parent_exists FROM partman5.part_config WHERE parent_table = v_row.parent_table; + IF v_parent_exists IS NULL THEN + RAISE DEBUG 'run_maint: Parent table possibly removed from part_config by retenion'; + END IF; + CONTINUE WHEN v_parent_exists IS NULL; + + -- Check for old quarterly and ISO weekly partitioning from prior to version 5.x. Error out to avoid breaking these partition sets + -- with new datetime_string formats + IF v_row.datetime_string IN ('YYYY"q"Q', 'IYYY"w"IW') THEN + RAISE EXCEPTION 'Quarterly and ISO weekly partitioning is no longer supported in pg_partman 5.0.0 and greater. Please see documentation for migrating away from these partitioning patterns. Partition set: %', v_row.parent_table; + END IF; + + -- Check for consistent data in part_config_sub table. Was unable to get this working properly as either a constraint or trigger. + -- Would either delay raising an error until the next write (which I cannot predict) or disallow future edits to update a sub-partition set's configuration. + -- This way at least provides a consistent way to check that I know will run. If anyone can get a working constraint/trigger, please help! + SELECT sub_parent INTO v_sub_parent FROM partman5.part_config_sub WHERE sub_parent = v_row.parent_table; + IF v_sub_parent IS NOT NULL THEN + SELECT count(*) INTO v_check_subpart FROM partman5.check_subpart_sameconfig(v_row.parent_table); + IF v_check_subpart > 1 THEN + RAISE EXCEPTION 'Inconsistent data in part_config_sub table. Sub-partition tables that are themselves sub-partitions cannot have differing configuration values among their siblings. + Run this query: "SELECT * FROM partman5.check_subpart_sameconfig(''%'');" This should only return a single row or nothing. + If multiple rows are returned, the results are differing configurations in the part_config_sub table for children of the given parent. + Determine the child tables of the given parent and look up their entries based on the "part_config_sub.sub_parent" column. + Update the differing values to be consistent for your desired values.', v_row.parent_table; + END IF; + END IF; + + SELECT n.nspname, c.relname, c.oid + INTO v_parent_schema, v_parent_tablename, v_parent_oid + FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid + WHERE n.nspname = split_part(v_row.parent_table, '.', 1)::name + AND c.relname = split_part(v_row.parent_table, '.', 2)::name; + + -- Always returns the default partition first if it exists + SELECT partition_tablename INTO v_default_tablename + FROM partman5.show_partitions(v_row.parent_table, p_include_default := true) LIMIT 1; + + SELECT pg_get_expr(relpartbound, v_parent_oid) INTO v_is_default + FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n on c.relnamespace = n.oid + WHERE n.nspname = v_parent_schema + AND c.relname = v_default_tablename; + + IF v_is_default != 'DEFAULT' THEN + -- Parent table will never have data, but allows code below to "just work" + v_default_tablename := v_parent_tablename; + END IF; + + SELECT general_type, exact_type + INTO v_control_type, v_exact_control_type + FROM partman5.check_control_type(v_parent_schema, v_parent_tablename, v_row.control); + + v_partition_expression := CASE + WHEN v_row.epoch = 'seconds' THEN format('to_timestamp(%I)', v_row.control) + WHEN v_row.epoch = 'milliseconds' THEN format('to_timestamp((%I/1000)::float)', v_row.control) + WHEN v_row.epoch = 'nanoseconds' THEN format('to_timestamp((%I/1000000000)::float)', v_row.control) + ELSE format('%I', v_row.control) + END; + RAISE DEBUG 'run_maint: v_partition_expression: %', v_partition_expression; + + SELECT partition_tablename INTO v_last_partition FROM partman5.show_partitions(v_row.parent_table, 'DESC') LIMIT 1; + RAISE DEBUG 'run_maint: parent_table: %, v_last_partition: %', v_row.parent_table, v_last_partition; + + IF v_control_type = 'time' OR (v_control_type = 'id' AND v_row.epoch <> 'none') THEN + + -- Run retention if needed + IF v_row.retention IS NOT NULL THEN + v_drop_count := v_drop_count + partman5.drop_partition_time(v_row.parent_table); + END IF; + + IF v_row.sub_partition_set_full THEN + UPDATE partman5.part_config SET maintenance_last_run = clock_timestamp() WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + + SELECT child_start_time INTO v_last_partition_timestamp + FROM partman5.show_partition_info(v_parent_schema||'.'||v_last_partition, v_row.partition_interval, v_row.parent_table); + + -- Must be reset to null otherwise if the next partition set in the loop is empty, the previous partition set's value could be used + v_current_partition_timestamp := NULL; + + -- Loop through child tables starting from highest to get current max value in partition set + -- Avoids doing a scan on entire partition set and/or getting any values accidentally in default. + FOR v_row_max_time IN + SELECT partition_schemaname, partition_tablename FROM partman5.show_partitions(v_row.parent_table, 'DESC', false) + LOOP + EXECUTE format('SELECT %s::text FROM %I.%I LIMIT 1' + , v_partition_expression + , v_row_max_time.partition_schemaname + , v_row_max_time.partition_tablename + ) INTO v_child_timestamp; + + IF v_row.infinite_time_partitions AND v_child_timestamp < CURRENT_TIMESTAMP THEN + -- No new data has been inserted relative to "now", but keep making child tables anyway + v_current_partition_timestamp = CURRENT_TIMESTAMP; + -- Nothing else to do in this case so just end early + EXIT; + END IF; + IF v_child_timestamp IS NOT NULL THEN + SELECT suffix_timestamp INTO v_current_partition_timestamp FROM partman5.show_partition_name(v_row.parent_table, v_child_timestamp::text); + EXIT; + END IF; + END LOOP; + IF v_row.infinite_time_partitions AND v_child_timestamp IS NULL THEN + -- If partition set is completely empty, still keep making child tables anyway + -- Has to be separate check outside above loop since "future" tables are likely going to be empty and make max value in that loop NULL + v_current_partition_timestamp = CURRENT_TIMESTAMP; + END IF; + + + -- If not ignoring the default table, check for max values there. If they are there and greater than all child values, use that instead + -- Note the default is NOT to care about data in the default, so maintenance will fail if new child table boundaries overlap with + -- data that exists in the default. This is intentional so user removes data from default to avoid larger problems. + IF v_row.ignore_default_data THEN + v_max_time_default := NULL; + ELSE + EXECUTE format('SELECT max(%s) FROM ONLY %I.%I', v_partition_expression, v_parent_schema, v_default_tablename) INTO v_max_time_default; + END IF; + RAISE DEBUG 'run_maint: v_current_partition_timestamp: %, v_max_time_default: %', v_current_partition_timestamp, v_max_time_default; + IF v_current_partition_timestamp IS NULL AND v_max_time_default IS NULL THEN + -- Partition set is completely empty and infinite time partitions not set + -- Nothing to do + UPDATE partman5.part_config SET maintenance_last_run = clock_timestamp() WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + RAISE DEBUG 'run_maint: v_child_timestamp: %, v_current_partition_timestamp: %, v_max_time_default: %', v_child_timestamp, v_current_partition_timestamp, v_max_time_default; + IF v_current_partition_timestamp IS NULL OR (v_max_time_default > v_current_partition_timestamp) THEN + SELECT suffix_timestamp INTO v_current_partition_timestamp FROM partman5.show_partition_name(v_row.parent_table, v_max_time_default::text); + END IF; + + -- If this is a subpartition, determine if the last child table has been made. If so, mark it as full so future maintenance runs can skip it + SELECT sub_min::timestamptz, sub_max::timestamptz INTO v_sub_timestamp_min, v_sub_timestamp_max FROM partman5.check_subpartition_limits(v_row.parent_table, 'time'); + IF v_sub_timestamp_max IS NOT NULL THEN + SELECT suffix_timestamp INTO v_sub_timestamp_max_suffix FROM partman5.show_partition_name(v_row.parent_table, v_sub_timestamp_max::text); + IF v_sub_timestamp_max_suffix = v_last_partition_timestamp THEN + -- Final partition for this set is created. Set full and skip it + UPDATE partman5.part_config + SET sub_partition_set_full = true, maintenance_last_run = clock_timestamp() + WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + END IF; + + -- Check and see how many premade partitions there are. + v_premade_count = round(EXTRACT('epoch' FROM age(v_last_partition_timestamp, v_current_partition_timestamp)) / EXTRACT('epoch' FROM v_row.partition_interval::interval)); + v_next_partition_timestamp := v_last_partition_timestamp; + RAISE DEBUG 'run_maint before loop: last_partition_timestamp: %, current_partition_timestamp: %, v_premade_count: %, v_sub_timestamp_min: %, v_sub_timestamp_max: %' + , v_last_partition_timestamp + , v_current_partition_timestamp + , v_premade_count + , v_sub_timestamp_min + , v_sub_timestamp_max; + -- Loop premaking until config setting is met. Allows it to catch up if it fell behind or if premake changed + WHILE (v_premade_count < v_row.premake) LOOP + RAISE DEBUG 'run_maint: parent_table: %, v_premade_count: %, v_next_partition_timestamp: %', v_row.parent_table, v_premade_count, v_next_partition_timestamp; + IF v_next_partition_timestamp < v_sub_timestamp_min OR v_next_partition_timestamp > v_sub_timestamp_max THEN + -- With subpartitioning, no need to run if the timestamp is not in the parent table's range + EXIT; + END IF; + BEGIN + v_next_partition_timestamp := v_next_partition_timestamp + v_row.partition_interval::interval; + EXCEPTION WHEN datetime_field_overflow THEN + v_premade_count := v_row.premake; -- do this so it can exit the premake check loop and continue in the outer for loop + IF v_jobmon_schema IS NOT NULL THEN + v_step_overflow_id := add_step(v_job_id, 'Attempted partition time interval is outside PostgreSQL''s supported time range.'); + PERFORM update_step(v_step_overflow_id, 'CRITICAL', format('Child partition creation skipped for parent table: %s', v_partition_time)); + END IF; + RAISE WARNING 'Attempted partition time interval is outside PostgreSQL''s supported time range. Child partition creation skipped for parent table %', v_row.parent_table; + CONTINUE; + END; + + v_last_partition_created := partman5.create_partition_time(v_row.parent_table + , ARRAY[v_next_partition_timestamp]); + IF v_last_partition_created THEN + v_analyze := true; + v_create_count := v_create_count + 1; + END IF; + + v_premade_count = round(EXTRACT('epoch' FROM age(v_next_partition_timestamp, v_current_partition_timestamp)) / EXTRACT('epoch' FROM v_row.partition_interval::interval)); + END LOOP; + + ELSIF v_control_type = 'id' THEN + + -- Run retention if needed + IF v_row.retention IS NOT NULL THEN + v_drop_count := v_drop_count + partman5.drop_partition_id(v_row.parent_table); + END IF; + + IF v_row.sub_partition_set_full THEN + UPDATE partman5.part_config SET maintenance_last_run = clock_timestamp() WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + + -- Must be reset to null otherwise if the next partition set in the loop is empty, the previous partition set's value could be used + v_current_partition_id := NULL; + + FOR v_row_max_id IN + SELECT partition_schemaname, partition_tablename FROM partman5.show_partitions(v_row.parent_table, 'DESC', false) + LOOP + -- Loop through child tables starting from highest to get current max value in partition set + -- Avoids doing a scan on entire partition set and/or getting any values accidentally in default. + EXECUTE format('SELECT trunc(max(%I))::bigint FROM %I.%I' + , v_row.control + , v_row_max_id.partition_schemaname + , v_row_max_id.partition_tablename) INTO v_max_id; + IF v_max_id IS NOT NULL THEN + SELECT suffix_id INTO v_current_partition_id FROM partman5.show_partition_name(v_row.parent_table, v_max_id::text); + EXIT; + END IF; + END LOOP; + -- If not ignoring the default table, check for max values there. If they are there and greater than all child values, use that instead + -- Note the default is NOT to care about data in the default, so maintenance will fail if new child table boundaries overlap with + -- data that exists in the default. This is intentional so user removes data from default to avoid larger problems. + IF v_row.ignore_default_data THEN + v_max_id_default := NULL; + ELSE + EXECUTE format('SELECT trunc(max(%I))::bigint FROM ONLY %I.%I', v_row.control, v_parent_schema, v_default_tablename) INTO v_max_id_default; + END IF; + RAISE DEBUG 'run_maint: v_max_id: %, v_current_partition_id: %, v_max_id_default: %', v_max_id, v_current_partition_id, v_max_id_default; + IF v_current_partition_id IS NULL AND v_max_id_default IS NULL THEN + -- Partition set is completely empty. Nothing to do + UPDATE partman5.part_config SET maintenance_last_run = clock_timestamp() WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + IF v_current_partition_id IS NULL OR (v_max_id_default > v_current_partition_id) THEN + SELECT suffix_id INTO v_current_partition_id FROM partman5.show_partition_name(v_row.parent_table, v_max_id_default::text); + END IF; + + SELECT child_start_id INTO v_last_partition_id + FROM partman5.show_partition_info(v_parent_schema||'.'||v_last_partition, v_row.partition_interval, v_row.parent_table); + -- Determine if this table is a child of a subpartition parent. If so, get limits to see if run_maintenance even needs to run for it. + SELECT sub_min::bigint, sub_max::bigint INTO v_sub_id_min, v_sub_id_max FROM partman5.check_subpartition_limits(v_row.parent_table, 'id'); + + IF v_sub_id_max IS NOT NULL THEN + SELECT suffix_id INTO v_sub_id_max_suffix FROM partman5.show_partition_name(v_row.parent_table, v_sub_id_max::text); + IF v_sub_id_max_suffix = v_last_partition_id THEN + -- Final partition for this set is created. Set full and skip it + UPDATE partman5.part_config + SET sub_partition_set_full = true, maintenance_last_run = clock_timestamp() + WHERE parent_table = v_row.parent_table; + CONTINUE; + END IF; + END IF; + + v_next_partition_id := v_last_partition_id; + v_premade_count := ((v_last_partition_id - v_current_partition_id) / v_row.partition_interval::bigint); + -- Loop premaking until config setting is met. Allows it to catch up if it fell behind or if premake changed. + RAISE DEBUG 'run_maint: before child creation loop: parent_table: %, v_last_partition_id: %, v_premade_count: %, v_next_partition_id: %', v_row.parent_table, v_last_partition_id, v_premade_count, v_next_partition_id; + WHILE (v_premade_count < v_row.premake) LOOP + RAISE DEBUG 'run_maint: parent_table: %, v_premade_count: %, v_next_partition_id: %', v_row.parent_table, v_premade_count, v_next_partition_id; + IF v_next_partition_id < v_sub_id_min OR v_next_partition_id > v_sub_id_max THEN + -- With subpartitioning, no need to run if the id is not in the parent table's range + EXIT; + END IF; + v_next_partition_id := v_next_partition_id + v_row.partition_interval::bigint; + v_last_partition_created := partman5.create_partition_id(v_row.parent_table, ARRAY[v_next_partition_id]); + IF v_last_partition_created THEN + v_analyze := true; + v_create_count := v_create_count + 1; + END IF; + v_premade_count := ((v_next_partition_id - v_current_partition_id) / v_row.partition_interval::bigint); + END LOOP; + + END IF; -- end main IF check for time or id + + IF v_analyze AND p_analyze THEN + IF v_jobmon_schema IS NOT NULL THEN + v_step_id := add_step(v_job_id, format('Analyzing partition set: %s', v_row.parent_table)); + END IF; + + EXECUTE format('ANALYZE %I.%I',v_parent_schema, v_parent_tablename); + + IF v_jobmon_schema IS NOT NULL THEN + PERFORM update_step(v_step_id, 'OK', 'Done'); + END IF; + END IF; + + UPDATE partman5.part_config SET maintenance_last_run = clock_timestamp() WHERE parent_table = v_row.parent_table; + +END LOOP; -- end of main loop through part_config + +IF v_jobmon_schema IS NOT NULL THEN + v_step_id := add_step(v_job_id, format('Finished maintenance')); + PERFORM update_step(v_step_id, 'OK', format('Partition maintenance finished. %s partitions made. %s partitions dropped.', v_create_count, v_drop_count)); + IF v_step_overflow_id IS NOT NULL THEN + PERFORM fail_job(v_job_id); + ELSE + PERFORM close_job(v_job_id); + END IF; +END IF; + +EXECUTE format('SELECT set_config(%L, %L, %L)', 'search_path', v_old_search_path, 'false'); + +EXCEPTION + WHEN OTHERS THEN + GET STACKED DIAGNOSTICS ex_message = MESSAGE_TEXT, + ex_context = PG_EXCEPTION_CONTEXT, + ex_detail = PG_EXCEPTION_DETAIL, + ex_hint = PG_EXCEPTION_HINT; + IF v_jobmon_schema IS NOT NULL THEN + IF v_job_id IS NULL THEN + EXECUTE format('SELECT %I.add_job(''PARTMAN RUN MAINTENANCE'')', v_jobmon_schema) INTO v_job_id; + EXECUTE format('SELECT %I.add_step(%s, ''EXCEPTION before job logging started'')', v_jobmon_schema, v_job_id, p_parent_table) INTO v_step_id; + ELSIF v_step_id IS NULL THEN + EXECUTE format('SELECT %I.add_step(%s, ''EXCEPTION before first step logged'')', v_jobmon_schema, v_job_id) INTO v_step_id; + END IF; + EXECUTE format('SELECT %I.update_step(%s, ''CRITICAL'', %L)', v_jobmon_schema, v_step_id, 'ERROR: '||coalesce(SQLERRM,'unknown')); + EXECUTE format('SELECT %I.fail_job(%s)', v_jobmon_schema, v_job_id); + END IF; + RAISE EXCEPTION '% +CONTEXT: % +DETAIL: % +HINT: %', ex_message, ex_context, ex_detail, ex_hint; +END +$$;