From 80beb909b63c3214e28f6ae01ee529c7399ed369 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 22:35:40 -0400 Subject: [PATCH] survey: add report of "largest" paths Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output. 
Signed-off-by: Derrick Stolee --- builtin/survey.c | 77 +++++++++++++++++++++++++++++++++++++++---- t/t8100-git-survey.sh | 12 ++++++- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index ecdd8ebae0197f..3c653a1042f694 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -76,7 +76,6 @@ struct survey_report_object_size_summary { typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2); -MAYBE_UNUSED static int cmp_by_nr(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -87,7 +86,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1, return 0; } -MAYBE_UNUSED static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -98,7 +96,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1, return 0; } -MAYBE_UNUSED static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1, struct survey_report_object_size_summary *s2) { @@ -122,7 +119,6 @@ struct survey_report_top_sizes { size_t alloc; }; -MAYBE_UNUSED static void init_top_sizes(struct survey_report_top_sizes *top, size_t limit, const char *name, survey_top_size_cmp cmp) @@ -142,7 +138,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top) free(top->data); } -MAYBE_UNUSED static void maybe_insert_into_top_size(struct survey_report_top_sizes *top, struct survey_report_object_size_summary *summary) { @@ -178,6 +173,10 @@ struct survey_report { struct survey_report_object_summary reachable_objects; struct survey_report_object_size_summary *by_type; + + struct survey_report_top_sizes *top_paths_by_count; + struct survey_report_top_sizes *top_paths_by_disk; + struct survey_report_top_sizes *top_paths_by_inflate; }; #define REPORT_TYPE_COMMIT 0 @@ -420,6 +419,13 @@ static void survey_report_object_sizes(const char *title, 
clear_table(&table); } +static void survey_report_plaintext_sorted_size( + struct survey_report_top_sizes *top) +{ + survey_report_object_sizes(top->name, _("Path"), + top->data, top->nr); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); @@ -430,6 +436,21 @@ static void survey_report_plaintext(struct survey_context *ctx) _("Object Type"), ctx->report.by_type, REPORT_TYPE_COUNT); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); } /* @@ -670,7 +691,8 @@ static void increment_totals(struct survey_context *ctx, static void increment_object_totals(struct survey_context *ctx, struct oid_array *oids, - enum object_type type) + enum object_type type, + const char *path) { struct survey_report_object_size_summary *total; struct survey_report_object_size_summary summary = { 0 }; @@ -702,6 +724,27 @@ static void increment_object_totals(struct survey_context *ctx, total->disk_size += summary.disk_size; total->inflated_size += summary.inflated_size; total->num_missing += summary.num_missing; + + if (type == OBJ_TREE || type == OBJ_BLOB) { + int index = type == OBJ_TREE ? + REPORT_TYPE_TREE : REPORT_TYPE_BLOB; + struct survey_report_top_sizes *top; + + /* + * Temporarily store (const char *) here, but it will + * be duped if inserted and will not be freed. 
+ */ + summary.label = (char *)path; + + top = ctx->report.top_paths_by_count; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_disk; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_inflate; + maybe_insert_into_top_size(&top[index], &summary); + } } static int survey_objects_path_walk_fn(const char *path, @@ -713,7 +756,7 @@ static int survey_objects_path_walk_fn(const char *path, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); - increment_object_totals(ctx, oids, type); + increment_object_totals(ctx, oids, type, path); ctx->progress_nr += oids->nr; display_progress(ctx->progress, ctx->progress_nr); @@ -723,11 +766,31 @@ static int survey_objects_path_walk_fn(const char *path, static void initialize_report(struct survey_context *ctx) { + const int top_limit = 100; + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags")); + + CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + + CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + + CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], + top_limit, 
+ _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); } static void survey_phase_objects(struct survey_context *ctx) diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 0d3910d205acba..aecf7e434a4d8f 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -66,7 +66,17 @@ test_expect_success 'git survey (default)' ' Tags | 4 | 510 | 528 EOF - test_cmp expect out + lines=$(wc -l <expect) && + head -n $lines out >out-trimmed && + test_cmp expect out-trimmed && + + for type in "DIRECTORIES" "FILES" + do + for metric in "COUNT" "DISK SIZE" "INFLATED SIZE" + do + grep "TOP $type BY $metric" out || return 1 + done || return 1 + done ' test_done