Skip to content

Commit

Permalink
survey: add report of "largest" paths
Browse files Browse the repository at this point in the history
Since we are already walking our reachable objects using the path-walk API,
let's now collect lists of the paths that contribute most to different
metrics. Specifically, we care about

 * Number of versions.
 * Total size on disk.
 * Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the
repository are causing the most growth, especially on-disk size. Different
packing strategies might help compress data more efficiently, but the total
inflated size is a representation of the raw size of all snapshots of those
paths. Even when stored efficiently on disk, that size represents how much
information must be processed to complete a command such as 'git blame'.

Since the on-disk size is likely to be fragile, stop testing the exact
output of 'git survey' and check that the correct set of headers is
output.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
  • Loading branch information
derrickstolee committed Sep 19, 2024
1 parent 5dc88b4 commit 80beb90
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 8 deletions.
77 changes: 70 additions & 7 deletions builtin/survey.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ struct survey_report_object_size_summary {
/*
 * Comparison callback over two object size summaries. A function of this
 * type is what init_top_sizes() accepts as its 'cmp' argument; cmp_by_nr,
 * cmp_by_disk_size and cmp_by_inflated_size are the callbacks passed there.
 *
 * NOTE(review): the meaning of negative/zero/positive results is defined by
 * the comparator bodies and their caller, which are not fully visible here --
 * confirm before relying on a particular ordering.
 */
typedef int (*survey_top_size_cmp)(struct survey_report_object_size_summary *s1,
struct survey_report_object_size_summary *s2);

MAYBE_UNUSED
static int cmp_by_nr(struct survey_report_object_size_summary *s1,
struct survey_report_object_size_summary *s2)
{
Expand All @@ -87,7 +86,6 @@ static int cmp_by_nr(struct survey_report_object_size_summary *s1,
return 0;
}

MAYBE_UNUSED
static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
struct survey_report_object_size_summary *s2)
{
Expand All @@ -98,7 +96,6 @@ static int cmp_by_disk_size(struct survey_report_object_size_summary *s1,
return 0;
}

MAYBE_UNUSED
static int cmp_by_inflated_size(struct survey_report_object_size_summary *s1,
struct survey_report_object_size_summary *s2)
{
Expand All @@ -122,7 +119,6 @@ struct survey_report_top_sizes {
size_t alloc;
};

MAYBE_UNUSED
static void init_top_sizes(struct survey_report_top_sizes *top,
size_t limit, const char *name,
survey_top_size_cmp cmp)
Expand All @@ -142,7 +138,6 @@ static void clear_top_sizes(struct survey_report_top_sizes *top)
free(top->data);
}

MAYBE_UNUSED
static void maybe_insert_into_top_size(struct survey_report_top_sizes *top,
struct survey_report_object_size_summary *summary)
{
Expand Down Expand Up @@ -178,6 +173,10 @@ struct survey_report {
struct survey_report_object_summary reachable_objects;

struct survey_report_object_size_summary *by_type;

struct survey_report_top_sizes *top_paths_by_count;
struct survey_report_top_sizes *top_paths_by_disk;
struct survey_report_top_sizes *top_paths_by_inflate;
};

#define REPORT_TYPE_COMMIT 0
Expand Down Expand Up @@ -420,6 +419,13 @@ static void survey_report_object_sizes(const char *title,
clear_table(&table);
}

/*
 * Emit one "top paths" list through survey_report_object_sizes().
 *
 * The list's own name is used as the title, the translated string "Path" is
 * passed as the second argument, and the list's data array and entry count
 * are forwarded unchanged.
 */
static void survey_report_plaintext_sorted_size(
	struct survey_report_top_sizes *top)
{
	const char *title = top->name;
	const char *path_heading = _("Path");

	survey_report_object_sizes(title, path_heading, top->data, top->nr);
}

static void survey_report_plaintext(struct survey_context *ctx)
{
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
Expand All @@ -430,6 +436,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
_("Object Type"),
ctx->report.by_type,
REPORT_TYPE_COUNT);

survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);

survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);

survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
survey_report_plaintext_sorted_size(
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
}

/*
Expand Down Expand Up @@ -670,7 +691,8 @@ static void increment_totals(struct survey_context *ctx,

static void increment_object_totals(struct survey_context *ctx,
struct oid_array *oids,
enum object_type type)
enum object_type type,
const char *path)
{
struct survey_report_object_size_summary *total;
struct survey_report_object_size_summary summary = { 0 };
Expand Down Expand Up @@ -702,6 +724,27 @@ static void increment_object_totals(struct survey_context *ctx,
total->disk_size += summary.disk_size;
total->inflated_size += summary.inflated_size;
total->num_missing += summary.num_missing;

if (type == OBJ_TREE || type == OBJ_BLOB) {
int index = type == OBJ_TREE ?
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
struct survey_report_top_sizes *top;

/*
* Temporarily store (const char *) here, but it will
* be duped if inserted and will not be freed.
*/
summary.label = (char *)path;

top = ctx->report.top_paths_by_count;
maybe_insert_into_top_size(&top[index], &summary);

top = ctx->report.top_paths_by_disk;
maybe_insert_into_top_size(&top[index], &summary);

top = ctx->report.top_paths_by_inflate;
maybe_insert_into_top_size(&top[index], &summary);
}
}

static int survey_objects_path_walk_fn(const char *path,
Expand All @@ -713,7 +756,7 @@ static int survey_objects_path_walk_fn(const char *path,

increment_object_counts(&ctx->report.reachable_objects,
type, oids->nr);
increment_object_totals(ctx, oids, type);
increment_object_totals(ctx, oids, type, path);

ctx->progress_nr += oids->nr;
display_progress(ctx->progress, ctx->progress_nr);
Expand All @@ -723,11 +766,31 @@ static int survey_objects_path_walk_fn(const char *path,

static void initialize_report(struct survey_context *ctx)
{
const int top_limit = 100;

CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));

CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);

CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);

CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
}

static void survey_phase_objects(struct survey_context *ctx)
Expand Down
12 changes: 11 additions & 1 deletion t/t8100-git-survey.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,17 @@ test_expect_success 'git survey (default)' '
Tags | 4 | 510 | 528
EOF
test_cmp expect out
lines=$(wc -l <expect) &&
head -n $lines out >out-trimmed &&
test_cmp expect out-trimmed &&
for type in "DIRECTORIES" "FILES"
do
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
do
grep "TOP $type BY $metric" out || return 1
done || return 1
done
'

test_done

0 comments on commit 80beb90

Please sign in to comment.