#!/usr/bin/env python3
"""Calculate additional contribution statistics from a Git repository.

The script shells out to ``git`` to collect author/committer names and e-mail
addresses, commit dates, commit subjects, per-file line changes and remote
branches, then prints a plain-text summary.

Usage::

    git_repo_analyzer.py [repo_path] [-n TOP_CONTRIBUTORS]
"""
# Provenance: introduced as scripts/git/git_repo_analyzer.py by commit
# 1d4abdb74fa5b9606fd10d1caf771251d1017306 (Patryk Kubiak, Wed, 30 Oct 2024
# 23:36:45 +0100), "feat(scripts): Add script to calculate additional stats
# from Git repository".  The same commit added this alias to config/.gitconfig:
#
#     howmanybyall = shortlog -sn --group=author --group=committer  # both committer and author

import argparse
import collections
import os
import subprocess
import sys
from datetime import datetime
from typing import Any, Dict, List, Tuple

# Pretty-format placeholders fetched for every commit, in record order:
# author name/e-mail, committer name/e-mail, author date, subject.
_LOG_FIELDS = ("%an", "%ae", "%cn", "%ce", "%ad", "%s")
# ASCII "unit separator": git expands %x1f to this byte.  It does not occur in
# ordinary names, addresses or subjects, so it is a safe field delimiter.
_FIELD_SEP = "\x1f"
_LOG_FORMAT = "%x1f".join(_LOG_FIELDS)
# Layout of git's ``--date=default`` output, e.g. "Wed Oct 30 23:36:45 2024 +0100".
_GIT_DATE_FORMAT = "%a %b %d %H:%M:%S %Y %z"


class GitRepoAnalyzer:
    """Collect raw statistics from the history of one Git repository."""

    def __init__(self, repo_path: str):
        """Remember the absolute repository path and verify git recognises it.

        Raises:
            ValueError: if ``repo_path`` is not inside a Git repository.
            RuntimeError: if the ``git`` executable cannot be started.
        """
        self.repo_path = os.path.abspath(repo_path)
        if not self._is_valid_git_repo():
            raise ValueError(f"{self.repo_path} is not a valid Git repository")

    def _is_valid_git_repo(self) -> bool:
        """Return True when git itself accepts ``self.repo_path`` as a repository.

        Asking git (instead of looking for a ``.git`` directory) also accepts
        linked worktrees and submodules (where ``.git`` is a file), bare
        repositories and sub-directories of a work tree.
        """
        try:
            result = subprocess.run(
                ["git", "-C", self.repo_path, "rev-parse", "--git-dir"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as err:  # git is missing or not executable
            raise RuntimeError(f"Unable to run git: {err}") from err
        return result.returncode == 0

    def _run_git_command(self, args: List[str]) -> str:
        """Run ``git -C <repo> <args>`` and return its standard output as text.

        Raises:
            RuntimeError: if git exits with a non-zero status (git's own error
                message is appended when there is one) or cannot be started.
        """
        command = ["git", "-C", self.repo_path] + args
        try:
            raw = subprocess.check_output(command, stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as err:
            message = f"Error running git command in {self.repo_path}"
            detail = (err.stderr or b"").decode("utf-8", errors="replace").strip()
            if detail:
                message += f": {detail}"
            raise RuntimeError(message) from err
        except OSError as err:
            raise RuntimeError(f"Unable to run git: {err}") from err
        # Histories may contain bytes that are not valid UTF-8; degrade to
        # replacement characters rather than aborting the whole analysis.
        return raw.decode("utf-8", errors="replace")

    def get_commit_data(self, format_string: str) -> List[str]:
        """Return one string per commit, rendered with a git pretty ``format_string``.

        ``--date=default`` is forced so that ``%ad``/``%cd`` do not depend on
        the user's ``log.date`` configuration.
        """
        output = self._run_git_command(
            ["log", "--date=default", f"--pretty=format:{format_string}"]
        )
        # Not splitlines(): a commit with an empty subject must still yield an
        # (empty) entry, and only "\n" separates records.
        return output.split("\n")

    @staticmethod
    def _parse_commit_records(lines: List[str]) -> List[Tuple[str, ...]]:
        """Split ``_LOG_FORMAT`` lines into tuples of fields, skipping malformed lines."""
        expected = len(_LOG_FIELDS)
        records = []
        for line in lines:
            # maxsplit keeps any stray separator inside the last field (the subject).
            fields = tuple(line.split(_FIELD_SEP, expected - 1))
            if len(fields) == expected:
                records.append(fields)
        return records

    @staticmethod
    def _parse_numstat(numstat_output: str) -> Tuple[int, int, int]:
        """Summarise ``git log --numstat`` output.

        Returns:
            ``(additions, deletions, number_of_distinct_paths)``.  Binary files
            (reported by git as ``-``/``-``) count as changed paths but add
            nothing to the line totals.
        """
        additions = deletions = 0
        paths = set()
        for line in numstat_output.split("\n"):
            fields = line.split("\t", 2)
            if len(fields) != 3:
                continue  # blank separator line or not a numstat row
            added, deleted, path = fields
            paths.add(path)
            if added.isdigit() and deleted.isdigit():
                additions += int(added)
                deletions += int(deleted)
        return additions, deletions, len(paths)

    @staticmethod
    def _count_branches(branch_output: str) -> int:
        """Count branches listed by ``git branch -r``.

        Symbolic aliases such as ``origin/HEAD -> origin/main`` are skipped
        because they point at a branch that is listed separately.
        """
        return sum(
            1
            for line in branch_output.split("\n")
            if line.strip() and " -> " not in line
        )

    def analyze(self) -> Dict[str, Any]:
        """Gather all statistics for the repository.

        Returns:
            A dict with the keys ``authors``, ``author_emails``, ``committers``,
            ``committer_emails`` (each mapping a name/address to a commit
            count), ``commit_count``, ``date_range`` (``earliest``/``latest``
            datetimes and ``duration`` timedelta, based on author dates),
            ``file_changes`` (``additions``, ``deletions``, ``files_changed``),
            ``branch_count``, ``commits_per_day`` and ``avg_message_length``.

        Raises:
            RuntimeError: if a git command fails or no commits are found.
            ValueError: if git prints a date in an unexpected layout.
        """
        # One history walk for all per-commit fields instead of one per field.
        records = self._parse_commit_records(self.get_commit_data(_LOG_FORMAT))
        if not records:
            raise RuntimeError(f"No commits found in {self.repo_path}")
        authors, author_emails, committers, committer_emails, dates, messages = zip(*records)
        commit_count = len(records)

        commit_dates = [datetime.strptime(date, _GIT_DATE_FORMAT) for date in dates]
        earliest_commit = min(commit_dates)
        latest_commit = max(commit_dates)
        repo_age = latest_commit - earliest_commit

        additions, deletions, files_changed = self._parse_numstat(
            self._run_git_command(["log", "--numstat", "--format="])
        )
        branch_count = self._count_branches(self._run_git_command(["branch", "-r"]))

        return {
            "authors": dict(collections.Counter(authors)),
            "author_emails": dict(collections.Counter(author_emails)),
            "committers": dict(collections.Counter(committers)),
            "committer_emails": dict(collections.Counter(committer_emails)),
            "commit_count": commit_count,
            "date_range": {"earliest": earliest_commit, "latest": latest_commit, "duration": repo_age},
            "file_changes": {"additions": additions, "deletions": deletions, "files_changed": files_changed},
            "branch_count": branch_count,
            # +1 so that a history confined to a single day counts as one day.
            "commits_per_day": commit_count / (repo_age.days + 1),
            "avg_message_length": sum(len(message) for message in messages) / commit_count,
        }


class GitRepoReporter:
    """Print the statistics produced by ``GitRepoAnalyzer.analyze``."""

    def __init__(self, data: Dict[str, Any]):
        """Store the analysis dict that the ``print_*`` methods read from."""
        self.data = data

    def print_contribution_stats(self):
        """Print contributor and commit totals and their ratio."""
        contributors = len(self.data["authors"])
        commits = self.data["commit_count"]
        average = commits / contributors if contributors else 0.0
        print("\nContribution Statistics:")
        print(f"Total Contributors: {contributors}")
        print(f"Total Commits: {commits}")
        print(f"Average Commits per Contributor: {average:.2f}")

    def print_top_contributors(self, n: int):
        """Print the ``n`` authors with the most commits, highest first."""
        print(f"\nTop {n} Contributors:")
        ranked = sorted(self.data["authors"].items(), key=lambda item: item[1], reverse=True)
        for author, count in ranked[:n]:
            print(f"  {author}: {count} commits")

    def _email_domain_counts(self) -> collections.Counter:
        """Return commits per author e-mail domain.

        Each address contributes its commit count (not 1), domains are
        compared case-insensitively, and addresses without a domain part are
        grouped under ``"(no domain)"``.
        """
        domains = collections.Counter()
        for email, count in self.data["author_emails"].items():
            _, at_sign, domain = email.rpartition("@")
            key = domain.lower() if at_sign and domain else "(no domain)"
            domains[key] += count
        return domains

    def print_email_domain_stats(self):
        """Print the five author e-mail domains with the most commits."""
        print("\nEmail Domain Statistics:")
        for domain, count in self._email_domain_counts().most_common(5):
            print(f"  {domain}: {count} commits")

    def print_time_stats(self):
        """Print repository age, first/latest commit dates and commit rate."""
        date_range = self.data["date_range"]
        print("\nTime Statistics:")
        print(f"Repository Age: {date_range['duration'].days} days")
        print(f"First Commit: {date_range['earliest'].strftime('%Y-%m-%d')}")
        print(f"Latest Commit: {date_range['latest'].strftime('%Y-%m-%d')}")
        print(f"Average Commits per Day: {self.data['commits_per_day']:.2f}")

    def print_code_change_stats(self):
        """Print line and file change totals and the average churn per commit."""
        changes = self.data["file_changes"]
        commits = self.data["commit_count"]
        churn = changes["additions"] + changes["deletions"]
        average = churn / commits if commits else 0.0
        print("\nCode Change Statistics:")
        print(f"Total Lines Added: {changes['additions']}")
        print(f"Total Lines Deleted: {changes['deletions']}")
        print(f"Total Files Changed: {changes['files_changed']}")
        print(f"Average Lines per Commit: {average:.2f}")

    def print_repo_structure(self):
        """Print the number of branches."""
        print("\nRepository Structure:")
        print(f"Number of Branches: {self.data['branch_count']}")

    def print_commit_message_stats(self):
        """Print the average commit subject length."""
        print("\nCommit Message Statistics:")
        print(f"Average Commit Message Length: {self.data['avg_message_length']:.2f} characters")

    def print_author_committer_diff(self):
        """Print every name whose authored and committed counts differ.

        Rows are ordered by the size of the difference (largest first), with
        the name as a tie-breaker so the output is deterministic.
        """
        print("\nDifferences between Authors and Committers:")
        authors = self.data["authors"]
        committers = self.data["committers"]
        differences = [
            (name, authors.get(name, 0), committers.get(name, 0))
            for name in set(authors) | set(committers)
            if authors.get(name, 0) != committers.get(name, 0)
        ]
        if not differences:
            print("  No differences found between authors and committers")
            return
        differences.sort(key=lambda row: (-abs(row[1] - row[2]), row[0]))
        for name, author_count, committer_count in differences:
            print(f"  {name}: Authored {author_count}, Committed {committer_count}")

    def print_summary(self, top_n: int = 5):
        """Print the full report; ``top_n`` limits the top-contributors list."""
        print("Repository Analysis Summary")
        print("===========================")
        self.print_contribution_stats()
        self.print_top_contributors(top_n)
        self.print_email_domain_stats()
        self.print_time_stats()
        self.print_code_change_stats()
        self.print_repo_structure()
        self.print_commit_message_stats()
        self.print_author_committer_diff()


def main() -> int:
    """Parse the command line, analyse the repository and print the report.

    Returns:
        0 on success, 1 when the analysis failed (the error is written to
        standard error).
    """
    parser = argparse.ArgumentParser(description="Analyze Git repository for detailed contribution statistics")
    parser.add_argument("repo_path", nargs="?", default=".", help="Path to the Git repository (default: current directory)")
    parser.add_argument("-n", "--top_contributors", type=int, default=5, help="Number of top contributors to display (default: 5)")
    args = parser.parse_args()
    if args.top_contributors < 0:
        # A negative value would silently slice the ranking from the wrong end.
        parser.error("--top_contributors must not be negative")

    try:
        data = GitRepoAnalyzer(args.repo_path).analyze()
        GitRepoReporter(data).print_summary(args.top_contributors)
    except (ValueError, RuntimeError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:  # top-level boundary: report instead of a traceback
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())