Skip to content

Commit

Permalink
feat(scripts): Add script to calculate additional stats from Git repository
Browse files Browse the repository at this point in the history
  • Loading branch information
kpatryk committed Oct 30, 2024
1 parent db08ff1 commit 1d4abdb
Show file tree
Hide file tree
Showing 2 changed files with 169 additions and 0 deletions.
1 change: 1 addition & 0 deletions config/.gitconfig
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
howmany = "!sh -c 'git log -a --pretty=oneline | wc -l'"
howmanybywhom = shortlog -sn
howmanybyemail = shortlog -sen
howmanybyall = shortlog -sn --group=author --group=committer # both committer and author

prsbyauthor = !git log --merges --pretty=\"%an\" | tr '[:upper:]' '[:lower:]' | tr '.' ' ' | sort |uniq -c | sort -rn

Expand Down
168 changes: 168 additions & 0 deletions scripts/git/git_repo_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
import subprocess
import collections
import os
import argparse
from typing import List, Dict
from datetime import datetime


class GitRepoAnalyzer:
    """Collect contribution statistics from a local Git working tree.

    The analyzer shells out to the ``git`` executable (``git -C <repo_path>``)
    and aggregates author/committer counts, the author-date range, line
    churn, the number of remote-tracking branches and commit-subject lengths.

    Raises:
        ValueError: from the constructor when ``repo_path`` has no ``.git``
            entry.
    """

    # ASCII unit separator: delimits the fields of one ``git log`` record.
    _FIELD_SEP = "\x1f"
    # Author name, author email, committer name, committer email,
    # author date, subject -- all emitted by a single history walk.
    _LOG_FORMAT = "%an%x1f%ae%x1f%cn%x1f%ce%x1f%ad%x1f%s"
    _LOG_FIELD_COUNT = 6
    # Layout of ``git log --date=default`` timestamps.
    _DATE_FORMAT = "%a %b %d %H:%M:%S %Y %z"

    def __init__(self, repo_path: str):
        """Store the absolute repository path and validate it.

        Args:
            repo_path: Path to the root of a Git working tree.

        Raises:
            ValueError: If the path does not contain a ``.git`` entry.
        """
        self.repo_path = os.path.abspath(repo_path)
        if not self._is_valid_git_repo():
            raise ValueError(f"{self.repo_path} is not a valid Git repository")

    def _is_valid_git_repo(self) -> bool:
        """Return True when ``repo_path`` contains a ``.git`` entry.

        ``.git`` is a directory in an ordinary clone but a plain file in
        linked worktrees and submodules, so both forms are accepted.
        """
        return os.path.exists(os.path.join(self.repo_path, ".git"))

    def _run_git_command(self, args: List[str]) -> str:
        """Run ``git -C <repo_path> <args>`` and return its stdout as text.

        Raises:
            RuntimeError: If git exits with a non-zero status or cannot be
                started at all (for example, it is not installed).
        """
        try:
            raw = subprocess.check_output(
                ["git", "-C", self.repo_path] + args,
                stderr=subprocess.DEVNULL,
            )
        except (subprocess.CalledProcessError, OSError) as exc:
            raise RuntimeError(
                f"Error running git command in {self.repo_path}"
            ) from exc
        # Commit metadata is not guaranteed to be valid UTF-8; substitute
        # undecodable bytes instead of aborting the whole analysis.
        return raw.decode("utf-8", errors="replace")

    def get_commit_data(self, format_string: str) -> List[str]:
        """Return one ``git log`` pretty-format value per commit.

        Args:
            format_string: A ``git log --pretty=format:`` placeholder string
                such as ``"%an"``.

        Returns:
            The command output split on newlines.
        """
        output = self._run_git_command(
            ["log", f"--pretty=format:{format_string}"]
        )
        return output.split("\n")

    @classmethod
    def _parse_commit_log(cls, log_output: str) -> List[List[str]]:
        """Split ``_LOG_FORMAT`` output into per-commit field lists.

        Lines that do not carry all expected fields (such as blank lines)
        are skipped. The subject is the last field, so a separator byte
        inside a subject is kept as part of that subject.
        """
        records = []
        for line in log_output.split("\n"):
            fields = line.split(cls._FIELD_SEP, cls._LOG_FIELD_COUNT - 1)
            if len(fields) == cls._LOG_FIELD_COUNT:
                records.append(fields)
        return records

    @staticmethod
    def _parse_numstat(numstat_output: str) -> tuple:
        """Sum ``git log --numstat`` output.

        Returns:
            ``(additions, deletions, distinct_file_count)``. Binary files
            report ``-`` for both line counts; they add nothing to the
            line totals but still count as changed files.
        """
        additions, deletions = 0, 0
        files_changed = set()
        for line in numstat_output.split("\n"):
            if not line.strip():
                continue
            # maxsplit keeps the path intact and avoids an unpacking error
            # on any line that does not have the three expected columns.
            parts = line.split("\t", 2)
            if len(parts) != 3:
                continue
            added, deleted, filename = parts
            if added != "-" and deleted != "-":
                additions += int(added)
                deletions += int(deleted)
            files_changed.add(filename)
        return additions, deletions, len(files_changed)

    @staticmethod
    def _count_remote_branches(branch_output: str) -> int:
        """Count the entries listed by ``git branch -r``.

        Symbolic-ref lines such as ``origin/HEAD -> origin/main`` are
        aliases of another listed branch and are not counted.
        """
        return sum(
            1
            for line in branch_output.split("\n")
            if line.strip() and " -> " not in line
        )

    def analyze(self) -> Dict[str, Dict]:
        """Gather all statistics for the repository.

        Returns:
            A dictionary with the keys ``authors``, ``author_emails``,
            ``committers``, ``committer_emails`` (name/email -> commit
            count), ``commit_count``, ``date_range`` (``earliest``,
            ``latest``, ``duration``), ``file_changes`` (``additions``,
            ``deletions``, ``files_changed``), ``branch_count``,
            ``commits_per_day`` and ``avg_message_length``.

        Raises:
            RuntimeError: If a git command fails or the log has no commits.
            ValueError: If an author date cannot be parsed.
        """
        # One history walk keeps every field aligned per commit.
        # ``--date=default`` overrides any ``log.date`` setting so ``%ad``
        # always matches ``_DATE_FORMAT``.
        log_output = self._run_git_command(
            ["log", "--date=default", f"--pretty=format:{self._LOG_FORMAT}"]
        )
        commits = self._parse_commit_log(log_output)
        if not commits:
            raise RuntimeError(f"No commits found in {self.repo_path}")

        (
            authors,
            author_emails,
            committers,
            committer_emails,
            dates,
            messages,
        ) = zip(*commits)
        commit_count = len(commits)

        author_counts = collections.Counter(authors)
        committer_counts = collections.Counter(committers)
        author_email_counts = collections.Counter(author_emails)
        committer_email_counts = collections.Counter(committer_emails)

        date_objects = [
            datetime.strptime(date, self._DATE_FORMAT) for date in dates
        ]
        earliest_commit = min(date_objects)
        latest_commit = max(date_objects)

        additions, deletions, files_changed = self._parse_numstat(
            self._run_git_command(["log", "--numstat", "--format="])
        )

        branch_count = self._count_remote_branches(
            self._run_git_command(["branch", "-r"])
        )

        repo_age = latest_commit - earliest_commit
        # +1 so that a history contained in a single day does not divide
        # by zero.
        commits_per_day = commit_count / (repo_age.days + 1)

        avg_message_length = sum(len(msg) for msg in messages) / commit_count

        return {
            "authors": dict(author_counts),
            "author_emails": dict(author_email_counts),
            "committers": dict(committer_counts),
            "committer_emails": dict(committer_email_counts),
            "commit_count": commit_count,
            "date_range": {
                "earliest": earliest_commit,
                "latest": latest_commit,
                "duration": repo_age,
            },
            "file_changes": {
                "additions": additions,
                "deletions": deletions,
                "files_changed": files_changed,
            },
            "branch_count": branch_count,
            "commits_per_day": commits_per_day,
            "avg_message_length": avg_message_length,
        }


class GitRepoReporter:
    """Print a plain-text report of repository statistics to stdout.

    Args:
        data: Statistics dictionary with the keys ``authors``,
            ``author_emails``, ``committers``, ``commit_count``,
            ``date_range``, ``file_changes``, ``branch_count``,
            ``commits_per_day`` and ``avg_message_length`` that the
            ``print_*`` methods read.
    """

    def __init__(self, data: Dict[str, Dict]):
        self.data = data

    def print_contribution_stats(self):
        """Print contributor and commit totals and their ratio."""
        contributor_count = len(self.data["authors"])
        commit_count = self.data["commit_count"]
        # Guard the ratio so an empty author table reports 0 instead of
        # raising ZeroDivisionError.
        average = commit_count / contributor_count if contributor_count else 0.0
        print("\nContribution Statistics:")
        print(f"Total Contributors: {contributor_count}")
        print(f"Total Commits: {commit_count}")
        print(f"Average Commits per Contributor: {average:.2f}")

    def print_top_contributors(self, n: int):
        """Print the ``n`` authors with the most commits, highest first."""
        print(f"\nTop {n} Contributors:")
        ranked = sorted(
            self.data["authors"].items(), key=lambda item: item[1], reverse=True
        )
        for author, count in ranked[:n]:
            print(f"  {author}: {count} commits")

    def print_email_domain_stats(self):
        """Print the five author-email domains with the most commits.

        Each address contributes its commit count (not just 1) to its
        domain, so the printed figure really is a number of commits.
        Domains are compared case-insensitively; addresses without a
        domain part are grouped under ``(unknown)``.
        """
        print("\nEmail Domain Statistics:")
        domain_commits = collections.Counter()
        for email, count in self.data["author_emails"].items():
            _, at_sign, domain = email.rpartition("@")
            key = domain.lower() if at_sign and domain else "(unknown)"
            domain_commits[key] += count
        for domain, count in domain_commits.most_common(5):
            print(f"  {domain}: {count} commits")

    def print_time_stats(self):
        """Print the repository age, first/last commit dates and commit rate."""
        date_range = self.data["date_range"]
        print("\nTime Statistics:")
        print(f"Repository Age: {date_range['duration'].days} days")
        print(f"First Commit: {date_range['earliest'].strftime('%Y-%m-%d')}")
        print(f"Latest Commit: {date_range['latest'].strftime('%Y-%m-%d')}")
        print(f"Average Commits per Day: {self.data['commits_per_day']:.2f}")

    def print_code_change_stats(self):
        """Print line and file churn totals and the churn per commit."""
        changes = self.data["file_changes"]
        commit_count = self.data["commit_count"]
        churn = changes["additions"] + changes["deletions"]
        # Guard the ratio so a zero commit count reports 0 instead of
        # raising ZeroDivisionError.
        lines_per_commit = churn / commit_count if commit_count else 0.0
        print("\nCode Change Statistics:")
        print(f"Total Lines Added: {changes['additions']}")
        print(f"Total Lines Deleted: {changes['deletions']}")
        print(f"Total Files Changed: {changes['files_changed']}")
        print(f"Average Lines per Commit: {lines_per_commit:.2f}")

    def print_repo_structure(self):
        """Print the branch count."""
        print("\nRepository Structure:")
        print(f"Number of Branches: {self.data['branch_count']}")

    def print_commit_message_stats(self):
        """Print the average commit message length."""
        print("\nCommit Message Statistics:")
        print(
            f"Average Commit Message Length: "
            f"{self.data['avg_message_length']:.2f} characters"
        )

    def print_author_committer_diff(self):
        """Print every name whose authored and committed counts differ.

        Entries are ordered by the size of the difference, largest first.
        """
        print("\nDifferences between Authors and Committers:")
        authors = self.data["authors"]
        committers = self.data["committers"]
        differences = [
            (name, authors.get(name, 0), committers.get(name, 0))
            for name in set(authors) | set(committers)
            if authors.get(name, 0) != committers.get(name, 0)
        ]
        if not differences:
            print("  No differences found between authors and committers")
            return
        differences.sort(key=lambda row: abs(row[1] - row[2]), reverse=True)
        for name, author_count, committer_count in differences:
            print(f"  {name}: Authored {author_count}, Committed {committer_count}")

    def print_summary(self, top_n: int = 5):
        """Print every report section in order.

        Args:
            top_n: Number of top contributors to list.
        """
        print("Repository Analysis Summary")
        print("===========================")
        self.print_contribution_stats()
        self.print_top_contributors(top_n)
        self.print_email_domain_stats()
        self.print_time_stats()
        self.print_code_change_stats()
        self.print_repo_structure()
        self.print_commit_message_stats()
        self.print_author_committer_diff()


def main() -> int:
    """Parse the command line, analyze the repository and print the report.

    Returns:
        0 on success, 1 when the analysis fails. Failure messages are
        written to stderr so they do not mix with report output.
    """
    # Local import: only the error paths need stderr.
    import sys

    parser = argparse.ArgumentParser(
        description="Analyze Git repository for detailed contribution statistics"
    )
    parser.add_argument(
        "repo_path",
        nargs="?",
        default=".",
        help="Path to the Git repository (default: current directory)",
    )
    parser.add_argument(
        "-n",
        "--top_contributors",
        type=int,
        default=5,
        help="Number of top contributors to display (default: 5)",
    )
    args = parser.parse_args()
    # A negative count would be used as a slice bound and silently drop
    # entries from the end of the ranking instead of limiting it.
    if args.top_contributors < 0:
        parser.error("--top_contributors must be a non-negative integer")

    try:
        analyzer = GitRepoAnalyzer(args.repo_path)
        data = analyzer.analyze()
        reporter = GitRepoReporter(data)
        reporter.print_summary(args.top_contributors)
    except (ValueError, RuntimeError) as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report anything unforeseen without a traceback.
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())

0 comments on commit 1d4abdb

Please sign in to comment.