Source code for oats.git.git_to_df_converter

#!/usr/bin/env python3
"""
Git Commit to Pandas DataFrame Converter

Extracts Git commit history and converts it into a structured pandas DataFrame,
sorted by commit date in descending order. Can save the result as a Parquet file.
"""

import argparse
import os
import sys
import pandas as pd
from git import Repo
from oats.log import cl

log = cl("git_to_df")


def extract_git_commits(repo_path: str) -> pd.DataFrame:
    """Extract Git commit history from a repository as a pandas DataFrame.

    Walks every commit yielded by ``Repo.iter_commits()`` and records its SHA,
    author name and e-mail, commit date, stripped message and parent hashes.
    The rows are sorted by ``commit_date`` in descending order and the index
    is reset to a plain 0..n-1 range.

    Args:
        repo_path: Path to the Git repository.

    Returns:
        DataFrame with columns: ``id``, ``author_name``, ``author_email``,
        ``commit_date``, ``message``, ``parents`` (always present, in this
        order, even when no commits were collected).

    Raises:
        Exception: Any error raised while opening or reading the repository
            is logged and re-raised unchanged.
    """
    # Fixed schema: guarantees the documented columns exist even for an empty
    # record list, so the sort below cannot fail with a KeyError.
    columns = [
        "id",
        "author_name",
        "author_email",
        "commit_date",
        "message",
        "parents",
    ]

    try:
        # Use the repository as a context manager so the handle (and the
        # helper git processes GitPython keeps open) is released on exit,
        # including when iteration raises.
        with Repo(repo_path) as repo:
            # Materialise every field inside the ``with`` block: commit
            # attributes are read from the repository while it is still open.
            commits_data = [
                {
                    "id": commit.hexsha,  # SHA1 commit ID
                    "author_name": commit.author.name,
                    "author_email": commit.author.email,
                    "commit_date": commit.committed_datetime,
                    "message": commit.message.strip(),
                    "parents": [parent.hexsha for parent in commit.parents],
                }
                for commit in repo.iter_commits()
            ]

        df = pd.DataFrame(commits_data, columns=columns)

        # Newest commits first, then a clean positional index.
        df = df.sort_values("commit_date", ascending=False)
        df = df.reset_index(drop=True)

        log.info(f"Successfully extracted {len(df)} commits from repository")
        return df

    except Exception as e:
        log.error(f"Error extracting Git commits: {str(e)}")
        raise
def save_dataframe_to_parquet(df: pd.DataFrame, output_path: str) -> None:
    """Save a pandas DataFrame to a Parquet file.

    For local paths, the parent directory is created when the path has one.
    Paths that start with ``s3://`` are passed straight to
    ``DataFrame.to_parquet`` without any directory handling.

    Args:
        df: The DataFrame to persist.
        output_path: Local filesystem path or S3 URI for the Parquet file.

    Raises:
        Exception: Any error raised while creating the directory or writing
            the file is logged and re-raised unchanged.
    """
    try:
        # An S3 URI is identified by its scheme prefix, not by the substring
        # appearing anywhere in the path.
        if not output_path.startswith("s3://"):
            parent_dir = os.path.dirname(output_path)
            # A bare filename such as "commits.parquet" has an empty dirname,
            # and os.makedirs("") raises FileNotFoundError; the current
            # working directory already exists, so there is nothing to create.
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        # The index is a plain range after extraction, so it is not persisted.
        df.to_parquet(output_path, index=False)
        log.info(f"DataFrame saved to {output_path}")

    except Exception as e:
        log.error(f"Error saving Parquet file: {str(e)}")
        raise
def main() -> None:
    """Run the command-line interface.

    Parses the required ``--repo-path`` and ``--output-file`` options, checks
    that the repository path exists, extracts the commit history and writes it
    to the requested Parquet file. Any failure in those steps is logged and
    the process exits with status 1.
    """
    cli = argparse.ArgumentParser(
        prog="git-to-pandas",
        description="Convert Git commit history to pandas DataFrame",
    )
    cli.add_argument(
        "-r",
        "--repo-path",
        required=True,
        help="Path to the Git repository",
    )
    cli.add_argument(
        "-o",
        "--output-file",
        required=True,
        help="Output Parquet file path",
    )
    options = cli.parse_args()

    try:
        repo_location = options.repo_path

        # Fail early with a clear message when the path is missing.
        if not os.path.exists(repo_location):
            raise ValueError(f"Repository path does not exist: {repo_location}")

        commits = extract_git_commits(repo_location)
        save_dataframe_to_parquet(commits, options.output_file)

        log.info("Git commit to DataFrame conversion completed successfully")
    except Exception as exc:
        log.error(f"Failed to convert Git commits to DataFrame: {str(exc)}")
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()