Source code for oats.git.build_git_repo_to_dataset

#!/usr/bin/env python3
"""
Git Repository to Markdown Dataset Builder

This module analyzes a Git repository and generates markdown representations
of each commit's diff, storing them in a pandas DataFrame under the column 'git_diff_md'.
"""

import sys
import argparse
import pandas as pd
from git import Repo
from oats.log import cl

log = cl('build_git_dataset')



[docs]
def build_git_diff_markdown(diff_index) -> str:
    """Convert a Git diff index to a markdown table.

    The table has columns: File, Type (Added/Modified/Deleted/Renamed), and Changes.

    Args:
        diff_index: A GitPython diff index (iterable of diff objects).

    Returns:
        A markdown-formatted table as a string, or empty string if no diffs.
    """
    if not diff_index:
        return ""

    markdown_lines = []
    markdown_lines.append("| File | Type | Changes |")
    markdown_lines.append("|------|------|---------|")

    for diff in diff_index:
        # Get the file names
        old_file = diff.a_path if diff.a_path else "N/A"
        new_file = diff.b_path if diff.b_path else "N/A"

        # Determine type of change
        if diff.deleted_file:
            change_type = "Deleted"
        elif diff.renamed_file:
            change_type = "Renamed"
        elif diff.new_file:
            change_type = "Added"
        else:
            change_type = "Modified"

        # Get changes count
        additions = diff.added_lines if hasattr(diff, "added_lines") else 0
        deletions = diff.removed_lines if hasattr(diff, "removed_lines") else 0
        changes = f"+{additions} -{deletions}" if additions or deletions else "No changes"

        # Format row
        if old_file != new_file and diff.renamed_file:
            file_info = f"{old_file} → {new_file}"
        else:
            file_info = new_file if new_file else old_file

        markdown_lines.append(f"| {file_info} | {change_type} | {changes} |")

    return "\n".join(markdown_lines)




[docs]
def extract_commit_info(commit) -> dict:
    """Extract basic metadata from a Git commit object.

    Args:
        commit: A GitPython commit object.

    Returns:
        Dict with keys: ``commit_hash``, ``author``, ``email``, ``date``, ``message``.
    """
    return {
        "commit_hash": commit.hexsha,
        "author": commit.author.name,
        "email": commit.author.email,
        "date": commit.committed_datetime.isoformat(),
        "message": commit.message.strip(),
    }




[docs]
def build_git_dataset(repo_path: str) -> pd.DataFrame:
    """Build a pandas DataFrame containing git commit diffs in markdown format.

    Iterates over all commits in the repository, extracts metadata, computes
    diffs between consecutive commits, and formats each diff as a markdown table.

    Args:
        repo_path: Path to the Git repository.

    Returns:
        DataFrame with columns: ``commit_hash``, ``author``, ``email``, ``date``,
        ``message``, ``git_diff_md``.

    Raises:
        Exception: If the repository cannot be opened or processed.
    """
    try:
        repo = Repo(repo_path)
        commits = list(repo.iter_commits())

        # Prepare data structures
        commit_data = []

        # Process commits from newest to oldest
        for i, commit in enumerate(commits):
            log.info(f"Processing commit {i + 1}/{len(commits)}: {commit.hexsha[:8]}")

            # Get commit info
            commit_info = extract_commit_info(commit)

            # Get diff for this commit (if not the first commit)
            if i < len(commits) - 1:
                # Get diff between this commit and the previous one
                prev_commit = commits[i + 1]
                diff_index = prev_commit.diff(commit, create_patch=True)
            else:
                # For the first commit, get diff with no parent
                diff_index = commit.diff(None, create_patch=True)

            # Convert diff to markdown
            markdown_diff = build_git_diff_markdown(diff_index)

            # Add to commit data
            commit_info["git_diff_md"] = markdown_diff
            commit_data.append(commit_info)

        # Create DataFrame
        df = pd.DataFrame(commit_data)
        return df

    except Exception as e:
        log.error(f"Error processing Git repository: {str(e)}")
        raise




[docs]
def main() -> pd.DataFrame:
    """CLI entry point — build the git dataset and optionally save to Parquet."""
    parser = argparse.ArgumentParser(description="Build a markdown dataset from Git repository commits")
    parser.add_argument('-r', "--repo-path", default="/opt/ds/oats", help="Path to the Git repository (default: /opt/ds/oats)")
    parser.add_argument('-o', "--output-file", help="Output file path for the pandas DataFrame (optional)")

    args = parser.parse_args()

    try:
        log.info(f"Building Git dataset from repository: {args.repo_path}")
        df = build_git_dataset(args.repo_path)

        if args.output_file:
            df.to_parquet(args.output_file)
            log.info(f"DataFrame saved to: {args.output_file}")

        # Display summary
        log.info(f"Processed {len(df)} commits")
        log.info("Sample of git_diff_md column:")
        for idx, row in df.head().iterrows():
            print(f"\nCommit {row['commit_hash'][:8]} - {row['message'][:50]}...")
            print(row["git_diff_md"][:200] + "..." if len(row["git_diff_md"]) > 200 else row["git_diff_md"])

        return df

    except Exception as e:
        log.error(f"Failed to build Git dataset: {str(e)}")
        sys.exit(1)



# High-level API function compatible with the specified signature

[docs]
def new_api(areq) -> pd.DataFrame:
    """High-level API function for integrating with AgentReq.

    Delegates to :func:`build_git_dataset` using the repo path from the request.

    Args:
        areq: AgentReq object with a ``repo_path`` attribute (defaults to ``.``).

    Returns:
        DataFrame with commit information and markdown diffs.
    """
    # This would integrate with the AgentReq object as needed
    # For now, we'll use the default behavior
    repo_path = getattr(areq, "repo_path", ".")
    return build_git_dataset(repo_path)



if __name__ == "__main__":
    # Run main function if executed directly
    df = main()