Source code for oats.git.git_diff_extractor

#!/usr/bin/env python3

"""
Git Commit Diff Extractor

A tool to extract git diffs from commits in the /opt/ds/oats repository,
store commit metadata and diffs in a pandas DataFrame, and save results to disk.
"""

import os
import sys
import argparse
import pandas as pd
from git import Repo
from datetime import datetime
from typing import List, Dict, Any
from oats.log import cl

try:
    from oats.api_req1 import AgentReq
except ImportError:
    AgentReq = None  # type: ignore

log = cl('git_diff_extractor')



[docs]
def get_git_repo(repo_path: str) -> Repo:
    """Initialize and return a GitPython ``Repo`` object.

    Args:
        repo_path: Filesystem path to the Git repository.

    Returns:
        A ``git.Repo`` instance for the given path.

    Raises:
        ValueError: If the path does not exist or is not a Git repository.
    """
    try:
        if not os.path.exists(repo_path):
            raise ValueError(f"Repository path does not exist: {repo_path}")
        if not os.path.isdir(os.path.join(repo_path, '.git')):
            raise ValueError(f"Path is not a git repository: {repo_path}")
        return Repo(repo_path)
    except Exception as e:
        log.error(f"Failed to initialize git repository at {repo_path}: {str(e)}")
        raise




[docs]
def extract_commit_data(repo: Repo, max_commits: int = None) -> List[Dict[str, Any]]:
    """Extract commit metadata and diffs from the repository.

    Args:
        repo: A ``git.Repo`` instance to extract from.
        max_commits: Optional cap on the number of commits to process.

    Returns:
        List of dicts, each containing commit hash, author info, dates, message, and diff content.
    """

    commits_data = []

    try:
        # Get all commits in reverse chronological order
        commits = list(repo.iter_commits(reverse=True))

        # Limit number of commits if specified
        if max_commits:
            commits = commits[:max_commits]

        for i, commit in enumerate(commits):
            # Extract commit metadata
            commit_data = {'commit_hash': commit.hexsha, 'author_name': commit.author.name, 'author_email': commit.author.email, 'author_date': commit.authored_datetime.isoformat(), 'committer_name': commit.committer.name, 'committer_email': commit.committer.email, 'committer_date': commit.committed_datetime.isoformat(), 'message_summary': commit.summary, 'message_full': commit.message, 'diff_content': '', 'diff_size': 0, 'files_changed': len(commit.stats.files), 'insertions': commit.stats.total['insertions'], 'deletions': commit.stats.total['deletions']}

            # Generate diff content for this commit
            if i == 0:
                # For the first commit, we need to compare against the initial state
                diff_content = ""
                try:
                    # Get the diff for the first commit (compare to initial state)
                    diff = commit.diff(None, create_patch=True)
                    # Convert diff to string format properly
                    if hasattr(diff, '__iter__'):
                        # If diff is iterable, join the patches
                        diff_patches = []
                        for d in diff:
                            if hasattr(d, 'patch'):
                                diff_patches.append(d.patch)
                            else:
                                diff_patches.append(str(d))
                        diff_content = '\n'.join(diff_patches) if diff_patches else ""
                    else:
                        # Single diff object
                        if hasattr(diff, 'patch'):
                            diff_content = diff.patch
                        else:
                            diff_content = str(diff)
                except Exception as e:
                    log.warning(f"Could not generate diff for first commit {commit.hexsha}: {str(e)}")
                    diff_content = "Diff generation failed"
            else:
                # For subsequent commits, get diff against previous commit
                prev_commit = commits[i - 1]
                try:
                    diff = commit.diff(prev_commit, create_patch=True)
                    # Convert diff to string format properly
                    if hasattr(diff, '__iter__'):
                        # If diff is iterable, join the patches
                        diff_patches = []
                        for d in diff:
                            if hasattr(d, 'patch'):
                                diff_patches.append(d.patch)
                            else:
                                diff_patches.append(str(d))
                        diff_content = '\n'.join(diff_patches) if diff_patches else ""
                    else:
                        # Single diff object
                        if hasattr(diff, 'patch'):
                            diff_content = diff.patch
                        else:
                            diff_content = str(diff)
                except Exception as e:
                    log.warning(f"Could not generate diff for commit {commit.hexsha}: {str(e)}")
                    diff_content = "Diff generation failed"

            commit_data['diff_content'] = diff_content
            commit_data['diff_size'] = len(diff_content) if diff_content else 0

            commits_data.append(commit_data)

    except Exception as e:
        log.error(f"Error extracting commit data: {str(e)}")
        raise

    return commits_data




[docs]
def create_dataframe(commits_data: List[Dict[str, Any]]) -> pd.DataFrame:
    """Create a pandas DataFrame from a list of commit data dicts.

    Args:
        commits_data: List of dicts returned by :func:`extract_commit_data`.

    Returns:
        A pandas DataFrame with one row per commit.
    """
    df = pd.DataFrame(commits_data)
    return df




[docs]
def save_dataframe(df: pd.DataFrame, output_file: str) -> None:
    """Save the DataFrame to a CSV file on disk.

    Args:
        df: The pandas DataFrame to persist.
        output_file: Filesystem path for the output CSV.

    Raises:
        Exception: If the file cannot be written.
    """
    try:
        df.to_csv(output_file, index=False)
        log.info(f"Successfully saved DataFrame to {output_file}")
    except Exception as e:
        log.error(f"Failed to save DataFrame to {output_file}: {str(e)}")
        raise




[docs]
def new_api(areq=None, repo_path: str = None) -> Dict[str, Any]:
    """Main API function to extract git diffs from a repository.

    Extracts commit metadata and diffs, builds a DataFrame, and saves it to CSV.

    Args:
        areq: AgentReq instance containing configuration (repo_path, max_commits, output_file).
        repo_path: Direct repo path override (ignored if ``areq`` provides one).

    Returns:
        Dictionary with keys: ``success``, ``message``, ``output_file``, ``commit_count``, ``df``.
    """
    if repo_path is None:
        repo_path = '/opt/ds/oats'
    try:
        # Get repo path from AgentReq (default to /opt/ds/oats)
        repo_path = getattr(areq, 'repo_path', '/opt/ds/oats')
        log.info(f"Starting git diff extraction from {repo_path}")

        # Validate repo path
        if not os.path.exists(repo_path):
            raise ValueError(f"Repository path does not exist: {repo_path}")

        # Initialize git repo
        repo = get_git_repo(repo_path)

        # Extract commit data (limit to 100 commits if not specified)
        max_commits = getattr(areq, 'max_commits', 100)
        commits_data = extract_commit_data(repo, max_commits)

        # Create DataFrame
        df = create_dataframe(commits_data)

        # Save to CSV
        if getattr(areq, 'output_file', None) is not None:
            output_file = getattr(areq, 'output_file')
        else:
            output_file = f'git_diffs_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        save_dataframe(df, output_file)

        # Return result
        result = {'success': True, 'message': f'Successfully extracted {len(df)} commits', 'output_file': output_file, 'commit_count': len(df), 'df': df}

        log.info(result['message'])
        return result

    except Exception as e:
        error_msg = f"Git diff extraction failed: {str(e)}"
        log.error(error_msg)
        return {'success': False, 'message': error_msg}




[docs]
def setup_parser() -> argparse.ArgumentParser:
    """Set up and return the CLI argument parser for the diff extractor.

    Returns:
        Configured ``argparse.ArgumentParser`` instance.
    """
    parser = argparse.ArgumentParser(description='Git Commit Diff Extractor')
    parser.add_argument('-r', '--repo-path', default='/opt/ds/oats', help='Path to git repository (default: /opt/ds/oats)')
    parser.add_argument('-o', '--output-file', default=None, help='Output CSV file path')
    parser.add_argument('-m', '--max-commits', type=int, default=100, help='Maximum number of commits to extract (default: 100)')
    parser.add_argument('-v', '--verbose', action='store_true', help='Enable verbose logging')

    return parser




[docs]
def main() -> int:
    """CLI entry point — parse args, run diff extraction, and return exit code."""
    parser = setup_parser()
    args = parser.parse_args()

    if args.verbose:
        log.info("Verbose mode enabled")

    # Create minimal AgentReq for testing
    agent_req = AgentReq(prompt="Git diff extraction", db_enabled=False, s3_enabled=False, rc_enabled=False, dbui_enabled=False)

    # Set properties from CLI args
    setattr(agent_req, 'repo_path', args.repo_path)
    setattr(agent_req, 'output_file', args.output_file)
    setattr(agent_req, 'max_commits', args.max_commits)

    # Execute the API
    result = new_api(repo_path='/opt/ds/oats')

    if result['success']:
        log.info(f"Extraction completed successfully. Output saved to {result['output_file']}")
        return 0
    else:
        log.error(f"Extraction failed: {result['message']}")
        return 1



if __name__ == "__main__":
    sys.exit(main())