#!/usr/bin/env python3
"""
Git Commit Diff Extractor
A tool to extract git diffs from commits in the /opt/ds/oats repository,
store commit metadata and diffs in a pandas DataFrame, and save results to disk.
"""
import os
import sys
import argparse
import pandas as pd
from git import Repo
from datetime import datetime
from typing import List, Dict, Any
from oats.log import cl
try:
from oats.api_req1 import AgentReq
except ImportError:
AgentReq = None # type: ignore
log = cl('git_diff_extractor')
[docs]
def get_git_repo(repo_path: str) -> Repo:
    """Initialize and return a GitPython ``Repo`` object.

    The path is checked before it is handed to GitPython. Any kind of
    ``.git`` entry is accepted: a directory (ordinary checkout) or a regular
    "gitfile" that points at the real git directory, as used by linked
    worktrees and submodules.

    Args:
        repo_path: Filesystem path to the Git repository.

    Returns:
        A ``git.Repo`` instance for the given path.

    Raises:
        ValueError: If the path does not exist or contains no ``.git`` entry.
        Exception: Any error raised by ``Repo`` itself is logged and
            re-raised unchanged.
    """
    try:
        if not os.path.exists(repo_path):
            raise ValueError(f"Repository path does not exist: {repo_path}")
        # exists() rather than isdir(): worktrees and submodules keep a
        # ".git" *file*, which an isdir() check would wrongly reject.
        if not os.path.exists(os.path.join(repo_path, '.git')):
            raise ValueError(f"Path is not a git repository: {repo_path}")
        return Repo(repo_path)
    except Exception as e:
        # Log every failure (validation or GitPython) once, then let the
        # caller decide how to handle it.
        log.error(f"Failed to initialize git repository at {repo_path}: {str(e)}")
        raise
[docs]
def create_dataframe(commits_data: List[Dict[str, Any]]) -> pd.DataFrame:
    """Build a pandas DataFrame from per-commit dictionaries.

    Args:
        commits_data: List of dicts, one per commit; each dict becomes a row
            and its keys become columns.

    Returns:
        The DataFrame constructed directly from ``commits_data``.
    """
    return pd.DataFrame(commits_data)
[docs]
def save_dataframe(df: pd.DataFrame, output_file: str) -> None:
    """Write ``df`` to ``output_file`` as CSV, omitting the index column.

    Args:
        df: The pandas DataFrame to persist.
        output_file: Filesystem path for the output CSV.

    Raises:
        Exception: Whatever error occurred while writing; it is logged and
            then re-raised unchanged.
    """
    try:
        df.to_csv(path_or_buf=output_file, index=False)
        success_note = f"Successfully saved DataFrame to {output_file}"
        log.info(success_note)
    except Exception as write_error:
        failure_note = f"Failed to save DataFrame to {output_file}: {str(write_error)}"
        log.error(failure_note)
        raise
[docs]
def new_api(areq=None, repo_path: str = None) -> Dict[str, Any]:
    """Main API function to extract git diffs from a repository.

    Extracts commit metadata and diffs, builds a DataFrame, and saves it to CSV.

    The repository path is resolved in this order:

    1. ``areq.repo_path`` when ``areq`` provides a non-``None`` value,
    2. the ``repo_path`` argument,
    3. the default ``/opt/ds/oats``.

    Args:
        areq: Optional request object. The attributes ``repo_path``,
            ``max_commits`` (default 100) and ``output_file`` (default: a
            timestamped ``git_diffs_*.csv`` name) are read via ``getattr``,
            so any object, or ``None``, is accepted.
        repo_path: Direct repo path override (ignored if ``areq`` provides one).

    Returns:
        Dictionary with keys ``success``, ``message``, ``output_file``,
        ``commit_count`` and ``df``. On failure ``success`` is ``False``,
        ``message`` describes the error, ``output_file`` and ``df`` are
        ``None`` and ``commit_count`` is ``0``. Errors are reported through
        the result and are not raised.
    """
    try:
        # Resolve the repo path without clobbering the explicit argument when
        # areq is absent or carries no repo_path.
        areq_repo_path = getattr(areq, 'repo_path', None)
        if areq_repo_path is not None:
            repo_path = areq_repo_path
        elif repo_path is None:
            repo_path = '/opt/ds/oats'
        log.info(f"Starting git diff extraction from {repo_path}")

        # Fail early with a clear message before touching GitPython.
        if not os.path.exists(repo_path):
            raise ValueError(f"Repository path does not exist: {repo_path}")
        repo = get_git_repo(repo_path)

        # Limit to 100 commits unless the request says otherwise.
        max_commits = getattr(areq, 'max_commits', 100)
        # NOTE(review): extract_commit_data is not defined in the visible part
        # of this module; confirm it is defined or imported elsewhere.
        commits_data = extract_commit_data(repo, max_commits)
        df = create_dataframe(commits_data)

        # Use the requested output path, or fall back to a timestamped file
        # name in the current working directory.
        output_file = getattr(areq, 'output_file', None)
        if output_file is None:
            output_file = f'git_diffs_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        save_dataframe(df, output_file)

        result = {
            'success': True,
            'message': f'Successfully extracted {len(df)} commits',
            'output_file': output_file,
            'commit_count': len(df),
            'df': df,
        }
        log.info(result['message'])
        return result
    except Exception as e:
        # API boundary: report the failure in the result instead of raising,
        # keeping the same key set as the success result.
        error_msg = f"Git diff extraction failed: {str(e)}"
        log.error(error_msg)
        return {
            'success': False,
            'message': error_msg,
            'output_file': None,
            'commit_count': 0,
            'df': None,
        }
[docs]
def setup_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the diff extractor.

    Options are declared in a table and registered in order, so the help
    output lists them as: repo path, output file, max commits, verbose.

    Returns:
        Configured ``argparse.ArgumentParser`` instance.
    """
    option_table = (
        (('-r', '--repo-path'),
         {'default': '/opt/ds/oats', 'help': 'Path to git repository (default: /opt/ds/oats)'}),
        (('-o', '--output-file'),
         {'default': None, 'help': 'Output CSV file path'}),
        (('-m', '--max-commits'),
         {'type': int, 'default': 100, 'help': 'Maximum number of commits to extract (default: 100)'}),
        (('-v', '--verbose'),
         {'action': 'store_true', 'help': 'Enable verbose logging'}),
    )
    cli = argparse.ArgumentParser(description='Git Commit Diff Extractor')
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli
[docs]
def main() -> int:
    """CLI entry point — parse args, run diff extraction, and return exit code.

    Returns:
        ``0`` when the extraction succeeded, ``1`` otherwise.
    """
    parser = setup_parser()
    args = parser.parse_args()
    if args.verbose:
        # NOTE(review): this only announces verbose mode; the log level is
        # not changed here.
        log.info("Verbose mode enabled")

    if AgentReq is not None:
        # Create minimal AgentReq for testing
        agent_req = AgentReq(prompt="Git diff extraction", db_enabled=False, s3_enabled=False, rc_enabled=False, dbui_enabled=False)
    else:
        # oats.api_req1 could not be imported. new_api only reads attributes
        # via getattr, so a plain namespace is a sufficient stand-in.
        agent_req = argparse.Namespace()

    # Carry the CLI options over to the request object.
    agent_req.repo_path = args.repo_path
    agent_req.output_file = args.output_file
    agent_req.max_commits = args.max_commits

    # Pass the request through so the CLI options are actually honoured,
    # rather than always running against the hard-coded default repository.
    result = new_api(areq=agent_req, repo_path=args.repo_path)
    if result['success']:
        log.info(f"Extraction completed successfully. Output saved to {result['output_file']}")
        return 0
    log.error(f"Extraction failed: {result['message']}")
    return 1
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())