Source code for oats.models

"""Data models for the OAT (Open Agent Tools) system."""
import heapq
import os
import re
import ujson as json
from typing import List, Tuple
from pydantic import BaseModel
from rank_bm25 import BM25Okapi
from oats.pp import pp
from oats.log import gl

log = gl('oats.models')

class OatPromptChoices(BaseModel):
    """Result container handed back by the OAT index lookup methods.

    Every field has a default, so a freshly constructed instance represents
    "no match" (``status`` is False and all collections are empty).
    """

    # True once at least one source file was matched for the prompt.
    status: bool = False

    # Matched action names, the prompt/description strings stored for them,
    # and the source files that provide them.
    actions: list[str] = []
    prompts: list[str] = []
    src_files: list[str] = []

    # Slots for partial-match results, parallel to the three lists above.
    partial_actions: list[str] = []
    partial_prompts: list[str] = []
    partial_src_files: list[str] = []

    # Paths of the index files that were consulted.
    index_files: list[str] = []

    # Free-form detail payload describing the lookup.
    tool_data: dict = {}

    # Version tag for this result layout.
    version: str = '9'
class OatConfig(BaseModel):
    """Configuration and index data for OAT tool resolution.

    On construction the JSON index named by ``repo_uses_index`` is loaded
    (when the file exists) and flattened into lookup tables keyed by action
    name.  The index is expected to map ``source file -> {action: prompt}``.
    """

    # Path of the JSON index; overridable via CODER_TOOL_USES_INDEX.
    repo_uses_index: str = os.getenv("CODER_TOOL_USES_INDEX", "./.ai/AGENT.repo_uses.python.tools.json")
    # Raw index content: {src_file: {action: prompt}}.
    repo_uses_data: dict = {}
    # Flat lists of every action / prompt seen in the index (with duplicates).
    repo_uses_actions: list = []
    repo_uses_prompts: list = []
    repo_uses_src_files: list = []
    # action -> [src_file, ...] and action -> [prompt, ...].
    repo_uses_action_to_src_dict: dict = {}
    repo_uses_prompt_dict: dict = {}
    best_tools: list[dict] = []
    best_impls: dict = {}

    def __init__(self, **kwargs):
        """Load the OAT index file and build action/prompt lookup dictionaries.

        Args:
            **kwargs: Field overrides forwarded to ``BaseModel.__init__``.
        """
        super().__init__(**kwargs)
        # NOTE(review): best_tools is declared as list[dict] but has always
        # been reset to a dict here and in get_prompt_choices(); kept as-is
        # for backward compatibility -- confirm the intended type.
        self.best_tools = {}
        self.best_impls = {}

        if os.path.exists(self.repo_uses_index):
            # Explicit encoding: JSON is UTF-8, the platform default may not be.
            with open(self.repo_uses_index, 'r', encoding='utf-8') as f:
                self.repo_uses_data = json.loads(f.read())

        for src_file, uses in self.repo_uses_data.items():
            for action_str, src_prompt in uses.items():
                self.repo_uses_actions.append(action_str)
                self.repo_uses_prompts.append(src_prompt)
                self.repo_uses_action_to_src_dict.setdefault(action_str, []).append(src_file)
                self.repo_uses_prompt_dict.setdefault(action_str, []).append(src_prompt)

    def get_prompt_choices(self, prompt: str, verbose: bool = False) -> OatPromptChoices:
        """Find tool matches for a prompt using exact and partial string matching.

        The first one to four words of ``prompt`` are joined with underscores
        (longest prefix first) and looked up as exact action names.  When
        fewer than two exact matches are found, a second pass accepts any
        indexed action that contains one of those prefixes as a substring,
        ignoring overly generic single words.

        Args:
            prompt: Free-text request, e.g. ``"post message to channel"``.
            verbose: Log lookup misses and print a report of the results.

        Returns:
            An ``OatPromptChoices`` whose ``status`` is True when at least one
            source file matched.  ``actions``, ``prompts`` and ``src_files``
            are capped at 10 entries; ``tool_data`` carries the full results.
        """
        choices = OatPromptChoices()

        # split() (not split(' ')) so repeated or leading spaces cannot
        # produce empty words, and so an all-blank prompt is detected.
        words = prompt.split()
        if not words:
            log.error(f'no_words_found_in_prompt_to_check: {prompt}')
            return choices

        # Candidate keys, longest prefix first: "a_b_c_d", "a_b_c", "a_b", "a".
        all_prompts = [
            '_'.join(words[:size]).lower()
            for size in range(min(len(words), 4), 0, -1)
        ]

        seen_actions = set()
        src_files = []
        best_files = []
        best_uses = {}
        tool_results = []
        use_prompts = []
        actions = []
        self.best_tools = {}
        self.best_impls = {}

        # Pass 1: candidate key is exactly an indexed action name.
        for check_prompt in all_prompts:
            if check_prompt not in self.repo_uses_action_to_src_dict:
                if verbose:
                    log.error(f'missing_action_to_src_dict: {check_prompt[0:32]}')
                continue
            if check_prompt not in self.repo_uses_prompt_dict:
                log.error(f'missing_prompt_dict_check_prompt: {check_prompt[0:32]}')
                continue
            seen_actions.add(check_prompt)
            actions.append(check_prompt)
            use_prompts.extend(self.repo_uses_prompt_dict[check_prompt])
            for src_file in self.repo_uses_action_to_src_dict[check_prompt]:
                src_files.append(src_file)
                if src_file not in best_uses:
                    best_files.append(src_file)
                    best_uses[src_file] = self.repo_uses_data[src_file]
                tool_results.append({
                    "file": src_file,
                    "func": check_prompt,
                    "description": best_uses[src_file][check_prompt],
                    # Rank-based score: each later result is 0.1 lower.
                    "score": float(f'{1.0 - (0.1 * len(tool_results)):0.2f}'),
                    "retrieval_score": 1.0,
                })

        # Pass 2: substring matching, only when pass 1 found < 2 actions.
        if len(actions) < 2:
            # Words too generic to be useful as a substring match.  Every
            # entry needs its own comma: adjacent string literals are
            # concatenated by Python and would silently drop both words.
            # NOTE(review): 'prune', 'retrieve', 'wonder', 'estimate', 'find',
            # 'search' and 'query' were commented out upstream and remain
            # disabled; confirm 'think', 'walk', 'check' are meant to be active.
            skip_words = frozenset((
                'build', 'create', 'review', 'get', 'delete',
                'think', 'walk', 'check',
                'read', 'view', 'gets', 'assemble', 'compile', 'built',
                'to', 'the', 'it', 'a',
                'inspect', 'curate', 'analyze', 'process', 'handle',
                'examine', 'test', 'text', 'name', 'user', 'person',
                'api', 'helper', 'list', 'select', 'take', 'took',
                'red', 'blue', 'green', 'yellow', 'orange',
            ))
            for action in self.repo_uses_action_to_src_dict:
                for check_prompt in all_prompts:
                    if check_prompt in skip_words:
                        continue
                    if check_prompt not in action:
                        continue
                    if action not in self.repo_uses_prompt_dict:
                        log.error(f'missing_partial_prompt_dict_check_prompt: {action[0:32]}')
                        continue
                    if action in seen_actions:
                        # Already reported (exactly or via another prefix).
                        continue
                    seen_actions.add(action)

                    check_value = self.repo_uses_prompt_dict[action]
                    if verbose:
                        log.info(f'# Source action: {action} check_prompt:\n```\n{check_prompt}\n```\ncheck_value\n```\n{check_value}\n```\n')
                    # NOTE(review): records the prompt fragment, not the
                    # matched action name; kept for compatibility -- confirm.
                    actions.append(check_prompt)

                    for src_file in self.repo_uses_action_to_src_dict[action]:
                        src_files.append(src_file)
                        if src_file not in best_uses:
                            best_files.append(src_file)
                            best_uses[src_file] = self.repo_uses_data[src_file]
                        file_uses = best_uses[src_file]
                        # First action in this file containing the fragment;
                        # fall back to the file's first action otherwise.
                        match = next(
                            (item for item in file_uses.items() if check_prompt in item[0]),
                            None,
                        )
                        if match is None:
                            match = next(iter(file_uses.items()))
                        use_func, use_desc = match
                        log.debug('-----\nstart:\n')
                        tool_results.append({
                            "file": src_file,
                            "func": use_func,
                            "description": use_desc,
                            "score": float(f'{1.0 - (0.1 * len(tool_results)):0.2f}'),
                            "retrieval_score": 1.0,
                        })

                    use_prompts.extend(self.repo_uses_prompt_dict[action])

        choices.status = len(src_files) > 0
        choices.actions = actions[0:10]
        choices.prompts = use_prompts[0:10]
        choices.src_files = src_files[0:10]
        choices.index_files.append(self.repo_uses_index)

        # NOTE(review): the label says 'bm25' although this method does plain
        # string matching; left unchanged because consumers may read it.
        model = 'bm25'
        tool_data = {
            "query": prompt,
            "model": model,
            "reranked": False,
            "best_files": best_files,
            "best_uses": best_uses,
            "results": tool_results,
        }
        if verbose:
            log.info('# Prompt Choice Report')
            log.debug('best_uses:')
            print(pp(best_uses))
            log.debug('tool_results:')
            print(pp(tool_results))
        choices.tool_data = tool_data
        return choices

    def get_best_matches_bm25(self, prompt: str, top_k: int = 5, verbose: bool = False) -> OatPromptChoices:
        """Use BM25 to find the best matching actions for a given prompt.

        Compares the prompt against each key in repo_uses_action_to_src_dict
        and returns the top_k best matches sorted by BM25 score.

        Args:
            prompt: Free-text request to match against action names.
            top_k: Maximum number of actions to return.
            verbose: Log when the index is empty or nothing matches.

        Returns:
            An ``OatPromptChoices``; ``status`` is True when at least one
            action scored above zero and has a source file.  ``prompts`` and
            ``src_files`` are de-duplicated and capped at 10 entries.
        """
        choices = OatPromptChoices()

        if not self.repo_uses_action_to_src_dict:
            if verbose:
                log.error('repo_uses_action_to_src_dict is empty — nothing to match against')
            return choices

        def tokenize(text: str) -> list[str]:
            # Split on underscores and whitespace so "post_message_to_channel"
            # and "post message to channel" match.  re.split() yields '' for
            # leading/trailing separators and for empty input, so empty
            # tokens are dropped.
            return [tok for tok in re.split(r'[\s_]+', text.lower()) if tok]

        # Corpus: one document per action key.
        actions_list = list(self.repo_uses_action_to_src_dict)
        tokenized_corpus = [tokenize(action) for action in actions_list]
        tokenized_query = tokenize(prompt)
        if not tokenized_query:
            log.error(f'no tokens in prompt for bm25: {prompt}')
            return choices

        bm25 = BM25Okapi(tokenized_corpus)
        scores = bm25.get_scores(tokenized_query)

        # Indices of the top_k highest scores, best first, zero scores dropped.
        top_indices = [
            i
            for i in heapq.nlargest(top_k, range(len(scores)), key=lambda i: scores[i])
            if scores[i] > 0
        ]
        if not top_indices:
            if verbose:
                log.info(f'### Sorry!! no bm25 matches found for prompt: {prompt[:64]}')
            return choices

        src_files: list[str] = []
        use_prompts: list[str] = []
        for idx in top_indices:
            action = actions_list[idx]
            choices.actions.append(action)
            for src_file in self.repo_uses_action_to_src_dict[action]:
                if src_file not in src_files:
                    src_files.append(src_file)
            for full_prompt in self.repo_uses_prompt_dict.get(action, ()):
                if full_prompt not in use_prompts:
                    use_prompts.append(full_prompt)

        choices.status = len(src_files) > 0
        choices.prompts = use_prompts[0:10]
        choices.src_files = src_files[0:10]
        choices.index_files.append(self.repo_uses_index)
        return choices