Source code for github_search_engine.github_search_engine

  1import inspect
  2import logging
  3import sys
  4
  5import chevron
  6import onnxruntime
  7from githubkit.versions.v2022_11_28.models import Issue
  8from qdrant_client import QdrantClient
  9from qdrant_client.http.models import QueryResponse
 10
 11from github_search_engine.clients.github_client_manager import (
 12  GithubClientManager,
 13)
 14from github_search_engine.clients.ollama_client_manager import (
 15  OllamaClientManager,
 16)
 17
 18
class GithubSearchEngine:
  """Index the issues of a GitHub repository in Qdrant, then search and summarise them."""

  def __init__(
    self,
    github_access_token: str,
    qdrant_location: str | None = None,
    qdrant_path: str | None = None,
  ):
    """A GithubSearchEngine to search GitHub repositories.

    Initializes a client manager for GitHub, Qdrant, and Ollama services, setting up
    logging and database model configuration.

    Args:
      github_access_token: The GitHub access token for accessing GitHub API.
      qdrant_location: The location of the Qdrant server. Default is None.
      qdrant_path: The path to the Qdrant database. Default is None.
    """
    # NOTE: configures the root logger at WARNING, so the `logging.info`
    # progress messages below are only visible if the application has
    # configured logging differently before constructing this object.
    logging.basicConfig(level=logging.WARNING)

    self._github_client = GithubClientManager(access_token=github_access_token)
    self._database_client = QdrantClient(
      location=qdrant_location,
      path=qdrant_path,
    )
    self._ollama_client = OllamaClientManager()

    # Embedding model used by the Qdrant client for both `add` and `query`.
    self._database_client.set_model(
      "snowflake/snowflake-arctic-embed-m",
      providers=onnxruntime.get_available_providers(),
    )

  @staticmethod
  def summarise_issue(issue: Issue) -> str:
    """Construct a summary string from an Issue object.

    Combines the issue's title and body, separated by two newline characters.
    A missing body (``None`` or any other falsy value) is treated as empty so
    that the literal text "None" never ends up in the embedded document, and
    surrounding whitespace is stripped.

    Args:
      issue: The issue to be summarised. The issue must have 'title' and 'body' attributes.

    Returns:
      A string containing the issue's title and, when present, its body.
    """
    body = issue.body or ""
    return f"{issue.title}\n\n{body}".strip()

  def summarise_results(
    self,
    results: list[QueryResponse],
    owner: str,
    repository_name: str,
    query: str,
  ) -> str:
    """Summarizes the content and discussion of GitHub issues.

    For every result, the issue's comments are fetched from GitHub, a prompt is
    rendered from the issue metadata, the comments and the original query, and
    the Ollama client is asked for a short summary. Each summary is placed
    under a Markdown heading linking to the issue.

    Args:
      results: A list of QueryResponse objects containing GitHub issues to summarize.
        Each result's metadata must provide 'number' and 'html_url'.
      owner: The owner of the GitHub repository.
      repository_name: The name of the GitHub repository.
      query: The original query to relate the issues to.

    Returns:
      A single string containing the per-issue summaries, separated by blank lines.
      Empty if `results` is empty.
    """
    # Triple-brace tags are used on purpose: double-brace Mustache tags
    # HTML-escape their value, which would garble code snippets and quotes
    # (`<` -> `&lt;`, `"` -> `&quot;`) in a prompt that is plain text, not HTML.
    prompt_template = inspect.cleandoc(
      """
      Please briefly summarise the content and discussion of the following github issues.
      Keep it short, concise and to the point and explain how it relates to '{{{originalQuery}}}'
      Do not write headings or titles, simply summarize into a single 2-3 sentence paragraph.

      # {{{issue.title}}}
      {{{issue.body}}}

      Comments:
      {{#comments}}
      * {{{body}}}
      {{/comments}}
      """
    )

    logging.info("Summarising issues")
    summaries = []
    for result in results:
      metadata = result.metadata
      issue_number = metadata["number"]
      issue_url = metadata["html_url"]

      comments = self._github_client.get_issue_comments(
        owner=owner,
        repository_name=repository_name,
        issue_number=issue_number,
      )
      prompt = chevron.render(
        template=prompt_template,
        data={
          "issue": metadata,
          "comments": [{"body": comment.body} for comment in comments],
          "originalQuery": query,
        },
      )
      answer = self._ollama_client.chat(prompt)

      # Build the Markdown directly instead of dedenting an indented
      # f-string afterwards: a multi-line answer would defeat the dedent and
      # leave the heading indented, which Markdown renders as a code block.
      summaries.append(
        f"# Issue [#{issue_number}]({issue_url})\n\n{str(answer).strip()}"
      )

    final_summary = "\n\n".join(summaries)
    logging.info("Done")
    return final_summary

  async def index_repository(self, owner: str, repository_name: str):
    """Index a GitHub repository.

    Retrieves all issues from the specified repository and index them into a
    vector database for further processing or querying. The collection is
    named "<owner>/<repository_name>".

    Args:
      owner: The owner of the GitHub repository.
      repository_name: The name of the GitHub repository.
    """
    logging.info("Fetching Issues from %s/%s", owner, repository_name)
    issues = await self._github_client.get_repository_issues(
      owner, repository_name
    )

    logging.info("Adding to Vector DB")
    # The full issue dump is stored as metadata; `search` and
    # `summarise_results` later read 'body', 'number' and 'html_url' from it.
    self._database_client.add(
      collection_name=f"{owner}/{repository_name}",
      documents=[self.summarise_issue(issue) for issue in issues],
      metadata=[issue.model_dump() for issue in issues],
    )
    logging.info("Done")

  def search(
    self,
    owner: str,
    repository_name: str,
    text: str,
    *,
    score_threshold: float = 0.8,
    limit: int = 5,
  ) -> list[QueryResponse]:
    """Searches for issues in the specified repository that match the given text.

    This method searches the database for issues within the given repository that
    match the specified text query. If the repository's collection does not exist
    in the database, an error is logged and the program exits. The search results
    are filtered to exclude issues with empty bodies.

    Args:
      owner: The owner of the repository.
      repository_name: The name of the repository.
      text: A natural language query to search for within the repository's issues.
      score_threshold: Minimum score passed to the database query. Default is 0.8.
      limit: Maximum number of results requested from the database. Default is 5.

    Returns:
      A list of query responses that match the search criteria.

    Raises:
      SystemExit: With exit code 1 if the repository has not been indexed yet.
    """
    collection_name = f"{owner}/{repository_name}"

    if not self._database_client.collection_exists(
      collection_name=collection_name,
    ):
      logging.error(
        "DB Collection not found. Try indexing the repository first."
      )
      sys.exit(1)

    results = self._database_client.query(
      collection_name=collection_name,
      query_text=text,
      score_threshold=score_threshold,
      limit=limit,
    )

    # Filter empty issues
    return [result for result in results if result.metadata["body"]]