Source code for github_search_engine.github_search_engine

  1import inspect
  2import logging
  3import sys
  4from typing import List
  5from typing import Optional
  6
  7import chevron
  8import onnxruntime
  9from githubkit.versions.v2022_11_28.models import Issue
 10from qdrant_client import QdrantClient
 11from qdrant_client.http.models import QueryResponse
 12
 13from github_search_engine.clients.github_client_manager import (
 14  GithubClientManager,
 15)
 16from github_search_engine.clients.ollama_client_manager import (
 17  OllamaClientManager,
 18)
 19
 20
class GithubSearchEngine:
  """Index a GitHub repository's issues in Qdrant, then search and summarise.

  The engine combines three clients: a GitHub client that fetches issues and
  their comments, a Qdrant client that stores and queries the issue texts,
  and an Ollama client that writes natural-language summaries of the hits.
  """

  def __init__(
    self,
    github_access_token: str,
    qdrant_location: Optional[str] = None,
    qdrant_path: Optional[str] = None,
  ):
    """Create the GitHub, Qdrant and Ollama clients.

    Args:
      github_access_token: Token handed to ``GithubClientManager``.
      qdrant_location: Forwarded to ``QdrantClient`` as ``location``.
      qdrant_path: Forwarded to ``QdrantClient`` as ``path``.
    """
    # Root logger at WARNING: the info-level progress messages emitted by
    # this class stay hidden unless the application configured logging
    # itself beforehand (basicConfig is a no-op once handlers exist).
    logging.basicConfig(level=logging.WARNING)

    self._github_client = GithubClientManager(access_token=github_access_token)
    self._database_client = QdrantClient(
      location=qdrant_location,
      path=qdrant_path,
    )
    self._ollama_client = OllamaClientManager()

    # Embedding model for the Qdrant client, run on whichever ONNX runtime
    # execution providers are reported as available on this machine.
    self._database_client.set_model(
      "snowflake/snowflake-arctic-embed-m",
      providers=onnxruntime.get_available_providers(),
    )

  @staticmethod
  def _collection_name(owner: str, repository_name: str) -> str:
    """Return the Qdrant collection name used for ``owner/repository_name``."""
    return f"{owner}/{repository_name}"

  @staticmethod
  def _format_issue_summary(number, html_url, summary) -> str:
    """Return one Markdown entry: a linked issue heading plus its summary.

    The entry is assembled directly rather than by dedenting an indented
    template after interpolation: a multi-line ``summary`` has unindented
    continuation lines, which would defeat the dedent and leave the heading
    indented (Markdown then renders it as a code block).
    """
    return f"# Issue [#{number}]({html_url})\n\n{str(summary).strip()}"

  @staticmethod
  def summarise_issue(issue: Issue) -> str:
    """Return the text stored in the vector database for ``issue``.

    Args:
      issue: Issue whose ``title`` and ``body`` are combined.

    Returns:
      The title and the body separated by a blank line.
    """
    # An issue without a description has no body; use an empty string so
    # the literal text "None" is not indexed.
    body = issue.body or ""
    issue_summary = f"""
    {issue.title}

    {body}
    """
    return issue_summary

  def summarise_results(
    self,
    results: List[QueryResponse],
    owner: str,
    repository_name: str,
    query: str,
  ) -> str:
    """Summarise every search hit with the Ollama client.

    For each hit the issue's comments are fetched from GitHub, a prompt is
    rendered from the issue, its comments and the original query, and the
    model's answer is placed under a heading linking to the issue.

    Args:
      results: Search hits; each ``metadata`` must provide ``number``,
        ``html_url``, ``title`` and ``body``.
      owner: Owner of the repository the issues belong to.
      repository_name: Name of the repository the issues belong to.
      query: The user's original search text, quoted in the prompt.

    Returns:
      The per-issue Markdown entries joined by blank lines; an empty string
      when ``results`` is empty.
    """
    # Triple mustaches render values verbatim. Double mustaches would
    # HTML-escape characters such as < > & " in issue text and comments,
    # garbling code snippets before they reach the language model.
    prompt_template = inspect.cleandoc(
      """
      Please briefly summarise the content and discussion of the following github issues.
      Keep it short, concise and to the point and explain how it relates to '{{{originalQuery}}}'
      Do not write headings or titles, simply summarize into a single 2-3 sentence paragraph.

      # {{{issue.title}}}
      {{{issue.body}}}

      Comments:
      {{#comments}}
      * {{{body}}}
      {{/comments}}
      """
    )
    logging.info("Summarising issues")
    summaries = []
    for result in results:
      issue = result.metadata
      comments = self._github_client.get_issue_comments(
        owner=owner,
        repository_name=repository_name,
        issue_number=issue["number"],
      )
      prompt = chevron.render(
        template=prompt_template,
        data={
          "issue": issue,
          "comments": [{"body": comment.body} for comment in comments],
          "originalQuery": query,
        },
      )
      summary = self._ollama_client.chat(prompt)
      summaries.append(
        self._format_issue_summary(issue["number"], issue["html_url"], summary)
      )
    final_summary = "\n\n".join(summaries)
    logging.info("Done")
    return final_summary

  async def index_repository(self, owner: str, repository_name: str):
    """Fetch the repository's issues and add them to the vector database.

    Each issue is stored as the text from ``summarise_issue`` together with
    its ``model_dump()`` as metadata, in the collection named
    ``owner/repository_name``.

    Args:
      owner: Owner of the repository to index.
      repository_name: Name of the repository to index.
    """
    logging.info("Fetching Issues from %s/%s", owner, repository_name)
    issues = await self._github_client.get_repository_issues(
      owner, repository_name
    )

    logging.info("Adding to Vector DB")
    self._database_client.add(
      collection_name=self._collection_name(owner, repository_name),
      documents=[self.summarise_issue(issue) for issue in issues],
      metadata=[issue.model_dump() for issue in issues],
    )
    logging.info("Done")

  def search(
    self, owner: str, repository_name: str, text: str
  ) -> List[QueryResponse]:
    """Query the repository's collection for issues matching ``text``.

    Args:
      owner: Owner of the indexed repository.
      repository_name: Name of the indexed repository.
      text: Free-text query.

    Returns:
      The hits whose stored issue body is non-empty. The query itself is
      issued with ``score_threshold=0.8`` and ``limit=5``; the body filter
      runs afterwards, so fewer than five hits may come back.

    Raises:
      SystemExit: With exit code 1 when the repository's collection does
        not exist, i.e. the repository has not been indexed.
    """
    collection_name = self._collection_name(owner, repository_name)
    if not self._database_client.collection_exists(
      collection_name=collection_name,
    ):
      logging.error(
        "DB Collection not found. Try indexing the repository first."
      )
      # NOTE(review): this terminates the whole process from library code;
      # kept as-is because existing callers may rely on the exit status.
      sys.exit(1)

    results = self._database_client.query(
      collection_name=collection_name,
      query_text=text,
      score_threshold=0.8,
      limit=5,
    )

    # Filter empty issues
    return [result for result in results if result.metadata["body"]]