1import inspect
2import logging
3import sys
4
5import chevron
6import onnxruntime
7from githubkit.versions.v2022_11_28.models import Issue
8from qdrant_client import QdrantClient
9from qdrant_client.http.models import QueryResponse
10
11from github_search_engine.clients.github_client_manager import (
12 GithubClientManager,
13)
14from github_search_engine.clients.ollama_client_manager import (
15 OllamaClientManager,
16)
17
18
[docs]
class GithubSearchEngine:
    """Semantic search over the issues of a GitHub repository.

    Issues are fetched through a ``GithubClientManager``, stored in a Qdrant
    collection named ``"<owner>/<repository_name>"``, and search hits can be
    summarised by a chat model through an ``OllamaClientManager``.
    """

    def __init__(
        self,
        github_access_token: str,
        qdrant_location: str | None = None,
        qdrant_path: str | None = None,
    ):
        """Create the GitHub, Qdrant and Ollama clients used by the engine.

        Also configures root logging at WARNING level and selects the
        embedding model used by the Qdrant client.

        Args:
            github_access_token: The GitHub access token for accessing GitHub API.
            qdrant_location: The location of the Qdrant server. Default is None.
            qdrant_path: The path to the Qdrant database. Default is None.
        """
        logging.basicConfig(level=logging.WARNING)

        self._github_client = GithubClientManager(access_token=github_access_token)
        self._database_client = QdrantClient(
            location=qdrant_location,
            path=qdrant_path,
        )
        self._ollama_client = OllamaClientManager()

        # Use every execution provider onnxruntime reports as available.
        self._database_client.set_model(
            "snowflake/snowflake-arctic-embed-m",
            providers=onnxruntime.get_available_providers(),
        )

    @staticmethod
    def _collection_name(owner: str, repository_name: str) -> str:
        """Return the database collection name used for a repository."""
        return f"{owner}/{repository_name}"

    @staticmethod
    def summarise_issue(issue: Issue) -> str:
        """Construct the text that represents an issue in the vector database.

        The title and body are stripped of surrounding whitespace and joined by
        two newline characters. A missing or empty part is omitted, so an issue
        without a body is represented by its title alone rather than by the
        literal text "None".

        Args:
            issue: The issue to be summarised. The issue must have 'title' and
                'body' attributes.

        Returns:
            A string containing the issue's title and body.
        """
        parts = ((issue.title or "").strip(), (issue.body or "").strip())
        return "\n\n".join(part for part in parts if part)

    def summarise_results(
        self,
        results: list[QueryResponse],
        owner: str,
        repository_name: str,
        query: str,
    ) -> str:
        """Summarise the content and discussion of GitHub issues.

        For every search hit the issue comments are fetched, a prompt is
        rendered and sent to the chat model, and the reply is placed under a
        Markdown heading that links to the issue.

        Args:
            results: A list of QueryResponse objects containing GitHub issues to summarize.
            owner: The owner of the GitHub repository.
            repository_name: The name of the GitHub repository.
            query: The original query to relate the issues to.

        Returns:
            A single string with one Markdown section per issue, separated by
            blank lines.
        """
        # Triple-mustache tags are used because the prompt is plain text for a
        # language model: double-mustache tags would HTML-escape characters
        # such as <, > and & that are common in issue text.
        prompt_template = inspect.cleandoc(
            """
            Please briefly summarise the content and discussion of the following github issues.
            Keep it short, concise and to the point and explain how it relates to '{{{originalQuery}}}'
            Do not write headings or titles, simply summarize into a single 2-3 sentence paragraph.

            # {{{issue.title}}}
            {{{issue.body}}}

            Comments:
            {{#comments}}
            * {{{body}}}
            {{/comments}}
            """
        )
        logging.info("Summarising issues")
        sections = []
        for hit in results:
            issue_data = hit.metadata
            comments = self._github_client.get_issue_comments(
                owner=owner,
                repository_name=repository_name,
                issue_number=issue_data["number"],
            )
            prompt = chevron.render(
                template=prompt_template,
                data={
                    "issue": issue_data,
                    "comments": [{"body": comment.body} for comment in comments],
                    "originalQuery": query,
                },
            )
            reply = str(self._ollama_client.chat(prompt)).strip()
            # Built directly rather than dedented afterwards: a multi-line
            # reply would defeat dedenting and leave the heading indented.
            heading = f'# Issue [#{issue_data["number"]}]({issue_data["html_url"]})'
            sections.append(f"{heading}\n\n{reply}")
        final_summary = "\n\n".join(sections)
        logging.info("Done")
        return final_summary

    async def index_repository(self, owner: str, repository_name: str) -> None:
        """Index a GitHub repository.

        Retrieves the issues of the specified repository and adds them to the
        vector database, using the issue summary as the document text and the
        dumped issue model as metadata.

        Args:
            owner: The owner of the GitHub repository.
            repository_name: The name of the GitHub repository.
        """
        logging.info("Fetching Issues from %s/%s", owner, repository_name)
        issues = await self._github_client.get_repository_issues(
            owner, repository_name
        )

        logging.info("Adding to Vector DB")
        self._database_client.add(
            collection_name=self._collection_name(owner, repository_name),
            documents=[self.summarise_issue(issue) for issue in issues],
            metadata=[issue.model_dump() for issue in issues],
        )
        logging.info("Done")

    def search(
        self,
        owner: str,
        repository_name: str,
        text: str,
        *,
        score_threshold: float = 0.8,
        limit: int = 5,
    ) -> list[QueryResponse]:
        """Search the indexed issues of a repository for the given text.

        If the repository's collection does not exist in the database, an
        error is logged and the program exits with status 1. Hits whose issue
        body is empty are removed after the query, so fewer than ``limit``
        results may be returned.

        Args:
            owner: The owner of the repository.
            repository_name: The name of the repository.
            text: A natural language query to search for within the repository's issues.
            score_threshold: Score threshold passed to the database query.
            limit: Maximum number of hits requested from the database.

        Returns:
            A list of query responses that match the search criteria.
        """
        collection_name = self._collection_name(owner, repository_name)
        if not self._database_client.collection_exists(
            collection_name=collection_name,
        ):
            logging.error(
                "DB Collection not found. Try indexing the repository first."
            )
            sys.exit(1)

        hits = self._database_client.query(
            collection_name=collection_name,
            query_text=text,
            score_threshold=score_threshold,
            limit=limit,
        )

        # Filter empty issues
        return [hit for hit in hits if hit.metadata.get("body")]