1import inspect
2import logging
3import sys
4from typing import List
5from typing import Optional
6
7import chevron
8import onnxruntime
9from githubkit.versions.v2022_11_28.models import Issue
10from qdrant_client import QdrantClient
11from qdrant_client.http.models import QueryResponse
12
13from github_search_engine.clients.github_client_manager import (
14 GithubClientManager,
15)
16from github_search_engine.clients.ollama_client_manager import (
17 OllamaClientManager,
18)
19
20
[docs]
class GithubSearchEngine:
    """Semantic search over the issues of a GitHub repository.

    Issues are fetched through ``GithubClientManager``, stored in a Qdrant
    collection named ``"<owner>/<repository_name>"`` and queried by text.
    Query hits can then be summarised with ``OllamaClientManager.chat``.
    """

    # Mustache prompt used for each hit. Triple braces are Mustache's
    # *unescaped* form: the default ``{{...}}`` HTML-escapes ``& < > "``,
    # which would corrupt code snippets and markup sent to the model.
    _SUMMARY_PROMPT_TEMPLATE = inspect.cleandoc(
        """
        Please briefly summarise the content and discussion of the following github issues.
        Keep it short, concise and to the point and explain how it relates to '{{{originalQuery}}}'
        Do not write headings or titles, simply summarize into a single 2-3 sentence paragraph.

        # {{{issue.title}}}
        {{{issue.body}}}

        Comments:
        {{#comments}}
        * {{{body}}}
        {{/comments}}
        """
    )

    def __init__(
        self,
        github_access_token: str,
        qdrant_location: Optional[str] = None,
        qdrant_path: Optional[str] = None,
    ):
        """Create the GitHub, Qdrant and Ollama clients used by the engine.

        Args:
            github_access_token: Token handed to ``GithubClientManager``.
            qdrant_location: Forwarded to ``QdrantClient`` as ``location``.
            qdrant_path: Forwarded to ``QdrantClient`` as ``path``.
        """
        # Side effect on the root logger: at WARNING level the ``info``
        # progress messages below stay hidden unless the application
        # configures logging itself.
        logging.basicConfig(level=logging.WARNING)

        self._github_client = GithubClientManager(access_token=github_access_token)
        self._database_client = QdrantClient(
            location=qdrant_location,
            path=qdrant_path,
        )
        self._ollama_client = OllamaClientManager()

        # Set the embedding model on the database client, offering every
        # ONNX Runtime execution provider reported as available.
        self._database_client.set_model(
            "snowflake/snowflake-arctic-embed-m",
            providers=onnxruntime.get_available_providers(),
        )

    @staticmethod
    def _collection_name(owner: str, repository_name: str) -> str:
        """Return the collection name shared by indexing and searching."""
        return f"{owner}/{repository_name}"

    @staticmethod
    def summarise_issue(issue: Issue) -> str:
        """Return the text that represents ``issue`` in the vector database.

        The title and the body are separated by a blank line. A missing or
        empty body is skipped, so the literal text ``"None"`` is never
        indexed for issues without a description.
        """
        return "\n\n".join(part for part in (issue.title, issue.body) if part)

    def summarise_results(
        self,
        results: List[QueryResponse],
        owner: str,
        repository_name: str,
        query: str,
    ) -> str:
        """Summarise every hit with the chat model and join the summaries.

        Args:
            results: Hits as returned by :meth:`search`; each one must carry
                ``number`` and ``html_url`` in its metadata.
            owner: Owner of the repository the hits belong to.
            repository_name: Name of that repository.
            query: The original search text, quoted in the prompt.

        Returns:
            One Markdown section per hit (a linked ``# Issue`` heading
            followed by the model's summary), separated by blank lines.
            The result is an empty string when ``results`` is empty.
        """
        logging.info("Summarising issues")
        sections = [
            self._summarise_result(hit, owner, repository_name, query)
            for hit in results
        ]
        logging.info("Done")
        return "\n\n".join(sections)

    def _summarise_result(
        self,
        result: QueryResponse,
        owner: str,
        repository_name: str,
        query: str,
    ) -> str:
        """Build the Markdown section for a single search hit."""
        issue = result.metadata
        comments = self._github_client.get_issue_comments(
            owner=owner,
            repository_name=repository_name,
            issue_number=issue["number"],
        )
        prompt = chevron.render(
            template=self._SUMMARY_PROMPT_TEMPLATE,
            data={
                "issue": issue,
                "comments": [{"body": comment.body} for comment in comments],
                "originalQuery": query,
            },
        )
        summary = self._ollama_client.chat(prompt)
        # Assemble the section explicitly instead of de-indenting an
        # interpolated literal: a multi-line summary would otherwise leave
        # the heading indented, which Markdown renders as a code block.
        return f"# Issue [#{issue['number']}]({issue['html_url']})\n\n{summary}"

    async def index_repository(self, owner: str, repository_name: str):
        """Fetch the repository's issues and add them to the vector database.

        Each issue is stored as the text from :meth:`summarise_issue`, with
        its ``model_dump()`` as metadata.
        """
        logging.info("Fetching Issues from %s/%s", owner, repository_name)
        issues = await self._github_client.get_repository_issues(
            owner, repository_name
        )

        logging.info("Adding to Vector DB")
        self._database_client.add(
            collection_name=self._collection_name(owner, repository_name),
            documents=[self.summarise_issue(issue) for issue in issues],
            metadata=[issue.model_dump() for issue in issues],
        )
        logging.info("Done")

    def search(
        self,
        owner: str,
        repository_name: str,
        text: str,
        *,
        score_threshold: float = 0.8,
        limit: int = 5,
    ) -> List[QueryResponse]:
        """Query the repository's collection for issues matching ``text``.

        Args:
            owner: Owner of the indexed repository.
            repository_name: Name of the indexed repository.
            text: Free-text query.
            score_threshold: Passed to the database query as
                ``score_threshold``.
            limit: Passed to the database query as ``limit``. Hits with an
                empty body are dropped afterwards, so fewer results than
                ``limit`` may be returned.

        Returns:
            The hits whose metadata has a non-empty ``body``.

        Raises:
            SystemExit: With exit code 1 when the collection does not exist,
                i.e. the repository has not been indexed yet.
        """
        collection_name = self._collection_name(owner, repository_name)
        if not self._database_client.collection_exists(
            collection_name=collection_name,
        ):
            logging.error(
                "DB Collection not found. Try indexing the repository first."
            )
            sys.exit(1)

        hits = self._database_client.query(
            collection_name=collection_name,
            query_text=text,
            score_threshold=score_threshold,
            limit=limit,
        )

        # Filter empty issues
        return [hit for hit in hits if hit.metadata["body"]]