import inspect
import logging
import sys
from typing import List
from typing import Optional

import chevron
from githubkit.versions.v2022_11_28.models import Issue
from qdrant_client import QdrantClient
from qdrant_client.http.models import QueryResponse

from github_search_engine.clients.github_client_manager import (
    GithubClientManager,
)
from github_search_engine.clients.ollama_client_manager import (
    OllamaClientManager,
)


class GithubSearchEngine:
    """Index a GitHub repository's issues in Qdrant and search/summarise them.

    Each repository is stored in its own Qdrant collection named
    ``"<owner>/<repository_name>"``. Issues are fetched through
    ``GithubClientManager`` and summaries are generated through
    ``OllamaClientManager``.
    """

    # Mustache template for the per-issue summarisation prompt. Triple braces
    # are used on purpose: chevron HTML-escapes ``{{name}}`` tags, which would
    # turn ``<``, ``>``, ``&`` and ``"`` in issue text into HTML entities
    # before the prompt reaches the language model.
    _SUMMARY_PROMPT_TEMPLATE = inspect.cleandoc(
        """
        Please briefly summarise the content and discussion of the following github issues.
        Keep it short, concise and to the point and explain how it relates to '{{{originalQuery}}}'
        Do not write headings or titles, simply summarize into a single 2-3 sentence paragraph.

        # {{{issue.title}}}
        {{{issue.body}}}

        Comments:
        {{#comments}}
        * {{{body}}}
        {{/comments}}
        """
    )

    def __init__(
        self,
        github_access_token: str,
        qdrant_location: Optional[str] = None,
        qdrant_path: Optional[str] = None,
    ):
        """Create the GitHub, Qdrant and Ollama clients used by the engine.

        Args:
            github_access_token: Token handed to ``GithubClientManager``.
            qdrant_location: Forwarded to ``QdrantClient(location=...)``.
            qdrant_path: Forwarded to ``QdrantClient(path=...)``.
        """
        # NOTE: ``basicConfig`` only has an effect when the root logger has no
        # handlers yet; at WARNING level the ``logging.info`` progress messages
        # in this class stay hidden unless the caller configured logging first.
        logging.basicConfig(level=logging.WARNING)

        self._github_client = GithubClientManager(access_token=github_access_token)
        self._database_client = QdrantClient(
            location=qdrant_location,
            path=qdrant_path,
        )
        self._ollama_client = OllamaClientManager()

        # Embedding model used for both indexing and querying.
        # NOTE(review): the provider list presumably prefers CUDA and falls
        # back to CPU when no GPU is available - confirm against the runtime.
        self._database_client.set_model(
            "snowflake/snowflake-arctic-embed-m",
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
        )

    @staticmethod
    def _collection_name(owner: str, repository_name: str) -> str:
        """Return the Qdrant collection name used for a repository."""
        return f"{owner}/{repository_name}"

    @staticmethod
    def _format_issue_summary(number, html_url, summary) -> str:
        """Return a Markdown section: a linked issue heading plus its summary.

        The text is assembled directly instead of dedenting an indented
        f-string, because a multi-line ``summary`` would defeat the dedent and
        leave the heading indented.
        """
        return f"# Issue [#{number}]({html_url})\n\n{f'{summary}'.strip()}"

    @staticmethod
    def summarise_issue(issue: Issue) -> str:
        """Build the text that is embedded for an issue.

        Args:
            issue: Object exposing ``title`` and ``body`` attributes.

        Returns:
            The title, a blank line and the body. A missing (``None``) title
            or body contributes nothing instead of the literal text "None".
        """
        title = issue.title or ""
        body = issue.body or ""
        return f"{title}\n\n{body}".strip()

    def summarise_results(
        self,
        results: List[QueryResponse],
        owner: str,
        repository_name: str,
        query: str,
    ) -> str:
        """Summarise search hits with the Ollama client.

        For every hit the issue's comments are fetched, a prompt is rendered
        from the issue metadata, the comments and ``query``, and the chat
        response is placed under a Markdown heading linking to the issue.

        Args:
            results: Search hits whose ``metadata`` holds at least ``number``
                and ``html_url`` (``title`` and ``body`` are used by the
                prompt).
            owner: Repository owner, used to fetch issue comments.
            repository_name: Repository name, used to fetch issue comments.
            query: The original search text the summaries should relate to.

        Returns:
            All per-issue sections joined by blank lines; an empty string when
            ``results`` is empty.
        """
        logging.info("Summarising issues")
        summaries = []
        for issue in results:
            metadata = issue.metadata
            comments = self._github_client.get_issue_comments(
                owner=owner,
                repository_name=repository_name,
                issue_number=metadata["number"],
            )
            prompt = chevron.render(
                template=self._SUMMARY_PROMPT_TEMPLATE,
                data={
                    "issue": metadata,
                    "comments": [{"body": comment.body} for comment in comments],
                    "originalQuery": query,
                },
            )
            answer = self._ollama_client.chat(prompt)
            summaries.append(
                self._format_issue_summary(
                    metadata["number"], metadata["html_url"], answer
                )
            )
        final_summary = "\n\n".join(summaries)
        logging.info("Done")
        return final_summary

    async def index_repository(self, owner: str, repository_name: str) -> None:
        """Fetch a repository's issues and store them in the vector database.

        Args:
            owner: Repository owner.
            repository_name: Repository name.
        """
        logging.info("Fetching Issues from %s/%s", owner, repository_name)
        issues = await self._github_client.get_repository_issues(
            owner, repository_name
        )

        logging.info("Adding to Vector DB")
        self._database_client.add(
            collection_name=self._collection_name(owner, repository_name),
            documents=[self.summarise_issue(issue) for issue in issues],
            metadata=[issue.model_dump() for issue in issues],
            # Use the issue number as the point id so that indexing the same
            # repository again overwrites existing points instead of adding
            # duplicates under freshly generated ids.
            ids=[issue.number for issue in issues],
        )
        logging.info("Done")

    def search(
        self,
        owner: str,
        repository_name: str,
        text: str,
        *,
        score_threshold: float = 0.8,
        limit: int = 5,
    ) -> List[QueryResponse]:
        """Query the repository's collection for issues matching ``text``.

        Args:
            owner: Repository owner.
            repository_name: Repository name.
            text: Free-text query.
            score_threshold: Forwarded to the database query as
                ``score_threshold``.
            limit: Forwarded to the database query as ``limit``. Hits with an
                empty body are dropped afterwards, so fewer than ``limit``
                results may be returned.

        Returns:
            The hits whose metadata has a non-empty ``body``.

        Raises:
            SystemExit: With status 1 when the repository's collection does
                not exist (i.e. it has not been indexed yet).
        """
        collection_name = self._collection_name(owner, repository_name)
        if not self._database_client.collection_exists(
            collection_name=collection_name,
        ):
            logging.error(
                "DB Collection not found. Try indexing the repository first."
            )
            # NOTE(review): exiting the process from library code is kept for
            # backward compatibility with existing callers.
            sys.exit(1)

        results = self._database_client.query(
            collection_name=collection_name,
            query_text=text,
            score_threshold=score_threshold,
            limit=limit,
        )

        # Filter empty issues
        return [result for result in results if result.metadata.get("body")]