triage.py

import logging
import os
from typing import Optional, Tuple, Dict

import pandas as pd
import fire

from llm import run_llm_inference
from utils import fetch_repo_issues, validate_df_values
from plots import draw_all_plots
from pdf_report import create_report_pdf

logging.basicConfig(level=logging.INFO, filename='log.txt', format='%(asctime)s [%(levelname)-5.5s] %(message)s')
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
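# Note: basicConfig sends log records to log.txt; the extra StreamHandler
# mirrors them to the console as well.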


def generate_issue_annotations(
    issues_df: pd.DataFrame
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Get the annotations for the given issues.

    Args:
    - issues_df (pd.DataFrame): The DataFrame containing the issues.

    Returns:
    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated issues DataFrame and the theme counts.
    """

    # pyre-fixme[6]
    def _categorize_issues(
        issues_metadata_df: pd.DataFrame,
    ) -> Tuple[pd.Series, Dict[str, int]]:
        """
        Categorize the issues.

        Args:
        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.

        Returns:
        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the categorized issues and the theme counts.
        """
        minified_issues = issues_metadata_df[
            [
                "number",
                "summary",
                "possible_causes",
                "remediations",
                "component",
                "issue_type",
            ]
        ].to_dict(orient="records")
        themes_json = run_llm_inference(
            "assign_category",
            str(minified_issues),
            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},
        )
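        # themes_json is expected to have the form
        # {"report": [{"theme": ..., "related_issues": [issue numbers]}, ...]},
        # which is what the mapping below relies on.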
        tmp = {}
        for t in themes_json["report"]:
            for num in t["related_issues"]:
                tmp[num] = tmp.get(num, []) + [t["theme"]]
        themes = issues_metadata_df.number.apply(
            lambda x: tmp.get(x, ["Miscellaneous"])
        )
        theme_count = {
            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]
        }
        return themes, theme_count
  60. logger.info(f"Generating annotations for {len(issues_df)} issues")
  61. discussions = issues_df["discussion"].tolist()
  62. metadata = run_llm_inference(
  63. "parse_issue",
  64. discussions,
  65. generation_kwargs={"max_tokens": 2048, "temperature": 0.42},
  66. )
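    # metadata is a list aligned with issues_df rows; each successful entry is
    # expected to be a dict with fields such as summary, possible_causes,
    # remediations, component, issue_type, sentiment, severity, and
    # op_expertise, which are merged into the DataFrame below.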
    # Handle the case where the LLM returns None instead of a generated response
    metadata_index = [
        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None
    ]
    metadata = [m for m in metadata if m is not None]
    issues_metadata_df = issues_df.merge(
        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True
    )
    themes, theme_count = _categorize_issues(issues_metadata_df)
    issues_metadata_df["themes"] = themes
    return issues_metadata_df, theme_count


def generate_executive_reports(
    annotated_issues: pd.DataFrame,
    theme_counts: Dict,
    repo_name: str,
    start_date: str,
    end_date: str,
    save_folder: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Generate executive reports for the given issues.

    Args:
    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.
    - theme_counts (dict): A dictionary containing the theme counts.
    - repo_name (str): The name of the repository.
    - start_date (str): The start date of the report.
    - end_date (str): The end date of the report.
    - save_folder (Optional[str]): Optional output folder; not used by this function. Defaults to None.

    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.
    """
  97. logger.info(f"Generating high-level summaries from annotations...")
  98. report = {
  99. "repo_name": repo_name,
  100. "start_date": start_date,
  101. "end_date": end_date,
  102. "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),
  103. "severity_count": annotated_issues["severity"].value_counts().to_dict(),
  104. "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),
  105. "themes_count": theme_counts,
  106. "issues_created": annotated_issues["number"].nunique(),
  107. "open_discussion": len(
  108. annotated_issues[
  109. (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)
  110. ]
  111. ),
  112. "closed_discussion": len(
  113. annotated_issues[
  114. (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)
  115. ]
  116. ),
  117. "open_no_discussion": len(
  118. annotated_issues[
  119. (annotated_issues.num_comments == 0)
  120. & (annotated_issues.closed == False)
  121. ]
  122. ),
  123. "closed_no_discussion": len(
  124. annotated_issues[
  125. (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)
  126. ]
  127. ),
  128. }
  129. report_input = str(
  130. annotated_issues[
  131. ["number", "summary", "possible_causes", "remediations"]
  132. ].to_dict("records")
  133. )
  134. overview = run_llm_inference(
  135. "get_overview", str(report_input), {"temperature": 0.45, "max_tokens": 4096}
  136. )
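    # The overview payload is expected to contain "executive_summary",
    # "open_questions", and an "issue_analysis" list whose entries carry
    # key_challenge, affected_issues, possible_causes, and remediations;
    # those keys are consumed below.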
    report.update(overview)
    overview_df = {
        k: report[k]
        for k in [
            "repo_name",
            "start_date",
            "end_date",
            "issues_created",
            "open_discussion",
            "closed_discussion",
            "open_no_discussion",
            "closed_no_discussion",
        ]
    }
    overview_df["open_questions"] = [report["open_questions"]]
    overview_df["executive_summary"] = [report["executive_summary"]]
    for col in [
        "sentiment_count",
        "severity_count",
        "op_expertise_count",
        "themes_count",
    ]:
        d = report[col]
        for k, v in d.items():
            overview_df[f"{col}_{k}"] = v
    overview_df = pd.DataFrame(overview_df)
  163. logger.info(f"Identifying key-challenges faced by users...")
    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}
    challenges_df["key_challenge"] = [
        k["key_challenge"] for k in report["issue_analysis"]
    ]
    challenges_df["affected_issues"] = [
        k["affected_issues"] for k in report["issue_analysis"]
    ]
    challenges_df["possible_causes"] = [
        k["possible_causes"] for k in report["issue_analysis"]
    ]
    challenges_df["remediations"] = [
        k["remediations"] for k in report["issue_analysis"]
    ]
    challenges_df = pd.DataFrame(challenges_df)
    return challenges_df, overview_df


def main(repo_name, start_date, end_date):
    out_folder = f'output/{repo_name}/{start_date}_{end_date}'
    os.makedirs(out_folder, exist_ok=True)

    # Get issues data
    issues_df = fetch_repo_issues(repo_name, start_date, end_date)

    # Generate annotations and metadata
    annotated_issues, theme_counts = generate_issue_annotations(issues_df)
    # Validate and save generated data
    annotated_issues = validate_df_values(annotated_issues, out_folder, 'annotated_issues')

    # Generate high-level analysis
    challenges, overview = generate_executive_reports(
        annotated_issues, theme_counts, repo_name, start_date, end_date
    )
    # Validate and save generated data
    challenges = validate_df_values(challenges, out_folder, 'challenges')
    overview = validate_df_values(overview, out_folder, 'overview')

    # Create graphs and charts
    plot_folder = out_folder + "/plots"
    os.makedirs(plot_folder, exist_ok=True)
    draw_all_plots(repo_name, plot_folder, overview)

    # Create PDF report
    exec_summary = overview['executive_summary'].iloc[0]
    open_qs = overview['open_questions'].iloc[0]
    key_challenges_data = challenges[
        ['key_challenge', 'possible_causes', 'remediations', 'affected_issues']
    ].to_dict('records')
    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)


if __name__ == "__main__":
    fire.Fire(main)
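
# Example invocation via python-fire (hypothetical repo and date values):
#   python triage.py --repo_name pytorch/pytorch --start_date 2024-08-01 --end_date 2024-08-31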