triage.py

import os
from typing import Optional, Tuple, Dict

import pandas as pd
import fire

from llm import run_llm_inference
from utils import fetch_repo_issues, validate_df_values, save_df  # save_df is assumed to live in utils alongside the other helpers
from plots import draw_all_plots
from pdf_report import create_report_pdf


def generate_issue_annotations(
    issues_df: pd.DataFrame, save_folder: Optional[str] = None
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Get the annotations for the given issues.
    Args:
    - issues_df (pd.DataFrame): The DataFrame containing the issues.
    - save_folder (Optional[str]): Folder to save the annotated issues to. Defaults to None.
    Returns:
    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated issues DataFrame and the theme counts.
    """

    # pyre-fixme[6]
    def _categorize_issues(
        issues_metadata_df: pd.DataFrame,
    ) -> Tuple[pd.Series, Dict[str, int]]:
        """
        Categorize the issues.
        Args:
        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.
        Returns:
        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the categorized issues and the theme counts.
        """
        minified_issues = issues_metadata_df[
            [
                "number",
                "summary",
                "possible_causes",
                "remediations",
                "component",
                "issue_type",
            ]
        ].to_dict(orient="records")
        themes_json = run_llm_inference(
            "assign_category",
            str(minified_issues),
            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},
        )
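        # Invert the LLM's theme -> related_issues mapping into issue number -> list of themes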
        tmp = {}
        for t in themes_json["report"]:
            for num in t["related_issues"]:
                tmp[num] = tmp.get(num, []) + [t["theme"]]
        themes = issues_metadata_df.number.apply(
            lambda x: tmp.get(x, ["Miscellaneous"])
        )
        theme_count = {
            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]
        }
        return themes, theme_count
    discussions = issues_df["discussion"].tolist()
    metadata = run_llm_inference(
        "parse_issue",
        discussions,
        generation_kwargs={"max_tokens": 2048, "temperature": 0.42},
    )

    # Handle the case where the LLM returns None instead of a generated response
    metadata_index = [
        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None
    ]
    metadata = [m for m in metadata if m is not None]
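    # Join the parsed metadata back onto the original issues by index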
    issues_metadata_df = issues_df.merge(
        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True
    )
    themes, theme_count = _categorize_issues(issues_metadata_df)
    issues_metadata_df["themes"] = themes

    if save_folder:
        save_df(issues_metadata_df, save_folder, 'annotated_issues')

    return issues_metadata_df, theme_count


def generate_executive_reports(
    annotated_issues: pd.DataFrame,
    theme_counts: Dict,
    repo_name: str,
    start_date: str,
    end_date: str,
    save_folder: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Generate executive reports for the given issues.
    Args:
    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.
    - theme_counts (dict): A dictionary containing the theme counts.
    - repo_name (str): The name of the repository.
    - start_date (str): The start date of the report.
    - end_date (str): The end date of the report.
    - save_folder (Optional[str]): Folder to save the report data to. Defaults to None.
    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.
    """
    report = {
        "repo_name": repo_name,
        "start_date": start_date,
        "end_date": end_date,
        "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),
        "severity_count": annotated_issues["severity"].value_counts().to_dict(),
        "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),
        "themes_count": theme_counts,
        "issues_created": annotated_issues["number"].nunique(),
        "open_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)
            ]
        ),
        "closed_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)
            ]
        ),
        "open_no_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments == 0)
                & (annotated_issues.closed == False)
            ]
        ),
        "closed_no_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)
            ]
        ),
    }
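    # Summarize each issue and ask the LLM for a repo-level overview (executive summary, open questions, issue analysis)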
    report_input = str(
        annotated_issues[
            ["number", "summary", "possible_causes", "remediations"]
        ].to_dict("records")
    )
    overview = run_llm_inference(
        "get_overview", report_input, {"temperature": 0.45, "max_tokens": 4096}
    )
    report.update(overview)
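    # Flatten the report (scalar fields plus per-category counts) into a single-row overview table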
    overview_df = {
        k: report[k]
        for k in [
            "repo_name",
            "start_date",
            "end_date",
            "issues_created",
            "open_discussion",
            "closed_discussion",
            "open_no_discussion",
            "closed_no_discussion",
        ]
    }
    overview_df["open_questions"] = [report["open_questions"]]
    overview_df["executive_summary"] = [report["executive_summary"]]
    for col in [
        "sentiment_count",
        "severity_count",
        "op_expertise_count",
        "themes_count",
    ]:
        d = report[col]
        for k, v in d.items():
            overview_df[f"{col}_{k}"] = v
    overview_df = pd.DataFrame(overview_df)
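    # Build one row per key challenge surfaced in the LLM's issue analysis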
    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}
    challenges_df["key_challenge"] = [
        k["key_challenge"] for k in report["issue_analysis"]
    ]
    challenges_df["affected_issues"] = [
        k["affected_issues"] for k in report["issue_analysis"]
    ]
    challenges_df["possible_causes"] = [
        k["possible_causes"] for k in report["issue_analysis"]
    ]
    challenges_df["remediations"] = [
        k["remediations"] for k in report["issue_analysis"]
    ]
    challenges_df = pd.DataFrame(challenges_df)

    return challenges_df, overview_df


def create_report(repo_name, start_date, end_date, challenges, overview, out_folder):
    # generate pdf report
    challenges = validate_df_values(challenges)
    overview = validate_df_values(overview)
    exec_summary = overview['executive_summary'].iloc[0]
    open_qs = overview['open_questions'].iloc[0]
    key_challenges_data = challenges[
        ['key_challenge', 'possible_causes', 'remediations', 'affected_issues']
    ].to_dict('records')
    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)


def main(repo_name, start_date, end_date):
    out_folder = f'output/{repo_name}/{start_date}_{end_date}'
    os.makedirs(out_folder, exist_ok=True)

    # Get issues data
    issues_df = fetch_repo_issues(repo_name, start_date, end_date)

    # Generate annotations and metadata
    annotated_issues, theme_counts = generate_issue_annotations(issues_df)

    # Generate high-level analysis
    challenges, overview = generate_executive_reports(
        annotated_issues, theme_counts, repo_name, start_date, end_date
    )

    # Validate and save generated data
    annotated_issues = validate_df_values(annotated_issues)
    challenges = validate_df_values(challenges)
    overview = validate_df_values(overview)

    # Create graphs and charts
    plot_folder = out_folder + "/plots"
    os.makedirs(plot_folder, exist_ok=True)
    draw_all_plots(repo_name, plot_folder, overview)

    # Create PDF report
    exec_summary = overview['executive_summary'].iloc[0]
    open_qs = overview['open_questions'].iloc[0]
    key_challenges_data = challenges[
        ['key_challenge', 'possible_causes', 'remediations', 'affected_issues']
    ].to_dict('records')
    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)


if __name__ == "__main__":
    fire.Fire(main)
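
# Example invocation (a sketch: the repo name is a placeholder and the date format is an
# assumption, use whatever fetch_repo_issues expects, e.g. ISO dates):
#   python triage.py --repo_name pytorch/pytorch --start_date 2024-06-01 --end_date 2024-06-30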