import logging
import os
from typing import Dict, Optional, Tuple

import fire
import pandas as pd

from llm import run_llm_inference
from pdf_report import create_report_pdf
from plots import draw_all_plots
from utils import fetch_repo_issues, validate_df_values

logging.basicConfig(
    level=logging.INFO,
    filename="log.txt",
    format="%(asctime)s [%(levelname)-5.5s] %(message)s",
)
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
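# Note: basicConfig installs a file handler (log.txt) on the root logger, and
# the extra StreamHandler echoes this module's records to the console, so each
# log line goes to both destinations.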
 

def generate_issue_annotations(
    issues_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Annotate the given issues with LLM-generated metadata and themes.

    Args:
    - issues_df (pd.DataFrame): The DataFrame containing the issues.

    Returns:
    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated
      issues DataFrame and the per-theme issue counts.
    """

    # pyre-fixme[6]
    def _categorize_issues(
        issues_metadata_df: pd.DataFrame,
    ) -> Tuple[pd.Series, Dict[str, int]]:
        """
        Categorize the issues into themes.

        Args:
        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.

        Returns:
        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the per-issue
          theme lists and the per-theme issue counts.
        """
        minified_issues = issues_metadata_df[
            [
                "number",
                "summary",
                "possible_causes",
                "remediations",
                "component",
                "issue_type",
            ]
        ].to_dict(orient="records")
        themes_json = run_llm_inference(
            "assign_category",
            str(minified_issues),
            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},
        )
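        # Assumed response shape (inferred from the loop below): a dict like
        #   {"report": [{"theme": "...", "related_issues": [123, 456]}, ...]}
        # where each entry maps a theme label to the issue numbers it covers.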
 
        # Invert the theme -> issues mapping into an issue -> themes mapping.
        issue_themes = {}
        for t in themes_json["report"]:
            for num in t["related_issues"]:
                issue_themes[num] = issue_themes.get(num, []) + [t["theme"]]
        themes = issues_metadata_df.number.apply(
            lambda x: issue_themes.get(x, ["Miscellaneous"])
        )
        theme_count = {
            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]
        }
        return themes, theme_count
 
    logger.info(f"Generating annotations for {len(issues_df)} issues")

    discussions = issues_df["discussion"].tolist()
    metadata = run_llm_inference(
        "parse_issue",
        discussions,
        generation_kwargs={"max_tokens": 2048, "temperature": 0.42},
    )
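    # Assumed output shape (inferred from downstream usage): one dict per
    # discussion with keys such as "summary", "possible_causes",
    # "remediations", "component", "issue_type", "sentiment", "severity",
    # and "op_expertise", or None when generation fails for an entry.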
 
    # Handle the case where the LLM returns None instead of a generated response
    metadata_index = [
        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None
    ]
    metadata = [m for m in metadata if m is not None]
    issues_metadata_df = issues_df.merge(
        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True
    )

    themes, theme_count = _categorize_issues(issues_metadata_df)
    issues_metadata_df["themes"] = themes
    return issues_metadata_df, theme_count
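
# Example usage (hypothetical repo and dates):
#   issues_df = fetch_repo_issues("pytorch/pytorch", "2024-01-01", "2024-01-31")
#   annotated, theme_counts = generate_issue_annotations(issues_df)
#   annotated["themes"]  # per-issue list of theme labels
#   theme_counts         # e.g. {"Installation": 12, "Miscellaneous": 3}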
 

def generate_executive_reports(
    annotated_issues: pd.DataFrame,
    theme_counts: Dict,
    repo_name: str,
    start_date: str,
    end_date: str,
    save_folder: Optional[str] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Generate executive reports for the given issues.

    Args:
    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.
    - theme_counts (Dict[str, int]): A dictionary containing the theme counts.
    - repo_name (str): The name of the repository.
    - start_date (str): The start date of the report.
    - end_date (str): The end date of the report.
    - save_folder (Optional[str]): Currently unused. Defaults to None.

    Returns:
    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.
    """
    logger.info("Generating high-level summaries from annotations...")

    report = {
        "repo_name": repo_name,
        "start_date": start_date,
        "end_date": end_date,
        "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),
        "severity_count": annotated_issues["severity"].value_counts().to_dict(),
        "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),
        "themes_count": theme_counts,
        "issues_created": annotated_issues["number"].nunique(),
        "open_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)
            ]
        ),
        "closed_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)
            ]
        ),
        "open_no_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments == 0)
                & (annotated_issues.closed == False)
            ]
        ),
        "closed_no_discussion": len(
            annotated_issues[
                (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)
            ]
        ),
    }
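    # The four *_discussion counts above partition all issues by engagement
    # (num_comments > 0 vs == 0) and state (open vs closed).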
 
    report_input = str(
        annotated_issues[
            ["number", "summary", "possible_causes", "remediations"]
        ].to_dict("records")
    )
    overview = run_llm_inference(
        "get_overview", report_input, {"temperature": 0.45, "max_tokens": 4096}
    )
    report.update(overview)
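    # Assumed overview shape (inferred from the usage below): a dict with
    # "executive_summary", "open_questions", and "issue_analysis", where
    # "issue_analysis" is a list of dicts keyed by "key_challenge",
    # "affected_issues", "possible_causes", and "remediations".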
 
    overview_df = {
        k: report[k]
        for k in [
            "repo_name",
            "start_date",
            "end_date",
            "issues_created",
            "open_discussion",
            "closed_discussion",
            "open_no_discussion",
            "closed_no_discussion",
        ]
    }
    overview_df["open_questions"] = [report["open_questions"]]
    overview_df["executive_summary"] = [report["executive_summary"]]
    # Flatten the nested count dicts into one column per label,
    # e.g. report["severity_count"]["high"] -> column "severity_count_high".
    for col in [
        "sentiment_count",
        "severity_count",
        "op_expertise_count",
        "themes_count",
    ]:
        d = report[col]
        for k, v in d.items():
            overview_df[f"{col}_{k}"] = v
    overview_df = pd.DataFrame(overview_df)
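    # open_questions and executive_summary are single-element lists, so the
    # scalar columns broadcast and overview_df comes out as a one-row DataFrame.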
 
    logger.info("Identifying key challenges faced by users...")

    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}
    challenges_df["key_challenge"] = [
        k["key_challenge"] for k in report["issue_analysis"]
    ]
    challenges_df["affected_issues"] = [
        k["affected_issues"] for k in report["issue_analysis"]
    ]
    challenges_df["possible_causes"] = [
        k["possible_causes"] for k in report["issue_analysis"]
    ]
    challenges_df["remediations"] = [
        k["remediations"] for k in report["issue_analysis"]
    ]
    challenges_df = pd.DataFrame(challenges_df)
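    # challenges_df has one row per entry in issue_analysis; the scalar
    # repo_name/start_date/end_date values are broadcast across all rows.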
 
    return challenges_df, overview_df
 

def main(repo_name, start_date, end_date):
    out_folder = f"output/{repo_name}/{start_date}_{end_date}"
    os.makedirs(out_folder, exist_ok=True)

    # Get issues data
    issues_df = fetch_repo_issues(repo_name, start_date, end_date)

    # Generate annotations and metadata
    annotated_issues, theme_counts = generate_issue_annotations(issues_df)
    # Validate and save generated data
    annotated_issues = validate_df_values(annotated_issues, out_folder, "annotated_issues")

    # Generate high-level analysis
    challenges, overview = generate_executive_reports(
        annotated_issues, theme_counts, repo_name, start_date, end_date
    )
    # Validate and save generated data
    challenges = validate_df_values(challenges, out_folder, "challenges")
    overview = validate_df_values(overview, out_folder, "overview")

    # Create graphs and charts
    plot_folder = out_folder + "/plots"
    os.makedirs(plot_folder, exist_ok=True)
    draw_all_plots(repo_name, plot_folder, overview)

    # Create PDF report
    exec_summary = overview["executive_summary"].iloc[0]
    open_qs = overview["open_questions"].iloc[0]
    key_challenges_data = challenges[
        ["key_challenge", "possible_causes", "remediations", "affected_issues"]
    ].to_dict("records")
    create_report_pdf(
        repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder
    )


if __name__ == "__main__":
    fire.Fire(main)
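
# Example invocation (script filename and argument values are illustrative;
# fire exposes main's parameters as CLI arguments):
#   python triage.py pytorch/pytorch 2024-01-01 2024-01-31
#   python triage.py --repo_name=pytorch/pytorch --start_date=2024-01-01 --end_date=2024-01-31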
 
 