import logging
import os
from typing import Optional, Tuple, Dict

import pandas as pd
import fire

from llm import run_llm_inference
from utils import fetch_repo_issues, validate_df_values
from plots import draw_all_plots
from pdf_report import create_report_pdf

logging.basicConfig(
    level=logging.INFO,
    filename='log.txt',
    format='%(asctime)s [%(levelname)-5.5s] %(message)s',
)
logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())


def generate_issue_annotations(
    issues_df: pd.DataFrame,
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Get the annotations for the given issues.

    Args:
    - issues_df (pd.DataFrame): The DataFrame containing the issues.

    Returns:
    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated issues DataFrame and the theme counts.
    """

    # pyre-fixme[6]
    def _categorize_issues(
        issues_metadata_df: pd.DataFrame,
    ) -> Tuple[pd.Series, Dict[str, int]]:
        """
        Categorize the issues.

        Args:
        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.

        Returns:
        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the categorized issues and the theme counts.
"""        minified_issues = issues_metadata_df[            [                "number",                "summary",                "possible_causes",                "remediations",                "component",                "issue_type",            ]        ].to_dict(orient="records")        themes_json = run_llm_inference(            "assign_category",            str(minified_issues),            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},        )        tmp = {}        for t in themes_json["report"]:            for num in t["related_issues"]:                tmp[num] = tmp.get(num, []) + [t["theme"]]        themes = issues_metadata_df.number.apply(            lambda x: tmp.get(x, ["Miscellaneous"])        )        theme_count = {            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]        }        return themes, theme_count    logger.info(f"Generating annotations for {len(issues_df)} issues")        discussions = issues_df["discussion"].tolist()    metadata = run_llm_inference(        "parse_issue",        discussions,        generation_kwargs={"max_tokens": 2048, "temperature": 0.42},    )    # Handle the case where the LLM returns None instead of a generated response    metadata_index = [        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None    ]    metadata = [m for m in metadata if m is not None]    issues_metadata_df = issues_df.merge(        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True    )    themes, theme_count = _categorize_issues(issues_metadata_df)    issues_metadata_df["themes"] = themes    return issues_metadata_df, theme_countdef generate_executive_reports(    annotated_issues: pd.DataFrame,    theme_counts: Dict,    repo_name: str,    start_date: str,    end_date: str,    save_folder: Optional[str] = None,) -> Tuple[pd.DataFrame, pd.DataFrame]:    """    Generate executive reports for the given issues.    Args:    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.    - theme_counts (dict): A dictionary containing the theme counts.    - repo_name (str): The name of the repository. Defaults to None.    - start_date (str): The start date of the report. Defaults to None.    - end_date (str): The end date of the report. Defaults to None.    Returns:    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.    
"""    logger.info(f"Generating high-level summaries from annotations...")        report = {        "repo_name": repo_name,        "start_date": start_date,        "end_date": end_date,        "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),        "severity_count": annotated_issues["severity"].value_counts().to_dict(),        "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),        "themes_count": theme_counts,        "issues_created": annotated_issues["number"].nunique(),        "open_discussion": len(            annotated_issues[                (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)            ]        ),        "closed_discussion": len(            annotated_issues[                (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)            ]        ),        "open_no_discussion": len(            annotated_issues[                (annotated_issues.num_comments == 0)                & (annotated_issues.closed == False)            ]        ),        "closed_no_discussion": len(            annotated_issues[                (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)            ]        ),    }    report_input = str(        annotated_issues[            ["number", "summary", "possible_causes", "remediations"]        ].to_dict("records")    )    overview = run_llm_inference(        "get_overview", str(report_input), {"temperature": 0.45, "max_tokens": 4096}    )    report.update(overview)    overview_df = {        k: report[k]        for k in [            "repo_name",            "start_date",            "end_date",            "issues_created",            "open_discussion",            "closed_discussion",            "open_no_discussion",            "closed_no_discussion",        ]    }    overview_df["open_questions"] = [report["open_questions"]]    overview_df["executive_summary"] = [report["executive_summary"]]    for col in [        "sentiment_count",        "severity_count",        "op_expertise_count",        "themes_count",    ]:        d = report[col]        for k, v in d.items():            overview_df[f"{col}_{k}"] = v    overview_df = pd.DataFrame(overview_df)        logger.info(f"Identifying key-challenges faced by users...")    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}    challenges_df["key_challenge"] = [        k["key_challenge"] for k in report["issue_analysis"]    ]    challenges_df["affected_issues"] = [        k["affected_issues"] for k in report["issue_analysis"]    ]    challenges_df["possible_causes"] = [        k["possible_causes"] for k in report["issue_analysis"]    ]    challenges_df["remediations"] = [        k["remediations"] for k in report["issue_analysis"]    ]    challenges_df = pd.DataFrame(challenges_df)    return challenges_df, overview_df      def main(repo_name, start_date, end_date):    out_folder = f'output/{repo_name}/{start_date}_{end_date}'    os.makedirs(out_folder, exist_ok=True)        # Get issues data    issues_df = fetch_repo_issues(repo_name, start_date, end_date)        # Generate annotations and metadata    annotated_issues, theme_counts = generate_issue_annotations(issues_df)    # Validate and save generated data    annotated_issues = validate_df_values(annotated_issues, out_folder, 'annotated_issues')        # Generate high-level analysis    challenges, overview = generate_executive_reports(annotated_issues, theme_counts, repo_name, start_date, end_date)    # Validate 
    challenges = validate_df_values(challenges, out_folder, 'challenges')
    overview = validate_df_values(overview, out_folder, 'overview')

    # Create graphs and charts
    plot_folder = out_folder + "/plots"
    os.makedirs(plot_folder, exist_ok=True)
    draw_all_plots(repo_name, plot_folder, overview)

    # Create PDF report
    exec_summary = overview['executive_summary'].iloc[0]
    open_qs = overview['open_questions'].iloc[0]
    key_challenges_data = challenges[['key_challenge', 'possible_causes', 'remediations', 'affected_issues']].to_dict('records')
    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)


if __name__ == "__main__":
    fire.Fire(main)
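# Example CLI usage (a sketch, not part of the original module): python-fire exposes
# main()'s parameters as command-line flags of the same name. The script filename
# `triage.py`, the repo slug, and the YYYY-MM-DD date format are assumptions here;
# fetch_repo_issues defines what it actually accepts.
#
#   python triage.py --repo_name pytorch/pytorch --start_date 2024-06-01 --end_date 2024-06-30
#
# Outputs land under output/<repo_name>/<start_date>_<end_date>/: the dataframes saved
# by validate_df_values, the plots/ folder, and the PDF report from create_report_pdf.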