
Add auto triage tool

Suraj Subramanian 7 months ago
parent
commit
a5ddea144a

+ 146 - 0
recipes/use_cases/github_triage/config.yaml

File diff suppressed because the file is too large
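
The config.yaml diff is suppressed above, but every module below reads it through CFG[...]. From those accesses, a minimal sketch of the expected shape looks roughly like the following; all values here are placeholders, not the committed contents:

    model:
      use: groq                  # backend selector read by run_llm_inference: "vllm" or "groq"
      vllm:
        endpoint: http://localhost:8000/v1   # placeholder OpenAI-compatible endpoint
        model_id: <model-name>
      groq:
        key: <groq-api-key>
        model_id: <model-name>
    tokens:
      github: <github-token>     # used by utils.fetch_github_endpoint
    prompts:                     # prompt names referenced in triage.py
      parse_issue:
        system: "..."
        json_schema: "..."
      assign_category:
        system: "..."
        json_schema: "..."
      get_overview:
        system: "..."
        json_schema: "..."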


+ 160 - 0
recipes/use_cases/github_triage/llm.py

@@ -0,0 +1,160 @@
+import logging 
+from typing import Any, Dict, List, Optional, Union
+import yaml
+import time
+import json
+
+from openai import OpenAI
+import groq
+
+log = logging.getLogger(__name__)
+with open("config.yaml", "r") as f:
+    CFG = yaml.safe_load(f)
+
+class LlamaVLLM():
+    def __init__(self, endpoint, model_id):
+        self.model_id = model_id
+        self.client = OpenAI(base_url=endpoint, api_key='token')
+
+    def chat(
+        self,
+        inputs: List[Dict[str, str]],
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        guided_decode_json_schema: Optional[str] = None
+    ) -> Optional[str]:
+
+        if generation_kwargs is None:
+            generation_kwargs = {}
+            
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_id,
+                messages=inputs,
+                extra_body={
+                    "guided_json": guided_decode_json_schema
+                },
+                **generation_kwargs,
+            )
+            output = response.choices[0].message.content
+        except Exception as e:
+            log.error(
+                f"FAILED to generate inference for input {inputs}\nError: {str(e)}"
+            )
+            output = None
+        return output
+    
+
+class LlamaGroq():
+    def __init__(self, key, model_id):
+        self.model_id = model_id
+        self.client = groq.Groq(api_key=key)
+        print(f"Using Groq:{self.model_id} for inference")
+
+    def chat(
+        self, 
+        inputs: List[Dict[str, str]], 
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        guided_decode_json_schema: Optional[str] = None
+    ) -> Optional[str]:
+        
+        if generation_kwargs is None:
+            generation_kwargs = {}
+            
+        # Currently Groq doesn't support guided JSON decoding. Workaround:
+        if guided_decode_json_schema is not None:
+            inputs[0]['content'] += f"\n\nEnsure your response aligns with the following JSON schema:\n{guided_decode_json_schema}\n\n"
+        
+        output = None
+        
+        while True:
+            try:
+                response = self.client.chat.completions.with_raw_response.create(
+                    model=self.model_id,
+                    messages=inputs,
+                    stream=False,
+                    **generation_kwargs,
+                    response_format={"type": 'json_object' if guided_decode_json_schema is not None else 'text'}
+                )
+                completion = response.parse()
+                output = completion.choices[0].message.content
+                break
+            except groq.RateLimitError as e:
+                response = e.response
+                wait = float(response.headers['X-Ratelimit-Reset'])
+                print(e)
+                print(f"waiting for {wait}s to prevent rate limiting")
+                time.sleep(wait)
+            except Exception as e:
+                print(f"inference failed for input: {inputs}\nError: {str(e)}")
+                break
+
+        return output
+
+
+def run_llm_inference(
+    prompt_name: str,
+    inputs: Union[str, List[str]],
+    generation_kwargs: Optional[Dict] = None,
+    guided_decode_json_schema=None,
+) -> Union[List[str], List[Dict[str, Any]]]:
+    """
+    Run the LLM inference on the given inputs.
+
+    Args:
+    - prompt_name (str): The name of the prompt to use.
+    - inputs (str or List[str]): The input(s) to the LLM.
+    - generation_kwargs (Dict): Additional keyword arguments to pass to the LLM.
+    - guided_decode_json_schema (str): The JSON schema to use for guided decoding.
+
+    Returns:
+    - Union[str, List[str], Dict, List[Dict]]: The response(s) from the LLM, parsed into dict(s) when a JSON schema is in effect.
+    """
+    log.info(f"[run_llm_inference] {prompt_name}")
+    
+    # initialize appropriate LLM accessor
+    if CFG['model']['use'] == 'vllm':
+        LLM = LlamaVLLM(**CFG['model']['vllm'])
+    elif CFG['model']['use'] == 'groq':
+        LLM = LlamaGroq(**CFG['model']['groq'])
+    else:
+        raise ValueError("Invalid model type in config.yaml")
+    
+    _batch = True
+    if isinstance(inputs, str):
+        _batch = False
+        inputs = [inputs]
+
+    inputs = [
+        [
+            {"role": "system", "content": CFG["prompts"][prompt_name]["system"]},
+            {"role": "user", "content": i},
+        ]
+        for i in inputs
+    ]
+
+    if (
+        guided_decode_json_schema is None
+        and "json_schema" in CFG["prompts"][prompt_name]
+    ):
+        guided_decode_json_schema = " ".join(
+            CFG["prompts"][prompt_name]["json_schema"].split()
+        )
+
+    responses = [
+        LLM.chat(i, generation_kwargs, guided_decode_json_schema) for i in inputs
+    ]
+
+    if guided_decode_json_schema is not None:
+        responses_json = []
+        for r in responses:
+            if r is not None:
+                try:
+                    responses_json.append(json.loads(r, strict=False))
+                    continue
+                except json.JSONDecodeError:
+                    log.error(f"Error decoding JSON: {r}")
+            responses_json.append(None)
+        responses = responses_json
+
+    if not _batch:
+        responses = responses[0]
+
+    return responses
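
A minimal usage sketch of run_llm_inference, assuming config.yaml defines the parse_issue prompt that triage.py references (the issue text and generation settings here are illustrative):

    # one input string -> one response; parsed into a dict when the prompt has a json_schema
    result = run_llm_inference(
        "parse_issue",
        "Title: CUDA out of memory during fine-tuning\nBody: ...",
        generation_kwargs={"temperature": 0.4, "max_tokens": 2048},
    )
    # a list of inputs returns a list of responses, one per input
    results = run_llm_inference("parse_issue", ["first issue text", "second issue text"])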

+ 111 - 0
recipes/use_cases/github_triage/pdf_report.py

@@ -0,0 +1,111 @@
+from fpdf import FPDF
+import os
+from datetime import datetime
+
+class ReportPDF(FPDF):
+    def __init__(self, repository_name, start_date, end_date):
+        super().__init__(orientation='P', unit='mm', format='A4')
+        self.repo = repository_name
+        self.start_end = f"{datetime.strptime(start_date, '%Y-%m-%d').strftime('%b %d, %Y')} to {datetime.strptime(end_date, '%Y-%m-%d').strftime('%b %d, %Y')}"
+        
+    def header(self):
+        self.set_font('Arial', 'B', 12)
+        self.cell(100, 10, f'AutoTriage Report: {self.repo}', 0, 0)
+        self.cell(90, 10, self.start_end, 0, 0, 'R')
+        self.ln(20)
+
+    def footer(self):
+        self.set_y(-15)
+        self.set_font('Arial', 'I', 8)
+        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
+        
+    def exec_summary(self, text):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Executive Summary', 'B', 0, 'L')
+        self.ln(10)
+        self.set_font('Arial', '', 10)
+        self.multi_cell(0, 5, text)
+        self.ln(10)
+    
+    def add_challenge(self, challenge_data):
+        # title
+        self.set_font('Arial', '', 14)
+        self.cell(0, 10, f"{challenge_data['key_challenge']}", 0, 0, 'L')
+        self.ln(8)
+        
+        # possible causes
+        self.set_font('Arial', 'B', 10)
+        self.cell(0, 10, "Possible Causes", 0, 0, 'L')
+        self.ln(5)
+        self.set_font('Arial', '', 10)
+        for x in challenge_data['possible_causes']:
+            self.cell(0, 10, "* " + x, 0, 0, 'L')
+            self.ln(5)
+        self.ln(3)
+            
+        # remediations
+        self.set_font('Arial', 'B', 10)
+        self.cell(0, 10, "Remediations", 0, 0, 'L')
+        self.ln(5)
+        self.set_font('Arial', '', 10)
+        for x in challenge_data['remediations']:
+            self.cell(0, 10, "* " + x, 0, 0, 'L')
+            self.ln(5)
+        self.ln(3)
+        
+        # affected issues
+        self.set_font('Arial', 'B', 10)
+        self.cell(30, 10, f"Affected issues: ", 0, 0, 'L')
+        for iss in challenge_data['affected_issues']:
+            self.set_text_color(0,0,255)
+            self.cell(8, 10, str(iss), 0, 0, 'L', link=f"https://github.com/{self.repo}/issues/{iss}")
+        self.set_text_color(0,0,0)
+        self.ln(15)
+
+    def challenges_section(self, key_challenges_data):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Key Challenges', 'B', 0, 'L')
+        self.ln(10)
+        for cd in key_challenges_data:
+            self.add_challenge(cd)
+        self.ln(20)
+    
+    def open_ques_section(self, open_questions):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Open Questions', 'B', 0, 'L')
+        self.ln(10)
+        self.set_font('Arial', '', 10)
+        for qq in open_questions:
+            self.multi_cell(0, 5, "* " + qq, 0, 'L')
+            self.ln(5)
+        self.ln(5)
+    
+    def add_graphs_section(self, title, plot_paths):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, f'[Viz] {title}', 'B', 0, 'L')
+        self.ln(10)
+        for path in plot_paths:
+            if os.path.exists(path):
+                self.add_plot(path)
+        self.ln(10)
+            
+    def add_plot(self, img):
+        self.image(img, x=30, w=150)
+        self.ln(5)
+
+
+def create_report_pdf(repo_name, start_date, end_date, key_challenges_data, executive_summary, open_questions, out_folder):
+    pdf = ReportPDF(repo_name, start_date, end_date)
+    pdf.add_page()
+    pdf.exec_summary(executive_summary)
+    pdf.open_ques_section(open_questions)
+    pdf.challenges_section(key_challenges_data)
+    pdf.add_page()
+    pdf.add_graphs_section("Repo Maintenance", [f'{out_folder}/plots/engagement_sankey.png'])
+    pdf.add_page()
+    pdf.add_graphs_section("Traffic in the last 2 weeks", [f'{out_folder}/plots/{x}.png' for x in ['views_clones','resources', 'referrers']])
+    pdf.add_page()
+    pdf.add_graphs_section("New issues in the last 2 weeks", [f'{out_folder}/plots/{x}.png' for x in ['themes', 'severity', 'sentiment', 'expertise']])
+    pdf.output(f'{out_folder}/report.pdf', 'F')
+

+ 174 - 0
recipes/use_cases/github_triage/plots.py

@@ -0,0 +1,174 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.graph_objects as go
+from utils import fetch_github_endpoint
+
+
+def plot_views_clones(repo_name, out_folder):
+    def json_to_df(json_data, key):
+        df = pd.DataFrame(json_data[key])
+        df['timestamp'] = df['timestamp'].apply(lambda x: x[5:10])
+        if key in ['clones', 'views']:
+            df.rename(columns={'uniques': key}, inplace=True)
+            df.drop(columns=['count'], inplace=True)
+        return df
+
+    unique_clones_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/clones").json()
+    unique_views_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/views").json()
+
+    df1 = json_to_df(unique_clones_2w, 'clones')
+    df2 = json_to_df(unique_views_2w, 'views')
+
+    df = df1.merge(df2, on='timestamp', how='inner')
+
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df['timestamp'], df['views'], color='blue')
+    ax1.set_xlabel('Day', fontsize=18)
+    ax1.set_ylabel('Unique Views', color='blue', fontsize=18)
+    ax1.tick_params(axis='y', labelcolor='blue')
+
+    ax2 = ax1.twinx()
+    ax2.bar(df['timestamp'], df['clones'], color='red')
+    ax2.set_ylabel('Unique Clones', color='red', fontsize=18)
+    ax2.tick_params(axis='y', labelcolor='red')
+
+    plt.title('Views & Clones in the last 2 weeks', fontsize=24)
+    plt.savefig(f'{out_folder}/views_clones.png', dpi=120)  
+    plt.close()
+
+def plot_high_traffic_resources(repo_name, out_folder):
+    popular_paths_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/popular/paths").json()
+    df = pd.DataFrame(popular_paths_2w)
+    df['path'] = df['path'].apply(lambda x: '/'.join(x.split('/')[-2:]))
+    df = df.sort_values(by='uniques', ascending=False).head(10)
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(df['path'], df['uniques'])
+    plt.xlabel('Unique traffic in the last 2 weeks', fontsize=18)
+    # plt.ylabel('Resource', fontsize=18, labelpad=15)
+    plt.title("Popular Resources on the Repository", fontsize=24)
+    plt.tight_layout()
+    plt.savefig(f'{out_folder}/resources.png', dpi=120)
+    plt.close()
+    
+def plot_high_traffic_referrers(repo_name, out_folder):
+    popular_referrer_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/popular/referrers").json()
+    df = pd.DataFrame(popular_referrer_2w)
+    df = df.sort_values(by='uniques', ascending=False)
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(df['referrer'], df['uniques'])
+    plt.xlabel('Unique traffic in the last 2 weeks', fontsize=18)
+    plt.ylabel('Referrer', fontsize=18)
+    plt.title("Popular Referrers to the Repository", fontsize=24)
+    plt.savefig(f'{out_folder}/referrers.png', dpi=120)
+    plt.close()
+
+def plot_commit_activity(repo_name, out_folder):
+    limit = 10
+    today = pd.to_datetime('today')
+    weekly_commit_count_52w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/stats/participation").json()['all'][-limit:]
+    timestamps = [(today - pd.Timedelta(weeks=limit - i)) for i in range(limit)]
+    df = pd.DataFrame({'timestamp': timestamps, 'commit_count': weekly_commit_count_52w})
+
+    plt.figure(figsize=(10, 6))
+    plt.bar(df['timestamp'], df['commit_count'])
+    plt.xlabel('Week', fontsize=18)
+    plt.ylabel('Commit Count', fontsize=18)
+    plt.title(f"Commits in the last {limit} weeks", fontsize=24)
+    plt.savefig(f'{out_folder}/commits.png', dpi=120)
+    plt.close()
+
+def plot_user_expertise(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Beginner', 'Intermediate', 'Advanced']
+    keys = [f"op_expertise_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Expertise': levels, 'Count': [d.get(k, 0) for k in keys]})
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Expertise'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('User Expertise', fontsize=24)
+    plt.savefig(f'{out_folder}/expertise.png', dpi=120)
+    plt.close()
+
+def plot_severity(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Trivial', 'Minor', "Major", 'Critical']
+    keys = [f"severity_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Severity': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Severity'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('Severity', fontsize=24)
+    plt.savefig(f'{out_folder}/severity.png', dpi=120)
+    plt.close()
+
+def plot_sentiment(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Positive', 'Neutral', 'Negative']
+    keys = [f"sentiment_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Sentiment': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Sentiment'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('Sentiment', fontsize=24)
+    plt.savefig(f'{out_folder}/sentiment.png', dpi=120)
+    plt.close()
+        
+def plot_themes(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Documentation', 'Installation and Environment', 'Model Inference', 'Model Fine Tuning and Training', 'Model Evaluation and Benchmarking', 'Model Conversion', 'Cloud Compute', 'CUDA Compatibility', 'Distributed Training and Multi-GPU', 'Invalid', 'Miscellaneous']
+    keys = [f'themes_count_{x.lower().replace(" ", "_").replace("-", "_")}' for x in levels]
+    data = pd.DataFrame({'Theme': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Theme'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    # plt.ylabel('Theme', fontsize=18)
+    plt.title('Themes', fontsize=24)
+    plt.tight_layout()
+    plt.savefig(f'{out_folder}/themes.png', dpi=120)
+    plt.close()
+  
+def issue_activity_sankey(df, out_folder):
+    
+    d = df.to_dict('records')[0]
+    label = ["New Issues", "Issues Under Discussion", "Issues Discussed and Closed", "Issues Not Responded to", "Issues Closed Without Discussion"]
+    values = [
+        d['issues_created'],
+        d['open_discussion'] + d['closed_discussion'],
+        d['closed_discussion'],
+        d['open_no_discussion'] + d['closed_no_discussion'],
+        d['closed_no_discussion']
+    ]
+
+    fig = go.Figure(data=[go.Sankey(
+        node = dict(
+        pad = 15,
+        thickness = 20,
+        line = dict(color = "black", width = 0.5),
+        label = [f"{l} ({values[i]})" for i, l in enumerate(label)],
+        color = ["#007bff", "#17a2b8", "#6610f2", "#dc3545", "#6c757d"]  # color scheme to highlight different flows
+        ),
+        link = dict(
+        source = [0, 1, 0, 3], # flows: New->Under Discussion, Under Discussion->Closed, New->Not Responded, Not Responded->Closed w/o Discussion
+        target = [1, 2, 3, 4],
+        value = [v if v > 0 else 1e-9 for v in values[1:]]
+    ))])
+
+    fig.update_layout(title_text="Issue Flow", font_size=16)
+    fig.update_layout(margin=dict(l=20, r=20, t=60, b=20))  # adjust margins to make text more visible
+    fig.write_image(f"{out_folder}/engagement_sankey.png")
+
+
+def draw_all_plots(repo_name, out_folder, overview):
+    func1 = [plot_views_clones, plot_high_traffic_resources, plot_high_traffic_referrers, plot_commit_activity]
+    func2 = [plot_user_expertise, plot_severity, plot_sentiment, plot_themes, issue_activity_sankey]
+    for func in func1:
+        try:
+            func(repo_name, out_folder)
+        except Exception as e:
+            print(f"GitHub fetch failed for {func.__name__}: {e}. Make sure you have push-access to {repo_name}!")
+    for func in func2:
+        func(overview, out_folder)
+    

+ 6 - 0
recipes/use_cases/github_triage/requirements.txt

@@ -0,0 +1,6 @@
+fire
+fpdf
+groq
+kaleido
+matplotlib
+openai
+pandas
+plotly
+pyyaml
+requests

+ 246 - 0
recipes/use_cases/github_triage/triage.py

@@ -0,0 +1,246 @@
+import os
+from typing import Optional, Tuple, Dict
+import pandas as pd
+import fire
+
+from llm import run_llm_inference
+from utils import fetch_repo_issues, validate_df_values
+from plots import draw_all_plots
+from pdf_report import create_report_pdf
+
+def generate_issue_annotations(
+    issues_df: pd.DataFrame,
+    save_folder: Optional[str] = None
+) -> Tuple[pd.DataFrame, Dict[str, int]]:
+    """
+    Get the annotations for the given issues.
+
+    Args:
+    - issues_df (pd.DataFrame): The DataFrame containing the issues.
+
+    Returns:
+    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated issues DataFrame and the theme counts.
+    """
+
+    # pyre-fixme[6]
+    def _categorize_issues(
+        issues_metadata_df: pd.DataFrame,
+    ) -> Tuple[pd.Series, Dict[str, int]]:
+        """
+        Categorize the issues.
+
+        Args:
+        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.
+
+        Returns:
+        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the categorized issues and the theme counts.
+        """
+        minified_issues = issues_metadata_df[
+            [
+                "number",
+                "summary",
+                "possible_causes",
+                "remediations",
+                "component",
+                "issue_type",
+            ]
+        ].to_dict(orient="records")
+        themes_json = run_llm_inference(
+            "assign_category",
+            str(minified_issues),
+            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},
+        )
+
+        tmp = {}
+        for t in themes_json["report"]:
+            for num in t["related_issues"]:
+                tmp[num] = tmp.get(num, []) + [t["theme"]]
+
+        themes = issues_metadata_df.number.apply(
+            lambda x: tmp.get(x, ["Miscellaneous"])
+        )
+        theme_count = {
+            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]
+        }
+        return themes, theme_count
+
+    discussions = issues_df["discussion"].tolist()
+    metadata = run_llm_inference(
+        "parse_issue",
+        discussions,
+        generation_kwargs={"max_tokens": 2048, "temperature": 0.42},
+    )
+
+    # Handle the case where the LLM returns None instead of a generated response
+    metadata_index = [
+        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None
+    ]
+    metadata = [m for m in metadata if m is not None]
+
+    issues_metadata_df = issues_df.merge(
+        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True
+    )
+
+    themes, theme_count = _categorize_issues(issues_metadata_df)
+    issues_metadata_df["themes"] = themes
+    
+    if save_folder:
+        issues_metadata_df.to_csv(f"{save_folder}/annotated_issues.csv", index=False)
+        
+    return issues_metadata_df, theme_count
+
+
+def generate_executive_reports(
+    annotated_issues: pd.DataFrame,
+    theme_counts: Dict,
+    repo_name: str,
+    start_date: str,
+    end_date: str,
+    save_folder: Optional[str] = None,
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Generate executive reports for the given issues.
+
+    Args:
+    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.
+    - theme_counts (dict): A dictionary containing the theme counts.
+    - repo_name (str): The name of the repository.
+    - start_date (str): The start date of the report.
+    - end_date (str): The end date of the report.
+
+    Returns:
+    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.
+    """
+
+    report = {
+        "repo_name": repo_name,
+        "start_date": start_date,
+        "end_date": end_date,
+        "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),
+        "severity_count": annotated_issues["severity"].value_counts().to_dict(),
+        "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),
+        "themes_count": theme_counts,
+        "issues_created": annotated_issues["number"].nunique(),
+        "open_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)
+            ]
+        ),
+        "closed_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)
+            ]
+        ),
+        "open_no_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments == 0)
+                & (annotated_issues.closed == False)
+            ]
+        ),
+        "closed_no_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)
+            ]
+        ),
+    }
+
+    report_input = str(
+        annotated_issues[
+            ["number", "summary", "possible_causes", "remediations"]
+        ].to_dict("records")
+    )
+    overview = run_llm_inference(
+        "get_overview", str(report_input), {"temperature": 0.45, "max_tokens": 4096}
+    )
+    report.update(overview)
+
+    overview_df = {
+        k: report[k]
+        for k in [
+            "repo_name",
+            "start_date",
+            "end_date",
+            "issues_created",
+            "open_discussion",
+            "closed_discussion",
+            "open_no_discussion",
+            "closed_no_discussion",
+        ]
+    }
+    overview_df["open_questions"] = [report["open_questions"]]
+    overview_df["executive_summary"] = [report["executive_summary"]]
+
+    for col in [
+        "sentiment_count",
+        "severity_count",
+        "op_expertise_count",
+        "themes_count",
+    ]:
+        d = report[col]
+        for k, v in d.items():
+            overview_df[f"{col}_{k}"] = v
+
+    overview_df = pd.DataFrame(overview_df)
+
+    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}
+    challenges_df["key_challenge"] = [
+        k["key_challenge"] for k in report["issue_analysis"]
+    ]
+    challenges_df["affected_issues"] = [
+        k["affected_issues"] for k in report["issue_analysis"]
+    ]
+    challenges_df["possible_causes"] = [
+        k["possible_causes"] for k in report["issue_analysis"]
+    ]
+    challenges_df["remediations"] = [
+        k["remediations"] for k in report["issue_analysis"]
+    ]
+    challenges_df = pd.DataFrame(challenges_df)
+
+    return challenges_df, overview_df
+
+
+def create_report(repo_name, start_date, end_date, challenges, overview, out_folder):
+    
+    # generate pdf report
+    challenges = validate_df_values(challenges)
+    overview = validate_df_values(overview)
+    exec_summary = overview['executive_summary'].iloc[0]
+    open_qs = overview['open_questions'].iloc[0]
+    key_challenges_data = challenges[['key_challenge', 'possible_causes', 'remediations', 'affected_issues']].to_dict('records')
+    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)
+       
+   
+def main(repo_name, start_date, end_date):
+    out_folder = f'output/{repo_name}/{start_date}_{end_date}'
+    os.makedirs(out_folder, exist_ok=True)
+    
+    # Get issues data
+    issues_df = fetch_repo_issues(repo_name, start_date, end_date)
+    
+    # Generate annotations and metadata
+    annotated_issues, theme_counts = generate_issue_annotations(issues_df)
+    
+    # Generate high-level analysis
+    challenges, overview = generate_executive_reports(annotated_issues, theme_counts, repo_name, start_date, end_date)
+    
+    # Validate and save generated data
+    annotated_issues = validate_df_values(annotated_issues)
+    challenges = validate_df_values(challenges)
+    overview = validate_df_values(overview)
+    
+    # Create graphs and charts
+    plot_folder = out_folder + "/plots"
+    os.makedirs(plot_folder, exist_ok=True)
+    draw_all_plots(repo_name, plot_folder, overview)
+    
+    # Create PDF report
+    exec_summary = overview['executive_summary'].iloc[0]
+    open_qs = overview['open_questions'].iloc[0]
+    key_challenges_data = challenges[['key_challenge', 'possible_causes', 'remediations', 'affected_issues']].to_dict('records')
+    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)
+    
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
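
A sketch of the command line this entry point expects; fire.Fire(main) maps CLI flags to main's arguments, and the repository name and dates here are illustrative:

    python triage.py --repo_name owner/repo --start_date 2024-06-01 --end_date 2024-06-15

Plots and report.pdf are written under output/<repo_name>/<start_date>_<end_date>/.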

+ 97 - 0
recipes/use_cases/github_triage/utils.py

@@ -0,0 +1,97 @@
+import ast
+
+import requests
+import yaml
+import pandas as pd
+
+
+with open("config.yaml", "r") as f:
+    CFG = yaml.safe_load(f)
+
+
+def fetch_github_endpoint(url):
+    headers = {
+        "Authorization": f"Bearer {CFG['tokens']['github']}",
+        "Content-Type": "application/json"
+    }
+    response = requests.get(url, headers=headers, timeout=10)
+    return response
+
+
+def fetch_repo_issues(repo, start_date=None, end_date=None):
+    time_filter = ""
+    if start_date and not end_date:
+        time_filter = f"+created:>{start_date}"
+    if end_date and not start_date:
+        time_filter = f"+created:<{end_date}"
+    if start_date and end_date:
+        time_filter = f"+created:{start_date}..{end_date}"
+    
+    url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"
+
+    samples = []
+    print(f"[{repo}/issues] Fetching page: ", end=" ", flush=True)
+
+    while True:
+        response = fetch_github_endpoint(url)
+
+        if response.status_code == 200:
+            print(". ", end=" ", flush=True)
+            issues = response.json()['items']
+            for issue in issues:
+                if issue['body'] is None:
+                    continue
+                
+                issue['discussion'] = issue['title'] + "\n" + issue['body']
+                if issue['comments'] > 0:
+                    comments_response = fetch_github_endpoint(issue['comments_url']).json()
+                    comments = "\n> ".join([x['body'] for x in comments_response])
+                    issue['discussion'] += "\n> " + comments
+                    
+                samples.append(issue)
+        
+            # Check if there are more pages
+            if "Link" in response.headers:
+                link_header = [h.split(';') for h in response.headers["Link"].split(', ')]
+                link_header = [x for x in link_header if "next" in x[1]]
+                if link_header:
+                    url = link_header[0][0].strip().replace('<', '').replace('>','')
+                else:
+                    break
+            else:
+                break
+        else:
+            print(f"Error: {response.status_code}")
+            break
+
+    rows = [{
+        "repo_name": repo,
+        "number": d['number'],
+        "html_url": d['html_url'],
+        "closed": (d['state'] == 'closed'),
+        "num_comments": d['comments'],
+        "created_at": d["created_at"],
+        "discussion": d['discussion'],
+    } for d in samples]
+    
+    return pd.DataFrame(rows)
+
+
+def fetch_repo_stats(repo):
+    repo_info = fetch_github_endpoint(f"https://api.github.com/repos/{repo}").json()
+    
+    repo_stats = {
+        "Total Open Issues": repo_info['open_issues_count'],
+        "Total Stars": repo_info['stargazers_count'],
+        "Total Forks": repo_info['forks_count'],
+    }
+    
+    return repo_stats
+
+
+def validate_df_values(df, out_folder=None, name=None):
+    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
+    for c in df.columns:
+        x = df[c].iloc[0]
+        if isinstance(x, str) and '[' in x:
+            df[c] = df[c].apply(ast.literal_eval)  # safer than eval for stringified lists
+    if out_folder is not None:
+        df.to_csv(f"{out_folder}/{name}.csv", index=False)
+    return df