
Add auto triage tool

Suraj Subramanian 7 months ago
parent
commit
a5ddea144a

+ 146 - 0
recipes/use_cases/github_triage/config.yaml

File diff suppressed because the file is too large
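
The config.yaml diff is suppressed above, but every module below reads it through CFG[...]. From those accesses, a minimal sketch of the expected shape looks roughly like the following; all values here are placeholders, not the committed contents:

    model:
      use: groq                  # backend selector read by run_llm_inference: "vllm" or "groq"
      vllm:
        endpoint: http://localhost:8000/v1   # placeholder OpenAI-compatible endpoint
        model_id: <model-name>
      groq:
        key: <groq-api-key>
        model_id: <model-name>
    tokens:
      github: <github-token>     # used by utils.fetch_github_endpoint
    prompts:                     # prompt names referenced in triage.py
      parse_issue:
        system: "..."
        json_schema: "..."
      assign_category:
        system: "..."
        json_schema: "..."
      get_overview:
        system: "..."
        json_schema: "..."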


+ 160 - 0
recipes/use_cases/github_triage/llm.py

@@ -0,0 +1,160 @@
+import logging 
+from typing import Any, Dict, List, Optional, Union
+import yaml
+import time
+import json
+
+from openai import OpenAI
+import groq
+
+log = logging.getLogger(__name__)
+with open("config.yaml", "r") as f:
+    CFG = yaml.safe_load(f)
+
+class LlamaVLLM():
+    def __init__(self, endpoint, model_id):
+        self.model_id = model_id
+        self.client = OpenAI(base_url=endpoint, api_key='token')
+
+    def chat(
+        self,
+        inputs: List[Dict[str, str]],
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        guided_decode_json_schema: Optional[str] = None
+    ) -> Optional[str]:
+
+        if generation_kwargs is None:
+            generation_kwargs = {}
+            
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model_id,
+                messages=inputs,
+                extra_body={
+                    "guided_json": guided_decode_json_schema
+                },
+                **generation_kwargs,
+            )
+            output = response.choices[0].message.content
+        except Exception as e:
+            log.error(
+                f"FAILED to generate inference for input {inputs}\nError: {str(e)}"
+            )
+            output = None
+        return output
+    
+
+class LlamaGroq():
+    def __init__(self, key, model_id):
+        self.model_id = model_id
+        self.client = groq.Groq(api_key=key)
+        print(f"Using Groq:{self.model_id} for inference")
+
+    def chat(
+        self, 
+        inputs: List[Dict[str, str]], 
+        generation_kwargs: Optional[Dict[str, Any]] = None,
+        guided_decode_json_schema: Optional[str] = None
+    ) -> Optional[str]:
+        
+        if generation_kwargs is None:
+            generation_kwargs = {}
+            
+        # Currently Groq doesn't support guided JSON decoding. Workaround:
+        if guided_decode_json_schema is not None:
+            inputs[0]['content'] += f"\n\nEnsure your response aligns with the following JSON schema:\n{guided_decode_json_schema}\n\n"
+        
+        output = None
+        
+        while True:
+            try:
+                response = self.client.chat.completions.with_raw_response.create(
+                    model=self.model_id,
+                    messages=inputs,
+                    stream=False,
+                    **generation_kwargs,
+                    response_format={"type": 'json_object' if guided_decode_json_schema is not None else 'text'}
+                )
+                completion = response.parse()
+                output = completion.choices[0].message.content
+                break
+            except groq.RateLimitError as e:
+                response = e.response
+                wait = float(response.headers['X-Ratelimit-Reset'])
+                print(e)
+                print(f"waiting for {wait}s to prevent rate limiting")
+                time.sleep(wait)
+            except Exception as e:
+                print(f"inference failed for input: {inputs}\nError: {str(e)}")
+                break
+
+        return output
+
+
+def run_llm_inference(
+    prompt_name: str,
+    inputs: Union[str, List[str]],
+    generation_kwargs: Optional[Dict] = None,
+    guided_decode_json_schema=None,
+) -> Union[List[str], List[Dict[str, Any]]]:
+    """
+    Run the LLM inference on the given inputs.
+
+    Args:
+    - prompt_name (str): The name of the prompt to use.
+    - inputs (str or List[str]): The input(s) to the LLM.
+    - generation_kwargs (Dict): Additional keyword arguments to pass to the LLM.
+    - guided_decode_json_schema (str): The JSON schema to use for guided decoding.
+
+    Returns:
+    - Union[str, List[str], Dict, List[Dict]]: The response(s) from the LLM, parsed into dict(s) when a JSON schema is in effect.
+    """
+    log.info(f"[run_llm_inference] {prompt_name}")
+    
+    # initialize appropriate LLM accessor
+    if CFG['model']['use'] == 'vllm':
+        LLM = LlamaVLLM(**CFG['model']['vllm'])
+    elif CFG['model']['use'] == 'groq':
+        LLM = LlamaGroq(**CFG['model']['groq'])
+    else:
+        raise ValueError("Invalid model type in config.yaml")
+    
+    _batch = True
+    if isinstance(inputs, str):
+        _batch = False
+        inputs = [inputs]
+
+    inputs = [
+        [
+            {"role": "system", "content": CFG["prompts"][prompt_name]["system"]},
+            {"role": "user", "content": i},
+        ]
+        for i in inputs
+    ]
+
+    if (
+        guided_decode_json_schema is None
+        and "json_schema" in CFG["prompts"][prompt_name]
+    ):
+        guided_decode_json_schema = " ".join(
+            CFG["prompts"][prompt_name]["json_schema"].split()
+        )
+
+    responses = [
+        LLM.chat(i, generation_kwargs, guided_decode_json_schema) for i in inputs
+    ]
+
+    if guided_decode_json_schema is not None:
+        responses_json = []
+        for r in responses:
+            if r is not None:
+                try:
+                    responses_json.append(json.loads(r, strict=False))
+                    continue
+                except json.JSONDecodeError:
+                    log.error(f"Error decoding JSON: {r}")
+            responses_json.append(None)
+        responses = responses_json
+
+    if not _batch:
+        responses = responses[0]
+
+    return responses
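
A minimal usage sketch of run_llm_inference, assuming config.yaml defines the parse_issue prompt that triage.py references (the issue text and generation settings here are illustrative):

    # one input string -> one response; parsed into a dict when the prompt has a json_schema
    result = run_llm_inference(
        "parse_issue",
        "Title: CUDA out of memory during fine-tuning\nBody: ...",
        generation_kwargs={"temperature": 0.4, "max_tokens": 2048},
    )
    # a list of inputs returns a list of responses, one per input
    results = run_llm_inference("parse_issue", ["first issue text", "second issue text"])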

+ 111 - 0
recipes/use_cases/github_triage/pdf_report.py

@@ -0,0 +1,111 @@
+from fpdf import FPDF
+import os
+from datetime import datetime
+
+class ReportPDF(FPDF):
+    def __init__(self, repository_name, start_date, end_date):
+        super().__init__(orientation='P', unit='mm', format='A4')
+        self.repo = repository_name
+        self.start_end = f"{datetime.strptime(start_date, '%Y-%m-%d').strftime('%b %d, %Y')} to {datetime.strptime(end_date, '%Y-%m-%d').strftime('%b %d, %Y')}"
+        
+    def header(self):
+        self.set_font('Arial', 'B', 12)
+        self.cell(100, 10, f'AutoTriage Report: {self.repo}', 0, 0)
+        self.cell(90, 10, self.start_end, 0, 0, 'R')
+        self.ln(20)
+
+    def footer(self):
+        self.set_y(-15)
+        self.set_font('Arial', 'I', 8)
+        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
+        
+    def exec_summary(self, text):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Executive Summary', 'B', 0, 'L')
+        self.ln(10)
+        self.set_font('Arial', '', 10)
+        self.multi_cell(0, 5, text)
+        self.ln(10)
+    
+    def add_challenge(self, challenge_data):
+        # title
+        self.set_font('Arial', '', 14)
+        self.cell(0, 10, f"{challenge_data['key_challenge']}", 0, 0, 'L')
+        self.ln(8)
+        
+        # possible causes
+        self.set_font('Arial', 'B', 10)
+        self.cell(0, 10, "Possible Causes", 0, 0, 'L')
+        self.ln(5)
+        self.set_font('Arial', '', 10)
+        for x in challenge_data['possible_causes']:
+            self.cell(0, 10, "* " + x, 0, 0, 'L')
+            self.ln(5)
+        self.ln(3)
+            
+        # remediations
+        self.set_font('Arial', 'B', 10)
+        self.cell(0, 10, "Remediations", 0, 0, 'L')
+        self.ln(5)
+        self.set_font('Arial', '', 10)
+        for x in challenge_data['remediations']:
+            self.cell(0, 10, "* " + x, 0, 0, 'L')
+            self.ln(5)
+        self.ln(3)
+        
+        # affected issues
+        self.set_font('Arial', 'B', 10)
+        self.cell(30, 10, f"Affected issues: ", 0, 0, 'L')
+        for iss in challenge_data['affected_issues']:
+            self.set_text_color(0,0,255)
+            self.cell(8, 10, str(iss), 0, 0, 'L', link=f"https://github.com/{self.repo}/issues/{iss}")
+        self.set_text_color(0,0,0)
+        self.ln(15)
+
+    def challenges_section(self, key_challenges_data):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Key Challenges', 'B', 0, 'L')
+        self.ln(10)
+        for cd in key_challenges_data:
+            self.add_challenge(cd)
+        self.ln(20)
+    
+    def open_ques_section(self, open_questions):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, 'Open Questions', 'B', 0, 'L')
+        self.ln(10)
+        self.set_font('Arial', '', 10)
+        for qq in open_questions:
+            self.multi_cell(0, 5, "* " + qq, 0, 'L')
+            self.ln(5)
+        self.ln(5)
+    
+    def add_graphs_section(self, title, plot_paths):
+        self.set_font('Arial', 'B', 16)
+        self.cell(0, 8, f'[Viz] {title}', 'B', 0, 'L')
+        self.ln(10)
+        for path in plot_paths:
+            if os.path.exists(path):
+                self.add_plot(path)
+        self.ln(10)
+            
+    def add_plot(self, img):
+        self.image(img, x=30, w=150)
+        self.ln(5)
+
+
+def create_report_pdf(repo_name, start_date, end_date, key_challenges_data, executive_summary, open_questions, out_folder):
+    pdf = ReportPDF(repo_name, start_date, end_date)
+    pdf.add_page()
+    pdf.exec_summary(executive_summary)
+    pdf.open_ques_section(open_questions)
+    pdf.challenges_section(key_challenges_data)
+    pdf.add_page()
+    pdf.add_graphs_section("Repo Maintenance", [f'{out_folder}/plots/engagement_sankey.png'])
+    pdf.add_page()
+    pdf.add_graphs_section("Traffic in the last 2 weeks", [f'{out_folder}/plots/{x}.png' for x in ['views_clones','resources', 'referrers']])
+    pdf.add_page()
+    pdf.add_graphs_section("New issues in the last 2 weeks", [f'{out_folder}/plots/{x}.png' for x in ['themes', 'severity', 'sentiment', 'expertise']])
+    pdf.output(f'{out_folder}/report.pdf', 'F')
+

+ 174 - 0
recipes/use_cases/github_triage/plots.py

@@ -0,0 +1,174 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+import plotly.graph_objects as go
+from utils import fetch_github_endpoint
+
+
+def plot_views_clones(repo_name, out_folder):
+    def json_to_df(json_data, key):
+        df = pd.DataFrame(json_data[key])
+        df['timestamp'] = df['timestamp'].apply(lambda x: x[5:10])
+        if key in ['clones', 'views']:
+            df.rename(columns={'uniques': key}, inplace=True)
+            df.drop(columns=['count'], inplace=True)
+        return df
+
+    unique_clones_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/clones").json()
+    unique_views_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/views").json()
+
+    df1 = json_to_df(unique_clones_2w, 'clones')
+    df2 = json_to_df(unique_views_2w, 'views')
+
+    df = df1.merge(df2, on='timestamp', how='inner')
+
+    fig, ax1 = plt.subplots(figsize=(10, 6))
+    ax1.plot(df['timestamp'], df['views'], color='blue')
+    ax1.set_xlabel('Day', fontsize=18)
+    ax1.set_ylabel('Unique Views', color='blue', fontsize=18)
+    ax1.tick_params(axis='y', labelcolor='blue')
+
+    ax2 = ax1.twinx()
+    ax2.bar(df['timestamp'], df['clones'], color='red')
+    ax2.set_ylabel('Unique Clones', color='red', fontsize=18)
+    ax2.tick_params(axis='y', labelcolor='red')
+
+    plt.title('Views & Clones in the last 2 weeks', fontsize=24)
+    plt.savefig(f'{out_folder}/views_clones.png', dpi=120)  
+    plt.close()
+
+def plot_high_traffic_resources(repo_name, out_folder):
+    popular_paths_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/popular/paths").json()
+    df = pd.DataFrame(popular_paths_2w)
+    df['path'] = df['path'].apply(lambda x: '/'.join(x.split('/')[-2:]))
+    df = df.sort_values(by='uniques', ascending=False).head(10)
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(df['path'], df['uniques'])
+    plt.xlabel('Unique traffic in the last 2 weeks', fontsize=18)
+    # plt.ylabel('Resource', fontsize=18, labelpad=15)
+    plt.title("Popular Resources on the Repository", fontsize=24)
+    plt.tight_layout()
+    plt.savefig(f'{out_folder}/resources.png', dpi=120)
+    plt.close()
+    
+def plot_high_traffic_referrers(repo_name, out_folder):
+    popular_referrer_2w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/traffic/popular/referrers").json()
+    df = pd.DataFrame(popular_referrer_2w)
+    df = df.sort_values(by='uniques', ascending=False)
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(df['referrer'], df['uniques'])
+    plt.xlabel('Unique traffic in the last 2 weeks', fontsize=18)
+    plt.ylabel('Referrer', fontsize=18)
+    plt.title("Popular Referrers to the Repository", fontsize=24)
+    plt.savefig(f'{out_folder}/referrers.png', dpi=120)
+    plt.close()
+
+def plot_commit_activity(repo_name, out_folder):
+    limit = 10
+    today = pd.to_datetime('today')
+    weekly_commit_count_52w = fetch_github_endpoint(f"https://api.github.com/repos/{repo_name}/stats/participation").json()['all'][-limit:]
+    timestamps = [(today - pd.Timedelta(weeks=limit - i)) for i in range(limit)]
+    df = pd.DataFrame({'timestamp': timestamps, 'commit_count': weekly_commit_count_52w})
+
+    plt.figure(figsize=(10, 6))
+    plt.bar(df['timestamp'], df['commit_count'])
+    plt.xlabel('Week', fontsize=18)
+    plt.ylabel('Commit Count', fontsize=18)
+    plt.title(f"Commits in the last {limit} weeks", fontsize=24)
+    plt.savefig(f'{out_folder}/commits.png', dpi=120)
+    plt.close()
+
+def plot_user_expertise(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Beginner', 'Intermediate', 'Advanced']
+    keys = [f"op_expertise_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Expertise': levels, 'Count': [d.get(k, 0) for k in keys]})
+
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Expertise'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('User Expertise', fontsize=24)
+    plt.savefig(f'{out_folder}/expertise.png', dpi=120)
+    plt.close()
+
+def plot_severity(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Trivial', 'Minor', "Major", 'Critical']
+    keys = [f"severity_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Severity': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Severity'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('Severity', fontsize=24)
+    plt.savefig(f'{out_folder}/severity.png', dpi=120)
+    plt.close()
+
+def plot_sentiment(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Positive', 'Neutral', 'Negative']
+    keys = [f"sentiment_count_{x.lower()}" for x in levels]
+    data = pd.DataFrame({'Sentiment': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Sentiment'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    plt.title('Sentiment', fontsize=24)
+    plt.savefig(f'{out_folder}/sentiment.png', dpi=120)
+    plt.close()
+        
+def plot_themes(df, out_folder):
+    d = df.to_dict('records')[0]
+    levels = ['Documentation', 'Installation and Environment', 'Model Inference', 'Model Fine Tuning and Training', 'Model Evaluation and Benchmarking', 'Model Conversion', 'Cloud Compute', 'CUDA Compatibility', 'Distributed Training and Multi-GPU', 'Invalid', 'Miscellaneous']
+    keys = [f'themes_count_{x.lower().replace(" ", "_").replace("-", "_")}' for x in levels]
+    data = pd.DataFrame({'Theme': levels, 'Count': [d.get(k, 0) for k in keys]})
+    plt.figure(figsize=(10, 6))
+    plt.barh(data['Theme'], data['Count'])
+    plt.xlabel('Count', fontsize=18)
+    # plt.ylabel('Theme', fontsize=18)
+    plt.title('Themes', fontsize=24)
+    plt.tight_layout()
+    plt.savefig(f'{out_folder}/themes.png', dpi=120)
+    plt.close()
+  
+def issue_activity_sankey(df, out_folder):
+    
+    d = df.to_dict('records')[0]
+    label = ["New Issues", "Issues Under Discussion", "Issues Discussed and Closed", "Issues Not Responded to", "Issues Closed Without Discussion"]
+    values = [
+        d['issues_created'],
+        d['open_discussion'] + d['closed_discussion'],
+        d['closed_discussion'],
+        d['open_no_discussion'] + d['closed_no_discussion'],
+        d['closed_no_discussion']
+    ]
+
+    fig = go.Figure(data=[go.Sankey(
+        node = dict(
+        pad = 15,
+        thickness = 20,
+        line = dict(color = "black", width = 0.5),
+        label = [f"{l} ({values[i]})" for i, l in enumerate(label)],
+        color = ["#007bff", "#17a2b8", "#6610f2", "#dc3545", "#6c757d"]  # color scheme to highlight different flows
+        ),
+        link = dict(
+        source = [0, 1, 0, 3], # flows: New->Under Discussion, Under Discussion->Closed, New->Not Responded, Not Responded->Closed w/o Discussion
+        target = [1, 2, 3, 4],
+        value = [v if v > 0 else 1e-9 for v in values[1:]]
+    ))])
+
+    fig.update_layout(title_text="Issue Flow", font_size=16)
+    fig.update_layout(margin=dict(l=20, r=20, t=60, b=20))  # adjust margins to make text more visible
+    fig.write_image(f"{out_folder}/engagement_sankey.png")
+
+
+def draw_all_plots(repo_name, out_folder, overview):
+    func1 = [plot_views_clones, plot_high_traffic_resources, plot_high_traffic_referrers, plot_commit_activity]
+    func2 = [plot_user_expertise, plot_severity, plot_sentiment, plot_themes, issue_activity_sankey]
+    for func in func1:
+        try:
+            func(repo_name, out_folder)
+        except Exception as e:
+            print(f"GitHub fetch failed for {func.__name__}: {e}. Make sure you have push-access to {repo_name}!")
+    for func in func2:
+        func(overview, out_folder)
+    

+ 6 - 0
recipes/use_cases/github_triage/requirements.txt

@@ -0,0 +1,6 @@
+fire
+fpdf
+groq
+kaleido
+matplotlib
+openai
+pandas
+plotly
+pyyaml
+requests

+ 246 - 0
recipes/use_cases/github_triage/triage.py

@@ -0,0 +1,246 @@
+import os
+from typing import Optional, Tuple, Dict
+import pandas as pd
+import fire
+
+from llm import run_llm_inference
+from utils import fetch_repo_issues, validate_df_values
+from plots import draw_all_plots
+from pdf_report import create_report_pdf
+
+def generate_issue_annotations(
+    issues_df: pd.DataFrame,
+    save_folder: Optional[str] = None
+) -> Tuple[pd.DataFrame, Dict[str, int]]:
+    """
+    Get the annotations for the given issues.
+
+    Args:
+    - issues_df (pd.DataFrame): The DataFrame containing the issues.
+
+    Returns:
+    - Tuple[pd.DataFrame, Dict[str, int]]: A tuple containing the annotated issues DataFrame and the theme counts.
+    """
+
+    # pyre-fixme[6]
+    def _categorize_issues(
+        issues_metadata_df: pd.DataFrame,
+    ) -> Tuple[pd.Series, Dict[str, int]]:
+        """
+        Categorize the issues.
+
+        Args:
+        - issues_metadata_df (pd.DataFrame): The DataFrame containing the issues metadata.
+
+        Returns:
+        - Tuple[pd.Series, Dict[str, int]]: A tuple containing the categorized issues and the theme counts.
+        """
+        minified_issues = issues_metadata_df[
+            [
+                "number",
+                "summary",
+                "possible_causes",
+                "remediations",
+                "component",
+                "issue_type",
+            ]
+        ].to_dict(orient="records")
+        themes_json = run_llm_inference(
+            "assign_category",
+            str(minified_issues),
+            generation_kwargs={"temperature": 0.45, "max_tokens": 2048},
+        )
+
+        tmp = {}
+        for t in themes_json["report"]:
+            for num in t["related_issues"]:
+                tmp[num] = tmp.get(num, []) + [t["theme"]]
+
+        themes = issues_metadata_df.number.apply(
+            lambda x: tmp.get(x, ["Miscellaneous"])
+        )
+        theme_count = {
+            k["theme"]: len(k["related_issues"]) for k in themes_json["report"]
+        }
+        return themes, theme_count
+
+    discussions = issues_df["discussion"].tolist()
+    metadata = run_llm_inference(
+        "parse_issue",
+        discussions,
+        generation_kwargs={"max_tokens": 2048, "temperature": 0.42},
+    )
+
+    # Handle the case where the LLM returns None instead of a generated response
+    metadata_index = [
+        issues_df.index[i] for i in range(len(metadata)) if metadata[i] is not None
+    ]
+    metadata = [m for m in metadata if m is not None]
+
+    issues_metadata_df = issues_df.merge(
+        pd.DataFrame(metadata, index=metadata_index), left_index=True, right_index=True
+    )
+
+    themes, theme_count = _categorize_issues(issues_metadata_df)
+    issues_metadata_df["themes"] = themes
+    
+    if save_folder:
+        issues_metadata_df.to_csv(f"{save_folder}/annotated_issues.csv", index=False)
+        
+    return issues_metadata_df, theme_count
+
+
+def generate_executive_reports(
+    annotated_issues: pd.DataFrame,
+    theme_counts: Dict,
+    repo_name: str,
+    start_date: str,
+    end_date: str,
+    save_folder: Optional[str] = None,
+) -> Tuple[pd.DataFrame, pd.DataFrame]:
+    """
+    Generate executive reports for the given issues.
+
+    Args:
+    - annotated_issues (pd.DataFrame): The DataFrame containing the annotated issues.
+    - theme_counts (dict): A dictionary containing the theme counts.
+    - repo_name (str): The name of the repository.
+    - start_date (str): The start date of the report.
+    - end_date (str): The end date of the report.
+
+    Returns:
+    - Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing the challenges DataFrame and the overview DataFrame.
+    """
+
+    report = {
+        "repo_name": repo_name,
+        "start_date": start_date,
+        "end_date": end_date,
+        "sentiment_count": annotated_issues["sentiment"].value_counts().to_dict(),
+        "severity_count": annotated_issues["severity"].value_counts().to_dict(),
+        "op_expertise_count": annotated_issues["op_expertise"].value_counts().to_dict(),
+        "themes_count": theme_counts,
+        "issues_created": annotated_issues["number"].nunique(),
+        "open_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments > 0) & (annotated_issues.closed == False)
+            ]
+        ),
+        "closed_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments > 0) & (annotated_issues.closed == True)
+            ]
+        ),
+        "open_no_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments == 0)
+                & (annotated_issues.closed == False)
+            ]
+        ),
+        "closed_no_discussion": len(
+            annotated_issues[
+                (annotated_issues.num_comments == 0) & (annotated_issues.closed == True)
+            ]
+        ),
+    }
+
+    report_input = str(
+        annotated_issues[
+            ["number", "summary", "possible_causes", "remediations"]
+        ].to_dict("records")
+    )
+    overview = run_llm_inference(
+        "get_overview", str(report_input), {"temperature": 0.45, "max_tokens": 4096}
+    )
+    report.update(overview)
+
+    overview_df = {
+        k: report[k]
+        for k in [
+            "repo_name",
+            "start_date",
+            "end_date",
+            "issues_created",
+            "open_discussion",
+            "closed_discussion",
+            "open_no_discussion",
+            "closed_no_discussion",
+        ]
+    }
+    overview_df["open_questions"] = [report["open_questions"]]
+    overview_df["executive_summary"] = [report["executive_summary"]]
+
+    for col in [
+        "sentiment_count",
+        "severity_count",
+        "op_expertise_count",
+        "themes_count",
+    ]:
+        d = report[col]
+        for k, v in d.items():
+            overview_df[f"{col}_{k}"] = v
+
+    overview_df = pd.DataFrame(overview_df)
+
+    challenges_df = {k: report[k] for k in ["repo_name", "start_date", "end_date"]}
+    challenges_df["key_challenge"] = [
+        k["key_challenge"] for k in report["issue_analysis"]
+    ]
+    challenges_df["affected_issues"] = [
+        k["affected_issues"] for k in report["issue_analysis"]
+    ]
+    challenges_df["possible_causes"] = [
+        k["possible_causes"] for k in report["issue_analysis"]
+    ]
+    challenges_df["remediations"] = [
+        k["remediations"] for k in report["issue_analysis"]
+    ]
+    challenges_df = pd.DataFrame(challenges_df)
+
+    return challenges_df, overview_df
+
+
+def create_report(repo_name, start_date, end_date, challenges, overview, out_folder):
+    
+    # generate pdf report
+    challenges = validate_df_values(challenges)
+    overview = validate_df_values(overview)
+    exec_summary = overview['executive_summary'].iloc[0]
+    open_qs = overview['open_questions'].iloc[0]
+    key_challenges_data = challenges[['key_challenge', 'possible_causes', 'remediations', 'affected_issues']].to_dict('records')
+    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)
+       
+   
+def main(repo_name, start_date, end_date):
+    out_folder = f'output/{repo_name}/{start_date}_{end_date}'
+    os.makedirs(out_folder, exist_ok=True)
+    
+    # Get issues data
+    issues_df = fetch_repo_issues(repo_name, start_date, end_date)
+    
+    # Generate annotations and metadata
+    annotated_issues, theme_counts = generate_issue_annotations(issues_df)
+    
+    # Generate high-level analysis
+    challenges, overview = generate_executive_reports(annotated_issues, theme_counts, repo_name, start_date, end_date)
+    
+    # Validate and save generated data
+    annotated_issues = validate_df_values(annotated_issues)
+    challenges = validate_df_values(challenges)
+    overview = validate_df_values(overview)
+    
+    # Create graphs and charts
+    plot_folder = out_folder + "/plots"
+    os.makedirs(plot_folder, exist_ok=True)
+    draw_all_plots(repo_name, plot_folder, overview)
+    
+    # Create PDF report
+    exec_summary = overview['executive_summary'].iloc[0]
+    open_qs = overview['open_questions'].iloc[0]
+    key_challenges_data = challenges[['key_challenge', 'possible_causes', 'remediations', 'affected_issues']].to_dict('records')
+    create_report_pdf(repo_name, start_date, end_date, key_challenges_data, exec_summary, open_qs, out_folder)
+    
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
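
A sketch of the command line this entry point expects; fire.Fire(main) maps CLI flags to main's arguments, and the repository name and dates here are illustrative:

    python triage.py --repo_name owner/repo --start_date 2024-06-01 --end_date 2024-06-15

Plots and report.pdf are written under output/<repo_name>/<start_date>_<end_date>/.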

+ 97 - 0
recipes/use_cases/github_triage/utils.py

@@ -0,0 +1,97 @@
+import ast
+
+import requests
+import yaml
+import pandas as pd
+
+
+with open("config.yaml", "r") as f:
+    CFG = yaml.safe_load(f)
+
+
+def fetch_github_endpoint(url):
+    headers = {
+        "Authorization": f"Bearer {CFG['tokens']['github']}",
+        "Content-Type": "application/json"
+    }
+    response = requests.get(url, headers=headers, timeout=10)
+    return response
+
+
+def fetch_repo_issues(repo, start_date=None, end_date=None):
+    time_filter = ""
+    if start_date and not end_date:
+        time_filter = f"+created:>{start_date}"
+    if end_date and not start_date:
+        time_filter = f"+created:<{end_date}"
+    if start_date and end_date:
+        time_filter = f"+created:{start_date}..{end_date}"
+    
+    url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"
+
+    samples = []
+    print(f"[{repo}/issues] Fetching page: ", end=" ", flush=True)
+
+    while True:
+        response = fetch_github_endpoint(url)
+
+        if response.status_code == 200:
+            print(". ", end=" ", flush=True)
+            issues = response.json()['items']
+            for issue in issues:
+                if issue['body'] is None:
+                    continue
+                
+                issue['discussion'] = issue['title'] + "\n" + issue['body']
+                if issue['comments'] > 0:
+                    comments_response = fetch_github_endpoint(issue['comments_url']).json()
+                    comments = "\n> ".join([x['body'] for x in comments_response])
+                    issue['discussion'] += "\n> " + comments
+                    
+                samples.append(issue)
+        
+            # Check if there are more pages
+            if "Link" in response.headers:
+                link_header = [h.split(';') for h in response.headers["Link"].split(', ')]
+                link_header = [x for x in link_header if "next" in x[1]]
+                if link_header:
+                    url = link_header[0][0].strip().replace('<', '').replace('>','')
+                else:
+                    break
+            else:
+                break
+        else:
+            print(f"Error: {response.status_code}")
+            break
+
+    rows = [{
+        "repo_name": repo,
+        "number": d['number'],
+        "html_url": d['html_url'],
+        "closed": (d['state'] == 'closed'),
+        "num_comments": d['comments'],
+        "created_at": d["created_at"],
+        "discussion": d['discussion'],
+    } for d in samples]
+    
+    return pd.DataFrame(rows)
+
+
+def fetch_repo_stats(repo):
+    repo_info = fetch_github_endpoint(f"https://api.github.com/repos/{repo}").json()
+    
+    repo_stats = {
+        "Total Open Issues": repo_info['open_issues_count'],
+        "Total Stars": repo_info['stargazers_count'],
+        "Total Forks": repo_info['forks_count'],
+    }
+    
+    return repo_stats
+
+
+def validate_df_values(df, out_folder=None, name=None):
+    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
+    for c in df.columns:
+        x = df[c].iloc[0]
+        if isinstance(x, str) and '[' in x:
+            df[c] = df[c].apply(ast.literal_eval)  # safer than eval for stringified lists
+    if out_folder is not None:
+        df.to_csv(f"{out_folder}/{name}.csv", index=False)
+    return df