import logging

import pandas as pd
import requests
import yaml

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)  # without an explicit level, the info/debug calls below are dropped

# Load the shared configuration (expects a `github_token` key).
with open("config.yaml", "r") as f:
    CFG = yaml.safe_load(f)


def fetch_github_endpoint(url):
    """GET a GitHub API URL with the configured bearer token and return the raw response."""
    headers = {
        "Authorization": f"Bearer {CFG['github_token']}",
        "Content-Type": "application/json",
    }
    logger.debug(f"Requesting url: {url}")
    response = requests.get(url, headers=headers, timeout=10)
    return response


def fetch_repo_issues(repo, start_date=None, end_date=None):
    """Fetch all issues for a repo via the search API, optionally filtered by creation
    date, and return a DataFrame with each issue's title, body, and comments flattened
    into a single `discussion` field."""
    time_filter = ""
    if start_date and not end_date:
        time_filter = f"+created:>{start_date}"
    if end_date and not start_date:
        time_filter = f"+created:<{end_date}"
    if start_date and end_date:
        time_filter = f"+created:{start_date}..{end_date}"

    url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"

    samples = []
    while True:
        response = fetch_github_endpoint(url)

        if response.status_code == 200:
            issues = response.json()['items']
            for issue in issues:
                if issue['body'] is None:
                    continue

                issue['discussion'] = issue['title'] + "\n" + issue['body']
                if issue['comments'] > 0:
                    comments_response = fetch_github_endpoint(issue['comments_url']).json()
                    comments = "\n> ".join([x['body'] for x in comments_response])
                    issue['discussion'] += "\n> " + comments

                samples.append(issue)

            # Check if there are more pages. GitHub paginates via the Link header, e.g.
            # <https://api.github.com/search/issues?page=2&...>; rel="next", <...>; rel="last"
            if "Link" in response.headers:
                link_header = [h.split(';') for h in response.headers["Link"].split(', ')]
                link_header = [x for x in link_header if "next" in x[1]]
                if link_header:
                    url = link_header[0][0].strip().replace('<', '').replace('>', '')
                else:
                    break
            else:
                break
        else:
            raise Exception(f"Fetching issues failed with Error: {response.status_code} on url {url}")

    rows = [{
        "repo_name": repo,
        "number": d['number'],
        "html_url": d['html_url'],
        "closed": (d['state'] == 'closed'),
        "num_comments": d['comments'],
        "created_at": d["created_at"],
        "discussion": d['discussion'],
    } for d in samples]

    logger.info(f"Fetched {len(samples)} issues on {repo} from {start_date} to {end_date}")

    return pd.DataFrame(rows)


def fetch_repo_stats(repo):
    """Return headline stats (open issues, stars, forks) for a repo."""
    repo_info = fetch_github_endpoint(f"https://api.github.com/repos/{repo}").json()

    repo_stats = {
        "Total Open Issues": repo_info['open_issues_count'],
        "Total Stars": repo_info['stargazers_count'],
        "Total Forks": repo_info['forks_count'],
    }

    return repo_stats


def validate_df_values(df, out_folder=None, name=None):
    """Normalize column names to snake_case and optionally persist the frame as a CSV."""
    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
    if out_folder is not None:
        path = f"{out_folder}/{name}.csv"
        df.to_csv(path, index=False)
        logger.info(f"Data saved to {path}")
    return df
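

# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of how these helpers might compose: pull a repo's issues into
# a DataFrame, normalize the column names and save them as a CSV, then print the
# repo's headline stats. The repo name, date range, and output folder below are
# hypothetical placeholders, and this assumes config.yaml supplies a valid
# `github_token` with access to the GitHub API.
if __name__ == "__main__":
    repo = "octocat/Hello-World"  # hypothetical example repo
    issues_df = fetch_repo_issues(repo, start_date="2024-01-01", end_date="2024-06-30")
    issues_df = validate_df_values(issues_df, out_folder=".", name="issues")  # writes ./issues.csv
    print(fetch_repo_stats(repo))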