utils.py

import requests
import yaml
import pandas as pd

# Load API tokens and other settings from the local config file.
with open("config.yaml", "r") as f:
    CFG = yaml.safe_load(f)


def fetch_github_endpoint(url):
    """Perform an authenticated GET request against the GitHub API and return the response."""
    headers = {
        "Authorization": f"Bearer {CFG['tokens']['github']}",
        "Content-Type": "application/json",
    }
    response = requests.get(url, headers=headers, timeout=10)
    return response
def fetch_repo_issues(repo, start_date=None, end_date=None):
    """Fetch all issues for a repo via the GitHub search API, optionally filtered by creation date."""
    time_filter = ""
    if start_date and not end_date:
        time_filter = f"+created:>{start_date}"
    if end_date and not start_date:
        time_filter = f"+created:<{end_date}"
    if start_date and end_date:
        time_filter = f"+created:{start_date}..{end_date}"

    url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"

    samples = []
    print(f"[{repo}/issues] Fetching page: ", end=" ", flush=True)
    while True:
        response = fetch_github_endpoint(url)

        if response.status_code == 200:
            print(". ", end=" ", flush=True)
            issues = response.json()['items']
            for issue in issues:
                if issue['body'] is None:
                    continue

                # Collapse the title, body, and any comments into a single discussion string.
                issue['discussion'] = issue['title'] + "\n" + issue['body']
                if issue['comments'] > 0:
                    comments_response = fetch_github_endpoint(issue['comments_url']).json()
                    comments = "\n> ".join([x['body'] for x in comments_response])
                    issue['discussion'] += "\n> " + comments

                samples.append(issue)

            # Follow the "next" link from the Link header, if there are more pages.
            if "Link" in response.headers:
                link_header = [h.split(';') for h in response.headers["Link"].split(', ')]
                link_header = [x for x in link_header if "next" in x[1]]
                if link_header:
                    url = link_header[0][0].strip().replace('<', '').replace('>', '')
                else:
                    break
            else:
                break
        else:
            print(f"Error: {response.status_code}")
            break

    rows = [{
        "repo_name": repo,
        "number": d['number'],
        "html_url": d['html_url'],
        "closed": (d['state'] == 'closed'),
        "num_comments": d['comments'],
        "created_at": d["created_at"],
        "discussion": d['discussion'],
    } for d in samples]

    return pd.DataFrame(rows)
def fetch_repo_stats(repo):
    """Fetch high-level repository statistics: open issue count, stars, and forks."""
    repo_info = fetch_github_endpoint(f"https://api.github.com/repos/{repo}").json()

    repo_stats = {
        "Total Open Issues": repo_info['open_issues_count'],
        "Total Stars": repo_info['stargazers_count'],
        "Total Forks": repo_info['forks_count'],
    }

    return repo_stats
def validate_df_values(df, out_folder=None, name=None):
    """Normalize column names, parse stringified list values, and optionally write the frame to CSV."""
    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
    for c in df.columns:
        x = df[c].iloc[0]
        if isinstance(x, str) and '[' in x:
            # Columns holding stringified lists are parsed back into Python objects.
            # Note: eval assumes the values are trusted input.
            df[c] = df[c].apply(lambda x: eval(x))
    if out_folder is not None:
        df.to_csv(f"{out_folder}/{name}.csv", index=False)
    return df
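

# Example usage: a minimal sketch, not part of the original module. It assumes a
# valid config.yaml with a GitHub token, network access, and an existing "out"
# directory; "owner/repo" and the dates are placeholder values.
if __name__ == "__main__":
    issues_df = fetch_repo_issues("owner/repo", start_date="2024-01-01", end_date="2024-06-30")
    issues_df = validate_df_values(issues_df, out_folder="out", name="issues")
    print(fetch_repo_stats("owner/repo"))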