utils.py

import requests
import yaml
import pandas as pd
import logging

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())

# Load settings (including the GitHub API token) once at import time.
with open("config.yaml", "r") as f:
    CFG = yaml.safe_load(f)

def fetch_github_endpoint(url):
    """Send an authenticated GET request to the GitHub API and return the raw response."""
    headers = {
        "Authorization": f"Bearer {CFG['github_token']}",
        "Content-Type": "application/json",
    }
    logger.debug(f"Requesting url: {url}")
    response = requests.get(url, headers=headers, timeout=10)
    return response

def fetch_repo_issues(repo, start_date=None, end_date=None):
    """Fetch all issues of a repo via the GitHub search API, optionally filtered by creation date."""
    time_filter = ""
    if start_date and not end_date:
        time_filter = f"+created:>{start_date}"
    if end_date and not start_date:
        time_filter = f"+created:<{end_date}"
    if start_date and end_date:
        time_filter = f"+created:{start_date}..{end_date}"

    url = f"https://api.github.com/search/issues?per_page=100&sort=created&order=asc&q=repo:{repo}+is:issue{time_filter}"

    samples = []
    logger.info(f"Fetching issues on {repo} from {start_date} to {end_date}")

    while True:
        response = fetch_github_endpoint(url)
        if response.status_code == 200:
            issues = response.json()['items']
            for issue in issues:
                # Skip issues with an empty body.
                if issue['body'] is None:
                    continue
                # Build a single "discussion" string: title, body, then any comments as quoted replies.
                issue['discussion'] = issue['title'] + "\n" + issue['body']
                if issue['comments'] > 0:
                    comments_response = fetch_github_endpoint(issue['comments_url']).json()
                    comments = "\n> ".join([x['body'] for x in comments_response])
                    issue['discussion'] += "\n> " + comments
                samples.append(issue)

            # Follow the "next" link in the Link header, if present, to paginate through all results.
            if "Link" in response.headers:
                link_header = [h.split(';') for h in response.headers["Link"].split(', ')]
                link_header = [x for x in link_header if "next" in x[1]]
                if link_header:
                    url = link_header[0][0].strip().replace('<', '').replace('>', '')
                else:
                    break
            else:
                break
        else:
            raise Exception(f"Fetching issues failed with Error: {response.status_code}")

    rows = [{
        "repo_name": repo,
        "number": d['number'],
        "html_url": d['html_url'],
        "closed": (d['state'] == 'closed'),
        "num_comments": d['comments'],
        "created_at": d["created_at"],
        "discussion": d['discussion'],
    } for d in samples]

    logger.info(f"Fetched {len(samples)} issues on {repo} from {start_date} to {end_date}")
    return pd.DataFrame(rows)

def fetch_repo_stats(repo):
    """Return headline stats (open issues, stars, forks) for a repo."""
    repo_info = fetch_github_endpoint(f"https://api.github.com/repos/{repo}").json()
    repo_stats = {
        "Total Open Issues": repo_info['open_issues_count'],
        "Total Stars": repo_info['stargazers_count'],
        "Total Forks": repo_info['forks_count'],
    }
    return repo_stats

def validate_df_values(df, out_folder=None, name=None):
    """Normalize column names to snake_case and optionally write the dataframe to <out_folder>/<name>.csv."""
    df.columns = df.columns.str.lower().str.replace(" ", "_").str.replace("-", "_")
    # for c in df.columns:
    #     x = df[c].iloc[0]
    #     if isinstance(x, str) and '[' in x:
    #         df[c] = df[c].apply(lambda x: eval(x))
    if out_folder is not None:
        path = f"{out_folder}/{name}.csv"
        df.to_csv(path, index=False)
        logger.info(f"Data saved to {path}")
    return df
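
# A minimal usage sketch, assuming config.yaml supplies a valid `github_token`; the repo slug,
# dates, and output folder below are placeholders, not values from the original file. It fetches
# headline stats, pulls the issues created in a date window, then normalizes and saves the dataframe.
if __name__ == "__main__":
    logger.setLevel(logging.INFO)
    example_repo = "owner/repo"  # placeholder; substitute the repository you want to triage
    logger.info(f"Repo stats: {fetch_repo_stats(example_repo)}")
    issues_df = fetch_repo_issues(example_repo, start_date="2024-01-01", end_date="2024-06-30")
    validate_df_values(issues_df, out_folder=".", name="issues")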