#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "bs4",
#     "httpx",
#     "pydantic",
#     "python-dateutil",
#     "python-frontmatter",
#     "python-slugify",
#     "pytz",
#     "rich",
#     "typer",
#     "markdown-it-py",
# ]
# ///
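# The block above is PEP 723 inline script metadata; the shebang hands the file
# to `uv run`, which resolves and installs these dependencies before executing it.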
import os
import re
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import frontmatter
import httpx
import typer
from bs4 import BeautifulSoup
from bs4 import Tag
from markdown_it import MarkdownIt
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from rich import print
from rich.progress import track
from slugify import slugify

app = typer.Typer(
    add_help_option=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)
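
# Typical invocations (assuming the script has its executable bit set):
#   ./main.py parse
#   ./main.py update-metrics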


class Project(BaseModel):
    """Model representing a Django project from the awesome list."""

    model_config = ConfigDict(extra="allow")

    name: str
    description: str
    url: str
    category: str
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None
    github_last_commit: str | None = None
    previous_urls: list[str] = Field(default_factory=list)

    def __init__(self, **data):
        super().__init__(**data)
        # Derive a URL-safe slug from the name when none is supplied,
        # e.g. "Django REST framework" -> "django-rest-framework".
        if not self.slug:
            self.slug = slugify(self.name)


def parse_project_line(line: Tag, category: str) -> Project | None:
    """Parse a project line from the markdown and return a Project object."""
    try:
        # Find the project link
        link = line.find("a")
        if not link:
            return None

        name = link.text.strip()
        url = link.get("href", "").strip()

        # Get description (text after the link)
        description = line.text.replace(name, "").strip()
        description = re.sub(r"^\s*-\s*", "", description)  # Remove leading dash
        description = re.sub(r"^\s*", "", description)  # Remove leading whitespace

        if not all([name, url, description]):
            return None

        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        print(f"[red]Error parsing project line: {e}[/red]")
        return None


def read_readme(file_path: Path) -> str:
    """Read README content from a local file and convert it to HTML."""
    markdown_content = file_path.read_text()
    md = MarkdownIt()
    html_content = md.render(markdown_content)
    return html_content


def parse_readme(content: str) -> list[Project]:
    """Parse README content and extract projects."""
    soup = BeautifulSoup(content, "html.parser")
    projects = []
    current_category = ""
    for element in soup.find_all(["h2", "h3", "li"]):
        if element.name in ["h2", "h3"]:
            current_category = element.text.strip()
        elif element.name == "li" and current_category:
            # Skip the table-of-contents list at the top of the README
            if current_category == "Contents":
                continue
            project = parse_project_line(element, current_category)
            if project:
                projects.append(project)
    return projects


def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """
    Merge existing project data with new data, preserving existing values
    while updating with new information where appropriate.
    """
    # Start with the existing data
    merged = existing.copy()

    # Always update core fields from the README
    core_fields = {"name", "url", "category"}
    for field in core_fields:
        if field in new:
            # If the URL is changing, store the old URL in previous_urls
            if field == "url" and new["url"] != existing.get("url"):
                previous_urls = merged.get("previous_urls", [])
                old_url = existing.get("url")
                if old_url and old_url not in previous_urls:
                    previous_urls.append(old_url)
                merged["previous_urls"] = previous_urls
            merged[field] = new[field]

    # Update the description only when it has actually changed
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]

    # Update GitHub metrics if they exist in the new data
    github_fields = {"github_stars", "github_forks", "github_last_update", "github_last_commit"}
    for field in github_fields:
        if field in new and new[field] is not None:
            merged[field] = new[field]

    return merged
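
# For example, merging {"url": "https://github.com/example/new"} over an entry
# whose url was "https://github.com/example/old" swaps in the new URL and
# records the old one in previous_urls (URLs here are illustrative).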


def save_project(project: Project, output_dir: Path):
    """Save project as a markdown file with frontmatter, preserving and merging existing content."""
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)

    if output_file.exists():
        try:
            # Load existing file
            existing_post = frontmatter.load(output_file)
            existing_data = dict(existing_post.metadata)
            # Merge data, favoring preservation of existing content
            merged_data = merge_project_data(existing_data, project_data)
            # Create new post with merged data but keep existing content
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )
            post = frontmatter.Post(project.description, **project_data)
    else:
        # Create new file
        post = frontmatter.Post(project.description, **project_data)

    output_file.write_text(frontmatter.dumps(post))


def extract_github_info(url: str) -> dict[str, str] | None:
    """Extract owner and repo from a GitHub URL."""
    parsed = urlparse(url)
    if parsed.netloc != "github.com":
        return None
    parts = parsed.path.strip("/").split("/")
    if len(parts) >= 2:
        return {"owner": parts[0], "repo": parts[1]}
    return None
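
# e.g. extract_github_info("https://github.com/django/django") returns
# {"owner": "django", "repo": "django"}; non-GitHub URLs return None.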


def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """
    Fetch GitHub metrics for a repository.

    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"
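    # Authenticated requests get a much higher GitHub API rate limit
    # (5,000 requests/hour versus 60/hour anonymously), which matters when
    # updating hundreds of projects in one run.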
    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,  # Enable following redirects
        )

        # A 301 in the redirect chain means the repository has moved; the final
        # JSON payload already describes it at its new location.
        new_url = None
        if response.history:
            for r in response.history:
                if r.status_code == 301:
                    # Get the new repository URL from the API response
                    data = response.json()
                    new_url = data.get("html_url")
                    if new_url:
                        print(
                            f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
                        )
                    break

        response.raise_for_status()
        data = response.json()
        metrics = {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }

        # Fetch the date of the most recent commit
        commits_url = f"https://api.github.com/repos/{owner}/{repo}/commits"
        try:
            commits_response = client.get(
                commits_url,
                headers=headers,
                params={"per_page": 1},
                timeout=10.0,
                follow_redirects=True,
            )
            commits_response.raise_for_status()
            commits_data = commits_response.json()
            if commits_data and len(commits_data) > 0:
                metrics["github_last_commit"] = commits_data[0]["commit"]["committer"]["date"]
        except httpx.HTTPError as e:
            print(f"[yellow]Warning: Could not fetch commits for {owner}/{repo}: {e}[/yellow]")

        return metrics, new_url
    except httpx.HTTPError as e:
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {e}[/red]")
        return {}, None


def load_project(file_path: Path) -> Project | None:
    """Load a project from a markdown file."""
    try:
        post = frontmatter.load(file_path)
        return Project(**post.metadata)
    except Exception as e:
        print(f"[red]Error loading project from {file_path}: {e}[/red]")
        return None


@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse local Awesome Django README and create individual project files with frontmatter.

    Preserves existing file content and metadata while updating with new information from README.
    """
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)

    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")

    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    # Read and parse README
    content = read_readme(readme_path)
    projects = parse_readme(content)
    print(f"[green]Found {len(projects)} projects[/green]")

    # Save individual project files
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")
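
# e.g. `./main.py parse --readme-path README.md --output-dir _projects`
# (Typer exposes each defaulted parameter as a CLI option; both fall back
# to the defaults above when omitted).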


@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)

    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )

    # Load all projects
    project_files = list(projects_dir.glob("*.md"))
    projects = []
    for file in project_files:
        if project := load_project(file):
            projects.append((file, project))
    print(f"[green]Found {len(projects)} projects to update[/green]")

    # Process the projects in batches; the progress bar advances once per batch
    with httpx.Client() as client:
        for i in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            batch = projects[i : i + batch_size]
            for file_path, project in batch:
                if github_info := extract_github_info(project.url):
                    metrics, new_url = get_github_metrics(
                        github_info["owner"], github_info["repo"], client
                    )
                    if metrics:
                        # Update project with new metrics
                        for key, value in metrics.items():
                            setattr(project, key, value)
                        # Update URL if the repository has moved
                        if new_url and new_url != project.url:
                            # Record the old URL once before switching to the new one
                            if project.url not in project.previous_urls:
                                project.previous_urls.append(project.url)
                            project.url = new_url
                            print(
                                f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                            )
                        save_project(project, projects_dir)
                        print(f"[green]Updated metrics for {project.name}[/green]")

    print("[bold blue]Finished updating GitHub metrics![/bold blue]")
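
# e.g. `GITHUB_TOKEN=<token> ./main.py update-metrics --batch-size 50`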


if __name__ == "__main__":
    app()