#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "bs4",
#     "httpx",
#     "pydantic",
#     "python-dateutil",
#     "python-frontmatter",
#     "python-slugify",
#     "pytz",
#     "rich",
#     "typer",
#     "markdown-it-py",
# ]
# ///
import os
import re
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import frontmatter
import httpx
import typer
from bs4 import BeautifulSoup
from bs4 import Tag
from markdown_it import MarkdownIt
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from rich import print
from rich.progress import track
from slugify import slugify

app = typer.Typer(
    add_help_option=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)


class Project(BaseModel):
    """Model representing a Django project from the awesome list."""

    model_config = ConfigDict(extra="allow")

    name: str
    description: str
    url: str
    category: str
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None
    previous_urls: list[str] = Field(default_factory=list)

    def __init__(self, **data):
        super().__init__(**data)
        if not self.slug:
            self.slug = slugify(self.name)
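
# Illustrative (hypothetical values): because of the __init__ hook above,
# Project(name="Example Admin Tool", description="...", url="...", category="Admin")
# ends up with slug "example-admin-tool" unless an explicit slug is supplied.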


def parse_project_line(line: Tag, category: str) -> Project | None:
    """Parse a project line from the markdown and return a Project object."""
    try:
        # Find the project link
        link = line.find("a")
        if not link:
            return None
        name = link.text.strip()
        url = link.get("href", "").strip()
        # Get description (text after the link)
        description = line.text.replace(name, "").strip()
        description = re.sub(r"^\s*-\s*", "", description)  # Remove leading dash
        description = re.sub(r"^\s*", "", description)  # Remove leading whitespace
        if not all([name, url, description]):
            return None
        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        print(f"[red]Error parsing project line: {e}[/red]")
        return None


def read_readme(file_path: Path) -> str:
    """Read README content from local file and convert to HTML."""
    markdown_content = file_path.read_text()
    md = MarkdownIt()
    html_content = md.render(markdown_content)
    return html_content


def parse_readme(content: str) -> list[Project]:
    """Parse README content and extract projects."""
    soup = BeautifulSoup(content, "html.parser")
    projects = []
    current_category = ""
    for element in soup.find_all(["h2", "h3", "li"]):
        if element.name in ["h2", "h3"]:
            current_category = element.text.strip()
        elif element.name == "li" and current_category:
            if current_category == "Contents":
                continue
            project = parse_project_line(element, current_category)
            if project:
                projects.append(project)
    return projects
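
# Sketch of the README shape this parser assumes (names and URLs are placeholders):
#
#   ## Some Category
#   - [Project Name](https://github.com/owner/repo) - Short description.
#
# h2/h3 headings become categories and each li becomes one project; entries under
# the "Contents" table-of-contents heading are skipped.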


def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """
    Merge existing project data with new data, preserving existing values
    while updating with new information where appropriate.
    """
    # Start with the existing data
    merged = existing.copy()
    # Always update core fields from the README
    core_fields = {"name", "url", "category"}
    for field in core_fields:
        if field in new:
            # If URL is changing, store the old URL in previous_urls
            if field == "url" and new["url"] != existing.get("url"):
                previous_urls = merged.get("previous_urls", [])
                old_url = existing.get("url")
                if old_url and old_url not in previous_urls:
                    previous_urls.append(old_url)
                merged["previous_urls"] = previous_urls
            merged[field] = new[field]
    # Smart merge for description - update only if meaningfully different
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]
    # Update GitHub metrics if they exist in new data
    github_fields = {"github_stars", "github_forks", "github_last_update"}
    for field in github_fields:
        if field in new and new[field] is not None:
            merged[field] = new[field]
    return merged
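
# Illustrative merge (hypothetical values): a changed URL is recorded in
# previous_urls while metrics already on disk are kept.
#
#   merge_project_data(
#       {"url": "https://github.com/old/repo", "github_stars": 10},
#       {"name": "Example", "url": "https://github.com/new/repo", "category": "Admin"},
#   )
#   # -> {"url": "https://github.com/new/repo",
#   #     "previous_urls": ["https://github.com/old/repo"],
#   #     "github_stars": 10, "name": "Example", "category": "Admin"}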


def save_project(project: Project, output_dir: Path):
    """Save project as a markdown file with frontmatter, preserving and merging existing content."""
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)
    if output_file.exists():
        try:
            # Load existing file
            existing_post = frontmatter.load(output_file)
            existing_data = dict(existing_post.metadata)
            # Merge data, favoring preservation of existing content
            merged_data = merge_project_data(existing_data, project_data)
            # Create new post with merged data but keep existing content
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )
            post = frontmatter.Post(project.description, **project_data)
    else:
        # Create new file
        post = frontmatter.Post(project.description, **project_data)
    output_file.write_text(frontmatter.dumps(post))
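
# Illustrative result for _projects/<slug>.md (values hypothetical; key order may
# differ, and None-valued metrics are omitted because of exclude_none=True):
#
#   ---
#   category: Admin
#   description: Short description.
#   name: Example
#   previous_urls: []
#   slug: example
#   tags: []
#   url: https://github.com/owner/repo
#   ---
#   Short description.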


def extract_github_info(url: str) -> dict[str, str] | None:
    """Extract owner and repo from a GitHub URL."""
    parsed = urlparse(url)
    if parsed.netloc != "github.com":
        return None
    parts = parsed.path.strip("/").split("/")
    if len(parts) >= 2:
        return {"owner": parts[0], "repo": parts[1]}
    return None
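
# Illustrative: extract_github_info("https://github.com/django/django") returns
# {"owner": "django", "repo": "django"}; non-GitHub URLs return None, so projects
# hosted elsewhere are skipped by update_metrics.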


def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """
    Fetch GitHub metrics for a repository.

    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"
    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,  # Enable following redirects
        )
        # Check if we followed a redirect
        new_url = None
        if len(response.history) > 0:
            for r in response.history:
                if r.status_code == 301:
                    # Get the new location from the API response
                    data = response.json()
                    new_url = data.get("html_url")
                    if new_url:
                        print(
                            f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
                        )
                    break
        response.raise_for_status()
        data = response.json()
        return {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }, new_url
    except httpx.HTTPError as e:
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {str(e)}[/red]")
        return {}, None
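
# Unauthenticated requests to api.github.com are limited to 60 per hour, so for a
# full run export a token first (placeholder value shown):
#
#   export GITHUB_TOKEN=ghp_your_token_here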


def load_project(file_path: Path) -> Project | None:
    """Load a project from a markdown file."""
    try:
        post = frontmatter.load(file_path)
        return Project(**post.metadata)
    except Exception as e:
        print(f"[red]Error loading project from {file_path}: {str(e)}[/red]")
        return None


@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse local Awesome Django README and create individual project files with frontmatter.

    Preserves existing file content and metadata while updating with new information from README.
    """
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)
    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")
    # Create output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    # Read and parse README
    content = read_readme(readme_path)
    projects = parse_readme(content)
    print(f"[green]Found {len(projects)} projects[/green]")
    # Save individual project files
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")


@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)
    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )
    # Load all projects
    project_files = list(projects_dir.glob("*.md"))
    projects = []
    for file in project_files:
        if project := load_project(file):
            projects.append((file, project))
    print(f"[green]Found {len(projects)} projects to update[/green]")
    # Update metrics in batches to avoid rate limiting
    with httpx.Client() as client:
        for i in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            batch = projects[i : i + batch_size]
            for file_path, project in batch:
                if github_info := extract_github_info(project.url):
                    metrics, new_url = get_github_metrics(
                        github_info["owner"], github_info["repo"], client
                    )
                    if metrics:
                        # Update project with new metrics
                        for key, value in metrics.items():
                            setattr(project, key, value)
                        # Update URL if repository has moved
                        if new_url and new_url != project.url:
                            # Store the old URL in previous_urls
                            if not hasattr(project, "previous_urls"):
                                project.previous_urls = []
                            project.previous_urls.append(project.url)
                            # Update to new URL
                            project.url = new_url
                            print(
                                f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                            )
                        save_project(project, projects_dir)
                        print(f"[green]Updated metrics for {project.name}[/green]")
    print("[bold blue]Finished updating GitHub metrics![/bold blue]")


if __name__ == "__main__":
    app()
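
# Typical invocations (assuming uv is on PATH so the shebang can run the script;
# Typer exposes the update_metrics command as "update-metrics"):
#
#   ./main.py parse            # parse README.md into _projects/
#   ./main.py update-metrics   # refresh stars, forks, and last-update dates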