#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "bs4",
#     "httpx",
#     "markdown-it-py",
#     "pydantic",
#     "python-dateutil",
#     "python-frontmatter",
#     "python-slugify",
#     "pytz",
#     "rich",
#     "typer",
# ]
# ///
import os
import re
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import frontmatter
import httpx
import typer
from bs4 import BeautifulSoup
from bs4 import Tag
from markdown_it import MarkdownIt
from pydantic import BaseModel
from pydantic import ConfigDict
from pydantic import Field
from rich import print
from rich.progress import track
from slugify import slugify

app = typer.Typer(
    add_help_option=False,
    no_args_is_help=True,
    rich_markup_mode="rich",
)

class Project(BaseModel):
    """Model representing a Django project from the awesome list."""

    model_config = ConfigDict(extra="allow")

    name: str
    description: str
    url: str
    category: str
    slug: str = Field(default="")
    tags: list[str] = Field(default_factory=list)
    github_stars: int | None = None
    github_forks: int | None = None
    github_last_update: str | None = None
    previous_urls: list[str] = Field(default_factory=list)

    def __init__(self, **data):
        super().__init__(**data)
        if not self.slug:
            self.slug = slugify(self.name)
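
# Illustrative sketch (hypothetical values, not executed): the custom __init__
# derives the slug from the name whenever one is not supplied explicitly.
#
#   p = Project(
#       name="Django Debug Toolbar",
#       description="A configurable set of panels...",
#       url="https://github.com/jazzband/django-debug-toolbar",
#       category="Debugging",
#   )
#   assert p.slug == "django-debug-toolbar"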

def parse_project_line(line: Tag, category: str) -> Project | None:
    """Parse a project line from the markdown and return a Project object."""
    try:
        # Find the project link
        link = line.find("a")
        if not link:
            return None

        name = link.text.strip()
        url = link.get("href", "").strip()

        # Get description (text after the link)
        description = line.text.replace(name, "").strip()
        description = re.sub(r"^\s*-\s*", "", description)  # Remove leading dash
        description = re.sub(r"^\s*", "", description)  # Remove leading whitespace

        if not all([name, url, description]):
            return None

        return Project(name=name, description=description, url=url, category=category)
    except Exception as e:
        print(f"[red]Error parsing project line: {e}[/red]")
        return None
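
# For reference, parse_project_line expects <li> markup of the shape that
# MarkdownIt produces for a README bullet (hypothetical entry shown):
#
#   <li><a href="https://example.com/project">Name</a> - Short description.</li>
#
# The link text becomes `name`, the href becomes `url`, and the remaining text,
# minus the leading " - ", becomes `description`.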

def read_readme(file_path: Path) -> str:
    """Read README content from a local file and convert it to HTML."""
    markdown_content = file_path.read_text()
    md = MarkdownIt()
    html_content = md.render(markdown_content)
    return html_content

def parse_readme(content: str) -> list[Project]:
    """Parse README content and extract projects."""
    soup = BeautifulSoup(content, "html.parser")
    projects = []
    current_category = ""

    for element in soup.find_all(["h2", "h3", "li"]):
        if element.name in ["h2", "h3"]:
            current_category = element.text.strip()
        elif element.name == "li" and current_category:
            # Skip the table of contents
            if current_category == "Contents":
                continue
            project = parse_project_line(element, current_category)
            if project:
                projects.append(project)

    return projects

def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """
    Merge existing project data with new data, preserving existing values
    while updating with new information where appropriate.
    """
    # Start with the existing data
    merged = existing.copy()

    # Always update core fields from the README
    core_fields = {"name", "url", "category"}
    for field in core_fields:
        if field in new:
            # If the URL is changing, store the old URL in previous_urls
            if field == "url" and new["url"] != existing.get("url"):
                previous_urls = merged.get("previous_urls", [])
                old_url = existing.get("url")
                if old_url and old_url not in previous_urls:
                    previous_urls.append(old_url)
                merged["previous_urls"] = previous_urls
            merged[field] = new[field]

    # Update the description only when it differs from the stored one
    if "description" in new and new["description"] != existing.get("description", ""):
        merged["description"] = new["description"]

    # Update GitHub metrics if they exist in the new data
    github_fields = {"github_stars", "github_forks", "github_last_update"}
    for field in github_fields:
        if field in new and new[field] is not None:
            merged[field] = new[field]

    return merged
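
# Worked example of the URL-change path (hypothetical values): when the README
# URL differs from the stored one, the old URL is preserved in previous_urls.
#
#   merged = merge_project_data(
#       {"name": "Example", "url": "https://github.com/old/repo"},
#       {"name": "Example", "url": "https://github.com/new/repo", "category": "Misc"},
#   )
#   assert merged["url"] == "https://github.com/new/repo"
#   assert merged["previous_urls"] == ["https://github.com/old/repo"]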

def save_project(project: Project, output_dir: Path):
    """Save a project as a markdown file with frontmatter, merging with any existing file."""
    output_file = output_dir / f"{project.slug}.md"
    project_data = project.model_dump(exclude_none=True)

    if output_file.exists():
        try:
            # Load the existing file
            existing_post = frontmatter.load(output_file)
            existing_data = dict(existing_post.metadata)

            # Merge data, favoring preservation of existing content
            merged_data = merge_project_data(existing_data, project_data)

            # Create a new post with the merged metadata but keep the existing body
            post = frontmatter.Post(existing_post.content, **merged_data)
        except Exception as e:
            print(
                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
            )
            post = frontmatter.Post(project.description, **project_data)
    else:
        # Create a new file with the description as the body
        post = frontmatter.Post(project.description, **project_data)

    output_file.write_text(frontmatter.dumps(post))
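
# A freshly created project file looks roughly like this (hypothetical values;
# python-frontmatter emits the metadata as sorted YAML keys by default):
#
#   ---
#   category: Misc
#   description: Short description of the project.
#   name: Example
#   previous_urls: []
#   slug: example
#   tags: []
#   url: https://github.com/example/example
#   ---
#   Short description of the project.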

def extract_github_info(url: str) -> dict[str, str] | None:
    """Extract owner and repo from a GitHub URL."""
    parsed = urlparse(url)
    if parsed.netloc != "github.com":
        return None

    parts = parsed.path.strip("/").split("/")
    if len(parts) >= 2:
        return {"owner": parts[0], "repo": parts[1]}
    return None
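
# Example (hypothetical URL):
#
#   extract_github_info("https://github.com/jazzband/django-debug-toolbar")
#   # -> {"owner": "jazzband", "repo": "django-debug-toolbar"}
#
# Non-GitHub URLs, and bare https://github.com links with no owner/repo path,
# return None.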

def get_github_metrics(
    owner: str, repo: str, client: httpx.Client
) -> tuple[dict, str | None]:
    """
    Fetch GitHub metrics for a repository.

    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.
    """
    headers = {}
    if github_token := os.environ.get("GITHUB_TOKEN"):
        headers["Authorization"] = f"token {github_token}"

    api_url = f"https://api.github.com/repos/{owner}/{repo}"
    try:
        response = client.get(
            api_url,
            headers=headers,
            timeout=10.0,
            follow_redirects=True,  # Follow redirects so moved repos still resolve
        )

        # If we followed a 301, the repository has moved; pick up its new URL
        new_url = None
        if len(response.history) > 0:
            for r in response.history:
                if r.status_code == 301:
                    # The final API response carries the repo's new html_url
                    data = response.json()
                    new_url = data.get("html_url")
                    if new_url:
                        print(
                            f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
                        )
                    break

        response.raise_for_status()
        data = response.json()
        return {
            "github_stars": data["stargazers_count"],
            "github_forks": data["forks_count"],
            "github_last_update": data["updated_at"],
        }, new_url
    except httpx.HTTPError as e:
        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {str(e)}[/red]")
        return {}, None
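
# Note: unauthenticated requests to api.github.com are tightly rate-limited
# (on the order of 60 requests per hour), so a full run over the list
# effectively requires exporting a token first, e.g.:
#
#   export GITHUB_TOKEN=<personal-access-token>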

def load_project(file_path: Path) -> Project | None:
    """Load a project from a markdown file."""
    try:
        post = frontmatter.load(file_path)
        return Project(**post.metadata)
    except Exception as e:
        print(f"[red]Error loading project from {file_path}: {str(e)}[/red]")
        return None

@app.command()
def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
    """
    Parse the local Awesome Django README and create individual project files with frontmatter.

    Preserves existing file content and metadata while updating with new information from the README.
    """
    if not readme_path.exists():
        print(f"[red]Error: README file not found at {readme_path}[/red]")
        raise typer.Exit(1)

    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")

    # Create the output directory
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    # Read and parse the README
    content = read_readme(readme_path)
    projects = parse_readme(content)

    print(f"[green]Found {len(projects)} projects[/green]")

    # Save individual project files
    for project in projects:
        save_project(project, output_path)
        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")

@app.command()
def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
    """
    Update GitHub metrics (stars, forks, last update) for all projects.
    """
    if not projects_dir.exists():
        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
        raise typer.Exit(1)

    print(
        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
    )

    # Load all projects
    project_files = list(projects_dir.glob("*.md"))
    projects = []
    for file in project_files:
        if project := load_project(file):
            projects.append((file, project))

    print(f"[green]Found {len(projects)} projects to update[/green]")

    # Update metrics in batches to avoid rate limiting
    with httpx.Client() as client:
        for i in track(
            range(0, len(projects), batch_size), description="Updating projects"
        ):
            batch = projects[i : i + batch_size]
            for file_path, project in batch:
                if github_info := extract_github_info(project.url):
                    metrics, new_url = get_github_metrics(
                        github_info["owner"], github_info["repo"], client
                    )
                    if metrics:
                        # Update the project with the new metrics
                        for key, value in metrics.items():
                            setattr(project, key, value)

                        # Update the URL if the repository has moved
                        if new_url and new_url != project.url:
                            # Keep the old URL in previous_urls (avoiding duplicates)
                            if project.url not in project.previous_urls:
                                project.previous_urls.append(project.url)
                            project.url = new_url
                            print(
                                f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
                            )

                        save_project(project, projects_dir)
                        print(f"[green]Updated metrics for {project.name}[/green]")

    print("[bold blue]Finished updating GitHub metrics![/bold blue]")

if __name__ == "__main__":
    app()
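
# Typical invocations (the filename is assumed here; Typer exposes
# update_metrics under the dashed command name update-metrics):
#
#   ./awesome.py parse --readme-path README.md --output-dir _projects
#   ./awesome.py update-metrics --projects-dir _projects --batch-size 50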