:robot: Quick attempt at a file to help us slice the project up

Jeff Triplett, 4 months ago
Commit 5a897c9095
1 changed file with 337 additions and 0 deletions
scripts/main.py  +337 -0

@@ -0,0 +1,337 @@
+#!/usr/bin/env -S uv --quiet run --script
+# /// script
+# requires-python = ">=3.12"
+# dependencies = [
+#     "bs4",
+#     "httpx",
+#     "pydantic",
+#     "python-dateutil",
+#     "python-frontmatter",
+#     "python-slugify",
+#     "pytz",
+#     "rich",
+#     "typer",
+#     "markdown-it-py",
+# ]
+# ///
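+#
+# Inline script metadata (PEP 723): `uv run --script` reads the block above and
+# provisions the listed dependencies in an ephemeral environment before running.
+#
+# Usage sketch (assumes the file is executable and `uv` is on PATH; the option
+# names below are the Typer defaults derived from the parameter names):
+#   ./scripts/main.py parse --readme-path README.md --output-dir _projects
+#   ./scripts/main.py update-metrics --projects-dir _projects --batch-size 50
+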
+import os
+import re
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+
+import frontmatter
+import httpx
+import typer
+from bs4 import BeautifulSoup
+from bs4 import Tag
+from markdown_it import MarkdownIt
+from pydantic import BaseModel
+from pydantic import ConfigDict
+from pydantic import Field
+from rich import print
+from rich.progress import track
+from slugify import slugify
+
+
+app = typer.Typer(
+    add_help_option=False,
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+)
+
+
+class Project(BaseModel):
+    """Model representing a Django project from the awesome list."""
+
+    model_config = ConfigDict(extra="allow")
+
+    name: str
+    description: str
+    url: str
+    category: str
+    slug: str = Field(default="")
+    tags: list[str] = Field(default_factory=list)
+    github_stars: int | None = None
+    github_forks: int | None = None
+    github_last_update: str | None = None
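+    # previous_urls keeps any URLs a project was listed under before its
+    # repository moved; update_metrics appends the old URL when GitHub redirects.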
+    previous_urls: list[str] = Field(default_factory=list)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        if not self.slug:
+            self.slug = slugify(self.name)
+
+
+def parse_project_line(line: Tag, category: str) -> Project | None:
+    """Parse a project line from the markdown and return a Project object."""
+    try:
+        # Find the project link
+        link = line.find("a")
+        if not link:
+            return None
+
+        name = link.text.strip()
+        url = link.get("href", "").strip()
+
+        # Get description (text after the link)
+        description = line.text.replace(name, "").strip()
+        description = re.sub(r"^\s*-\s*", "", description)  # Remove leading dash
+        description = re.sub(r"^\s*", "", description)  # Remove leading whitespace
+
+        if not all([name, url, description]):
+            return None
+
+        return Project(name=name, description=description, url=url, category=category)
+    except Exception as e:
+        print(f"[red]Error parsing project line: {e}[/red]")
+        return None
+
+
+def read_readme(file_path: Path) -> str:
+    """Read README content from local file and convert to HTML."""
+    markdown_content = file_path.read_text()
+    md = MarkdownIt()
+    html_content = md.render(markdown_content)
+    return html_content
+
+
+def parse_readme(content: str) -> list[Project]:
+    """Parse README content and extract projects."""
+    soup = BeautifulSoup(content, "html.parser")
+    projects = []
+    current_category = ""
+
+    for element in soup.find_all(["h2", "h3", "li"]):
+        if element.name in ["h2", "h3"]:
+            current_category = element.text.strip()
+        elif element.name == "li" and current_category:
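+            # Skip list items under the "Contents" heading (the README's table of contents).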
+            if current_category == "Contents":
+                continue
+
+            project = parse_project_line(element, current_category)
+            if project:
+                projects.append(project)
+
+    return projects
+
+
+def merge_project_data(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
+    """
+    Merge existing project data with new data, preserving existing values
+    while updating with new information where appropriate.
+    """
+    # Start with the existing data
+    merged = existing.copy()
+
+    # Always update core fields from the README
+    core_fields = {"name", "url", "category"}
+    for field in core_fields:
+        if field in new:
+            # If URL is changing, store the old URL in previous_urls
+            if field == "url" and new["url"] != existing.get("url"):
+                previous_urls = merged.get("previous_urls", [])
+                old_url = existing.get("url")
+                if old_url and old_url not in previous_urls:
+                    previous_urls.append(old_url)
+                merged["previous_urls"] = previous_urls
+            merged[field] = new[field]
+
+    # Smart merge for description - update only if meaningfully different
+    if "description" in new and new["description"] != existing.get("description", ""):
+        merged["description"] = new["description"]
+
+    # Update GitHub metrics if they exist in new data
+    github_fields = {"github_stars", "github_forks", "github_last_update"}
+    for field in github_fields:
+        if field in new and new[field] is not None:
+            merged[field] = new[field]
+
+    return merged
+
+
+def save_project(project: Project, output_dir: Path):
+    """Save project as a markdown file with frontmatter, preserving and merging existing content."""
+    output_file = output_dir / f"{project.slug}.md"
+    project_data = project.model_dump(exclude_none=True)
+
+    if output_file.exists():
+        try:
+            # Load existing file
+            existing_post = frontmatter.load(output_file)
+            existing_data = dict(existing_post.metadata)
+
+            # Merge data, favoring preservation of existing content
+            merged_data = merge_project_data(existing_data, project_data)
+
+            # Create new post with merged data but keep existing content
+            post = frontmatter.Post(existing_post.content, **merged_data)
+        except Exception as e:
+            print(
+                f"[yellow]Warning: Could not load existing file {output_file}, creating new: {e}[/yellow]"
+            )
+            post = frontmatter.Post(project.description, **project_data)
+    else:
+        # Create new file
+        post = frontmatter.Post(project.description, **project_data)
+
+    output_file.write_text(frontmatter.dumps(post))
+
+
+def extract_github_info(url: str) -> dict[str, str] | None:
+    """Extract owner and repo from a GitHub URL."""
+    parsed = urlparse(url)
+    if parsed.netloc != "github.com":
+        return None
+
+    parts = parsed.path.strip("/").split("/")
+    if len(parts) >= 2:
+        return {"owner": parts[0], "repo": parts[1]}
+    return None
+
+
+def get_github_metrics(
+    owner: str, repo: str, client: httpx.Client
+) -> tuple[dict, str | None]:
+    """
+    Fetch GitHub metrics for a repository.
+    Returns a tuple of (metrics_dict, new_url) where new_url is set if the repo has moved.
+    """
+    headers = {}
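+    # A GITHUB_TOKEN roughly raises the GitHub REST API rate limit from 60 to 5,000 requests/hour.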
+    if github_token := os.environ.get("GITHUB_TOKEN"):
+        headers["Authorization"] = f"token {github_token}"
+
+    api_url = f"https://api.github.com/repos/{owner}/{repo}"
+    try:
+        response = client.get(
+            api_url,
+            headers=headers,
+            timeout=10.0,
+            follow_redirects=True,  # Enable following redirects
+        )
+
+        # Check if we followed a redirect
+        new_url = None
+        if len(response.history) > 0:
+            for r in response.history:
+                if r.status_code == 301:
+                    # Get the new location from the API response
+                    data = response.json()
+                    new_url = data.get("html_url")
+                    if new_url:
+                        print(
+                            f"[yellow]Repository moved: {owner}/{repo} -> {new_url}[/yellow]"
+                        )
+                    break
+
+        response.raise_for_status()
+        data = response.json()
+
+        return {
+            "github_stars": data["stargazers_count"],
+            "github_forks": data["forks_count"],
+            "github_last_update": data["updated_at"],
+        }, new_url
+
+    except httpx.HTTPError as e:
+        print(f"[red]Error fetching GitHub metrics for {owner}/{repo}: {str(e)}[/red]")
+        return {}, None
+
+
+def load_project(file_path: Path) -> Project | None:
+    """Load a project from a markdown file."""
+    try:
+        post = frontmatter.load(file_path)
+        return Project(**post.metadata)
+    except Exception as e:
+        print(f"[red]Error loading project from {file_path}: {str(e)}[/red]")
+        return None
+
+
+@app.command()
+def parse(readme_path: Path = Path("README.md"), output_dir: str = "_projects"):
+    """
+    Parse local Awesome Django README and create individual project files with frontmatter.
+    Preserves existing file content and metadata while updating with new information from README.
+    """
+    if not readme_path.exists():
+        print(f"[red]Error: README file not found at {readme_path}[/red]")
+        raise typer.Exit(1)
+
+    print(f"[bold blue]Reading README from {readme_path}...[/bold blue]")
+
+    # Create output directory
+    output_path = Path(output_dir)
+    output_path.mkdir(exist_ok=True)
+
+    # Read and parse README
+    content = read_readme(readme_path)
+    projects = parse_readme(content)
+
+    print(f"[green]Found {len(projects)} projects[/green]")
+
+    # Save individual project files
+    for project in projects:
+        save_project(project, output_path)
+        print(f"[green]Updated {project.name} in {project.slug}.md[/green]")
+
+
+@app.command()
+def update_metrics(projects_dir: Path = Path("_projects"), batch_size: int = 50):
+    """
+    Update GitHub metrics (stars, forks, last update) for all projects.
+    """
+    if not projects_dir.exists():
+        print(f"[red]Error: Projects directory not found at {projects_dir}[/red]")
+        raise typer.Exit(1)
+
+    print(
+        f"[bold blue]Updating GitHub metrics for projects in {projects_dir}...[/bold blue]"
+    )
+
+    # Load all projects
+    project_files = list(projects_dir.glob("*.md"))
+    projects = []
+    for file in project_files:
+        if project := load_project(file):
+            projects.append((file, project))
+
+    print(f"[green]Found {len(projects)} projects to update[/green]")
+
+    # Update metrics in batches to avoid rate limiting
+    with httpx.Client() as client:
+        for i in track(
+            range(0, len(projects), batch_size), description="Updating projects"
+        ):
+            batch = projects[i : i + batch_size]
+            for file_path, project in batch:
+                if github_info := extract_github_info(project.url):
+                    metrics, new_url = get_github_metrics(
+                        github_info["owner"], github_info["repo"], client
+                    )
+
+                    if metrics:
+                        # Update project with new metrics
+                        for key, value in metrics.items():
+                            setattr(project, key, value)
+
+                        # Update URL if repository has moved
+                        if new_url and new_url != project.url:
+                            # Store the old URL in previous_urls
+                            if not hasattr(project, "previous_urls"):
+                                project.previous_urls = []
+                            project.previous_urls.append(project.url)
+                            # Update to new URL
+                            project.url = new_url
+                            print(
+                                f"[yellow]Updated URL for {project.name}: {project.url}[/yellow]"
+                            )
+
+                        save_project(project, projects_dir)
+                        print(f"[green]Updated metrics for {project.name}[/green]")
+
+    print("[bold blue]Finished updating GitHub metrics![/bold blue]")
+
+
+if __name__ == "__main__":
+    app()