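"""Convert a gpt-crawler JSON dump of HTML pages into one curated Markdown file.

Entries are formatted in parallel chunks using a thread pool, and the result
is written to disk incrementally.
"""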

import json
import logging
from concurrent.futures import ThreadPoolExecutor

from bs4 import BeautifulSoup
from markdownify import markdownify as md

class HTMLToMarkdownConverter:
    def __init__(self, strip_tags=None, convert_links=True):
        """Initialize the converter with configuration options."""
        self.strip_tags = strip_tags or []
        self.convert_links = convert_links

    def convert(self, html_content):
        """Convert HTML content to Markdown."""
        try:
            curated_html = self.curate_content(html_content)
            # markdownify's option for dropping tags is ``strip``; removing
            # <a> tags is how link conversion is turned off.
            strip = list(self.strip_tags)
            if not self.convert_links:
                strip.append("a")
            return md(curated_html, strip=strip)
        except Exception as e:
            logging.error(f"Error in HTML to Markdown conversion: {e}")
            return ""

    def curate_content(self, html):
        """Curate the HTML content before conversion."""
        soup = BeautifulSoup(html, "html.parser")
        # A sensible default: drop non-content tags. Extend this with
        # curation logic specific to the nature of your content.
        for tag in soup(["script", "style"]):
            tag.decompose()
        return str(soup)

class DatasetFormatter:
    def __init__(self, converter):
        self.converter = converter

    def format_entry(self, entry):
        """Format a single entry from the dataset."""
        try:
            title = entry.get("title", "Untitled")
            url = entry.get("url", "")
            html_content = entry.get("html", "")
            markdown_content = self.converter.convert(html_content)
            return self.structure_markdown(title, url, markdown_content)
        except Exception as e:
            logging.error(f"Error formatting entry: {e}")
            return ""

    def structure_markdown(self, title, url, content):
        """Structure the Markdown content with a title header and source link."""
        structured_content = f"## {title}\n\n"
        if url:
            structured_content += f"[Read More]({url})\n\n"
        structured_content += content
        return structured_content

    def format_dataset(self, data):
        """Format the entire dataset, separating entries with blank lines."""
        return "\n\n".join(self.format_entry(entry) for entry in data)

def load_json(file_path):
    """Load the JSON dataset from disk."""
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
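
# Note: the formatter assumes each entry in the loaded JSON is an object with
# "title", "url", and "html" keys; hypothetical sample:
#   [{"title": "Quickstart", "url": "https://example.com", "html": "<h1>Hi</h1>"}]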

def save_output_in_chunks(file_path, contents, chunk_size=1024):
    """Save the formatted chunks to disk, separated by newlines."""
    with open(file_path, "w", encoding="utf-8") as file:
        for content in contents:
            file.write(content)
            file.write("\n")
            if len(content) > chunk_size:
                file.flush()  # Flush after writing a large chunk

def chunk_dataset(data, chunk_size):
    """Split the dataset into chunks of at most ``chunk_size`` entries."""
    for i in range(0, len(data), chunk_size):
        yield data[i:i + chunk_size]
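
# For example, chunk_dataset(list(range(5)), 2) yields [0, 1], [2, 3], [4].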

def process_chunk(chunk):
    """Format one chunk; each worker thread builds its own converter stack."""
    formatter = DatasetFormatter(HTMLToMarkdownConverter())
    return formatter.format_dataset(chunk)

def main():
    logging.basicConfig(level=logging.INFO)
    try:
        original_data = load_json("transformers_documentation-gpt-crawler_output.json")
        chunk_size = 200  # Entries per chunk; adjust as needed
        max_threads = 10  # Maximum number of worker threads; adjust as needed
        chunks = list(chunk_dataset(original_data, chunk_size))
        # executor.map preserves input order, so chunks come back in sequence.
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            formatted_contents = list(executor.map(process_chunk, chunks))
        save_output_in_chunks(
            "transformers_documentation-gpt-crawler-curated_markdown.md",
            formatted_contents,
        )
        logging.info("Content formatted and saved in chunks successfully.")
    except Exception as e:
        logging.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()