|
@@ -0,0 +1,104 @@
|
|
|
+import json
|
|
|
+import logging
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+from markdownify import markdownify as md
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
+
|
|
|
+class HTMLToMarkdownConverter:
|
|
|
+ def __init__(self, strip_tags=None, convert_links=True):
|
|
|
+ """ Initialize converter with configuration options. """
|
|
|
+ self.strip_tags = strip_tags or []
|
|
|
+ self.convert_links = convert_links
|
|
|
+
|
|
|
+ def convert(self, html_content):
|
|
|
+ """ Convert HTML content to Markdown. """
|
|
|
+ try:
|
|
|
+ curated_html = self.curate_content(html_content)
|
|
|
+ return md(curated_html, strip_tags=self.strip_tags, convert_links=self.convert_links)
|
|
|
+ except Exception as e:
|
|
|
+ logging.error(f"Error in HTML to Markdown conversion: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def curate_content(self, html):
|
|
|
+ """ Curate the HTML content before conversion. """
|
|
|
+ soup = BeautifulSoup(html, 'html.parser')
|
|
|
+ # Implement specific curation logic here based on the content nature
|
|
|
+ return str(soup)
|
|
|
+
|
|
|
+class DatasetFormatter:
|
|
|
+ def __init__(self, converter):
|
|
|
+ self.converter = converter
|
|
|
+
|
|
|
+ def format_entry(self, entry):
|
|
|
+ """ Format a single entry from the dataset. """
|
|
|
+ try:
|
|
|
+ title = entry.get('title', 'Untitled')
|
|
|
+ url = entry.get('url', '')
|
|
|
+ html_content = entry.get('html', '')
|
|
|
+ markdown_content = self.converter.convert(html_content)
|
|
|
+ return self.structure_markdown(title, url, markdown_content)
|
|
|
+ except Exception as e:
|
|
|
+ logging.error(f"Error formatting entry: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def structure_markdown(self, title, url, content):
|
|
|
+ """ Structure the Markdown content with headers, lists, etc. """
|
|
|
+ structured_content = f"## {title}\n\n"
|
|
|
+ if url:
|
|
|
+ structured_content += f"[Read More]({url})\n\n"
|
|
|
+ structured_content += content
|
|
|
+ return structured_content
|
|
|
+
|
|
|
+ def format_dataset(self, data):
|
|
|
+ """ Format the entire dataset. """
|
|
|
+ formatted_content = ""
|
|
|
+ for entry in data:
|
|
|
+ formatted_content += self.format_entry(entry)
|
|
|
+ return formatted_content
|
|
|
+
|
|
|
+def load_json(file_path):
|
|
|
+ """ Load the JSON file. """
|
|
|
+ with open(file_path, 'r') as file:
|
|
|
+ return json.load(file)
|
|
|
+
|
|
|
+def save_output_in_chunks(file_path, contents, chunk_size=1024):
|
|
|
+ """ Save the formatted content in chunks. """
|
|
|
+ with open(file_path, 'w') as file:
|
|
|
+ for content in contents:
|
|
|
+ file.write(content)
|
|
|
+ if len(content) > chunk_size:
|
|
|
+ file.flush() # Flush after writing a large chunk
|
|
|
+
|
|
|
+def chunk_dataset(data, chunk_size):
|
|
|
+ """ Divide the dataset into chunks of approximately equal size. """
|
|
|
+ for i in range(0, len(data), chunk_size):
|
|
|
+ yield data[i:i + chunk_size]
|
|
|
+
|
|
|
+def process_chunk(chunk):
|
|
|
+ """ Process a single chunk of the dataset. """
|
|
|
+ formatter = DatasetFormatter(HTMLToMarkdownConverter())
|
|
|
+ return formatter.format_dataset(chunk)
|
|
|
+
|
|
|
+def main():
|
|
|
+ logging.basicConfig(level=logging.INFO)
|
|
|
+ try:
|
|
|
+ original_data = load_json('transformers_documentation-gpt-crawler_output.json')
|
|
|
+ chunk_size = 200 # Adjust chunk size as needed
|
|
|
+ max_threads = 10 # Adjust the maximum number of threads as needed
|
|
|
+
|
|
|
+ chunks = list(chunk_dataset(original_data, chunk_size))
|
|
|
+
|
|
|
+ formatted_contents = []
|
|
|
+ with ThreadPoolExecutor(max_workers=max_threads) as executor:
|
|
|
+ results = executor.map(process_chunk, chunks)
|
|
|
+ for result in results:
|
|
|
+ formatted_contents.append(result)
|
|
|
+
|
|
|
+ final_formatted_content = '\n'.join(formatted_contents)
|
|
|
+ save_output_in_chunks('transformers_documentation-gpt-crawler-curated_markdown.md', formatted_contents)
|
|
|
+ logging.info("Content formatted and saved in chunks successfully.")
|
|
|
+ except Exception as e:
|
|
|
+ logging.error(f"An error occurred: {e}")
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|