# conv_html_to_markdown.py
  1. import json
  2. import logging
  3. from bs4 import BeautifulSoup
  4. from markdownify import markdownify as md
  5. from concurrent.futures import ThreadPoolExecutor
  6. class HTMLToMarkdownConverter:
  7. def __init__(self, strip_tags=None, convert_links=True):
  8. """ Initialize converter with configuration options. """
  9. self.strip_tags = strip_tags or []
  10. self.convert_links = convert_links
  11. def convert(self, html_content):
  12. """ Convert HTML content to Markdown. """
  13. try:
  14. curated_html = self.curate_content(html_content)
  15. return md(curated_html, strip_tags=self.strip_tags, convert_links=self.convert_links)
  16. except Exception as e:
  17. logging.error(f"Error in HTML to Markdown conversion: {e}")
  18. return ""
  19. def curate_content(self, html):
  20. """ Curate the HTML content before conversion. """
  21. soup = BeautifulSoup(html, 'html.parser')
  22. # Implement specific curation logic here based on the content nature
  23. return str(soup)
  24. class DatasetFormatter:
  25. def __init__(self, converter):
  26. self.converter = converter
  27. def format_entry(self, entry):
  28. """ Format a single entry from the dataset. """
  29. try:
  30. title = entry.get('title', 'Untitled')
  31. url = entry.get('url', '')
  32. html_content = entry.get('html', '')
  33. markdown_content = self.converter.convert(html_content)
  34. return self.structure_markdown(title, url, markdown_content)
  35. except Exception as e:
  36. logging.error(f"Error formatting entry: {e}")
  37. return ""
  38. def structure_markdown(self, title, url, content):
  39. """ Structure the Markdown content with headers, lists, etc. """
  40. structured_content = f"## {title}\n\n"
  41. if url:
  42. structured_content += f"[Read More]({url})\n\n"
  43. structured_content += content
  44. return structured_content
  45. def format_dataset(self, data):
  46. """ Format the entire dataset. """
  47. formatted_content = ""
  48. for entry in data:
  49. formatted_content += self.format_entry(entry)
  50. return formatted_content
  51. def load_json(file_path):
  52. """ Load the JSON file. """
  53. with open(file_path, 'r') as file:
  54. return json.load(file)
  55. def save_output_in_chunks(file_path, contents, chunk_size=1024):
  56. """ Save the formatted content in chunks. """
  57. with open(file_path, 'w') as file:
  58. for content in contents:
  59. file.write(content)
  60. if len(content) > chunk_size:
  61. file.flush() # Flush after writing a large chunk
  62. def chunk_dataset(data, chunk_size):
  63. """ Divide the dataset into chunks of approximately equal size. """
  64. for i in range(0, len(data), chunk_size):
  65. yield data[i:i + chunk_size]
  66. def process_chunk(chunk):
  67. """ Process a single chunk of the dataset. """
  68. formatter = DatasetFormatter(HTMLToMarkdownConverter())
  69. return formatter.format_dataset(chunk)
  70. def main():
  71. logging.basicConfig(level=logging.INFO)
  72. try:
  73. original_data = load_json('transformers_documentation-gpt-crawler_output.json')
  74. chunk_size = 200 # Adjust chunk size as needed
  75. max_threads = 10 # Adjust the maximum number of threads as needed
  76. chunks = list(chunk_dataset(original_data, chunk_size))
  77. formatted_contents = []
  78. with ThreadPoolExecutor(max_workers=max_threads) as executor:
  79. results = executor.map(process_chunk, chunks)
  80. for result in results:
  81. formatted_contents.append(result)
  82. final_formatted_content = '\n'.join(formatted_contents)
  83. save_output_in_chunks('transformers_documentation-gpt-crawler-curated_markdown.md', formatted_contents)
  84. logging.info("Content formatted and saved in chunks successfully.")
  85. except Exception as e:
  86. logging.error(f"An error occurred: {e}")
  87. if __name__ == "__main__":
  88. main()