# prep-for-FT.py: clean JSON datasets by removing extra escapes.
  1. import json
  2. import os
  3. from typing import Dict, Any
  4. from pathlib import Path
  5. import ast
  6. def clean_json_string(json_str: str) -> str:
  7. """Clean a JSON string by removing extra escapes."""
  8. # First try to parse it as a raw string literal
  9. try:
  10. # This handles cases where the string is like "\\n" -> "\n"
  11. cleaned = ast.literal_eval(f"'''{json_str}'''")
  12. return cleaned
  13. except:
  14. return json_str
  15. def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
  16. """Recursively process dictionary values to clean strings."""
  17. cleaned_data = {}
  18. for key, value in data.items():
  19. if isinstance(value, str):
  20. cleaned_data[key] = clean_json_string(value)
  21. elif isinstance(value, dict):
  22. cleaned_data[key] = process_dict(value)
  23. elif isinstance(value, list):
  24. cleaned_data[key] = [
  25. process_dict(item) if isinstance(item, dict)
  26. else clean_json_string(item) if isinstance(item, str)
  27. else item
  28. for item in value
  29. ]
  30. else:
  31. cleaned_data[key] = value
  32. return cleaned_data
  33. def process_dataset(input_path: str, output_path: str):
  34. """Process a dataset file or directory and save cleaned version."""
  35. input_path = Path(input_path)
  36. output_path = Path(output_path)
  37. # Create output directory if it doesn't exist
  38. output_path.parent.mkdir(parents=True, exist_ok=True)
  39. if input_path.is_file():
  40. # Process single file
  41. with open(input_path, 'r', encoding='utf-8') as f:
  42. data = json.load(f)
  43. # Clean the data
  44. if isinstance(data, dict):
  45. cleaned_data = process_dict(data)
  46. elif isinstance(data, list):
  47. cleaned_data = [
  48. process_dict(item) if isinstance(item, dict)
  49. else clean_json_string(item) if isinstance(item, str)
  50. else item
  51. for item in data
  52. ]
  53. # Write cleaned data
  54. with open(output_path, 'w', encoding='utf-8') as f:
  55. json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
  56. elif input_path.is_dir():
  57. # Process directory of files
  58. output_path.mkdir(parents=True, exist_ok=True)
  59. for file_path in input_path.glob('**/*.json'):
  60. # Maintain directory structure in output
  61. relative_path = file_path.relative_to(input_path)
  62. output_file = output_path / relative_path
  63. output_file.parent.mkdir(parents=True, exist_ok=True)
  64. # Process individual file
  65. process_dataset(str(file_path), str(output_file))
  66. if __name__ == "__main__":
  67. import argparse
  68. parser = argparse.ArgumentParser(description='Clean JSON dataset by removing extra escapes')
  69. parser.add_argument('--input', required=True, help='Path to input JSON file or directory')
  70. parser.add_argument('--output', required=True, help='Path for cleaned output')
  71. args = parser.parse_args()
  72. process_dataset(args.input, args.output)