# prep-for-FT.py
# Clean a JSON dataset (single file or directory tree) by decoding extra escapes.
# Usage: python prep-for-FT.py --input input_file.json --output output_file.json
  2. import json
  3. import os
  4. from typing import Dict, Any
  5. from pathlib import Path
  6. import ast
  def clean_json_string(json_str: str) -> str:
      """Decode backslash escape sequences that are present in *json_str* as text.

      The value is evaluated as the body of a Python string literal, so a
      literal backslash followed by ``n`` becomes a real newline, a backslash
      followed by ``t`` becomes a tab, a ``u`` escape becomes the code point
      it names, and so on.

      Args:
          json_str: Text that may contain backslash escape sequences.

      Returns:
          The decoded text, or *json_str* unchanged when it cannot be parsed
          as a string-literal body (for example a trailing lone backslash or
          a malformed hex/unicode escape).
      """
      # Function-scope import: the file's import header does not provide ``re``.
      import re

      try:
          # Scan the text as "backslash + following character" pairs or bare
          # single quotes. Pairs are kept verbatim (they are the escapes we
          # want decoded); bare quotes are escaped so they can never close
          # the surrounding ''' delimiter early. Without this, a value ending
          # in two quotes lost them, and crafted input could evaluate to a
          # tuple instead of a string.
          escaped = re.sub(
              r"\\.|'",
              lambda m: "\\'" if m.group() == "'" else m.group(),
              json_str,
              flags=re.DOTALL,
          )
          return ast.literal_eval(f"'''{escaped}'''")
      except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
          # These are the failures ast.literal_eval documents for malformed
          # input; best effort is to hand the text back untouched.
          return json_str
  def _clean_value(value: Any) -> Any:
      """Return *value* with every string nested inside it cleaned."""
      if isinstance(value, str):
          return clean_json_string(value)
      if isinstance(value, dict):
          return process_dict(value)
      if isinstance(value, list):
          # Recurse per item so lists nested directly inside lists are
          # traversed too, not only lists that sit directly under a dict key.
          return [_clean_value(item) for item in value]
      # Numbers, booleans and None carry no escapes; pass them through.
      return value


  def process_dict(data: Dict[str, Any]) -> Dict[str, Any]:
      """Recursively clean every string value in *data*.

      String values are passed through ``clean_json_string``; dicts and lists
      are rebuilt with their contents cleaned at any nesting depth. Keys are
      left untouched, and all other value types are passed through unchanged.

      Args:
          data: Dictionary to clean. It is not modified.

      Returns:
          A new dictionary with the same keys and cleaned values.
      """
      return {key: _clean_value(value) for key, value in data.items()}
  def process_dataset(input_path: str, output_path: str):
      """Clean one JSON file, or every ``*.json`` file under a directory.

      For a file, the JSON document is loaded, its strings are cleaned via
      ``process_dict`` and the result is written to *output_path*. For a
      directory, every ``*.json`` entry found recursively is processed and
      written to the same relative location under *output_path*.

      Args:
          input_path: Path to a JSON file or to a directory of JSON files.
          output_path: Destination file (file input) or destination directory
              (directory input). Missing parent directories are created.

      Raises:
          FileNotFoundError: If *input_path* is neither an existing file nor
              an existing directory.
      """
      input_path = Path(input_path)
      output_path = Path(output_path)

      if input_path.is_dir():
          output_path.mkdir(parents=True, exist_ok=True)
          # Snapshot the matches before writing anything: if the output
          # directory lives inside the input directory, a lazy glob could
          # otherwise pick up files created during this very run.
          for file_path in sorted(input_path.glob('**/*.json')):
              # Mirror the input's directory layout in the output tree.
              relative_path = file_path.relative_to(input_path)
              process_dataset(str(file_path), str(output_path / relative_path))
          return

      if not input_path.is_file():
          # Previously a mistyped path was silently ignored and the run
          # looked successful while producing no output.
          raise FileNotFoundError(f"Input path does not exist: {input_path}")

      with open(input_path, 'r', encoding='utf-8') as f:
          data = json.load(f)

      # Wrap the document in a one-key dict so the top-level value gets
      # exactly the same treatment as any nested value. This covers dicts,
      # lists, bare strings and scalars alike; the latter two used to leave
      # the result variable unassigned.
      cleaned_data = process_dict({"data": data})["data"]

      output_path.parent.mkdir(parents=True, exist_ok=True)
      with open(output_path, 'w', encoding='utf-8') as f:
          json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
  if __name__ == "__main__":
      import argparse

      # (flag, help text) for each mandatory command-line option.
      cli_options = (
          ('--input', 'Path to input JSON file or directory'),
          ('--output', 'Path for cleaned output'),
      )

      arg_parser = argparse.ArgumentParser(
          description='Clean JSON dataset by removing extra escapes'
      )
      for flag, help_text in cli_options:
          arg_parser.add_argument(flag, required=True, help=help_text)

      cli_args = arg_parser.parse_args()
      # Hand the two paths straight to the dataset cleaner.
      process_dataset(cli_args.input, cli_args.output)