prep-for-FT.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
# Usage: python prep-for-FT.py --input input_file.json --output output_file.json
  2. import json
  3. import os
  4. from typing import Dict, Any
  5. from pathlib import Path
  6. import ast
  7. def clean_json_string(json_str):
  8. try:
  9. # This handles cases where the string is like "\\n" -> "\n"
  10. cleaned = ast.literal_eval(f"'''{json_str}'''")
  11. return cleaned
  12. except:
  13. return json_str
  14. def process_dict(data):
  15. #iteratively clean
  16. cleaned_data = {}
  17. for key, value in data.items():
  18. if isinstance(value, str):
  19. cleaned_data[key] = clean_json_string(value)
  20. elif isinstance(value, dict):
  21. cleaned_data[key] = process_dict(value)
  22. elif isinstance(value, list):
  23. cleaned_data[key] = [
  24. process_dict(item) if isinstance(item, dict)
  25. else clean_json_string(item) if isinstance(item, str)
  26. else item
  27. for item in value
  28. ]
  29. else:
  30. cleaned_data[key] = value
  31. return cleaned_data
  32. def process_dataset(input_path, output_path):
  33. input_path = Path(input_path)
  34. output_path = Path(output_path)
  35. output_path.parent.mkdir(parents=True, exist_ok=True)
  36. if input_path.is_file():
  37. with open(input_path, 'r', encoding='utf-8') as f:
  38. data = json.load(f)
  39. if isinstance(data, dict):
  40. cleaned_data = process_dict(data)
  41. elif isinstance(data, list):
  42. cleaned_data = [
  43. process_dict(item) if isinstance(item, dict)
  44. else clean_json_string(item) if isinstance(item, str)
  45. else item
  46. for item in data
  47. ]
  48. with open(output_path, 'w', encoding='utf-8') as f:
  49. json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
  50. elif input_path.is_dir():
  51. output_path.mkdir(parents=True, exist_ok=True)
  52. for file_path in input_path.glob('**/*.json'):
  53. relative_path = file_path.relative_to(input_path)
  54. output_file = output_path / relative_path
  55. output_file.parent.mkdir(parents=True, exist_ok=True)
  56. process_dataset(str(file_path), str(output_file))
  57. if __name__ == "__main__":
  58. import argparse
  59. parser = argparse.ArgumentParser(description='Clean JSON dataset by removing extra escapes')
  60. parser.add_argument('--input', required=True, help='Path to input JSON file or directory')
  61. parser.add_argument('--output', required=True, help='Path for cleaned output')
  62. args = parser.parse_args()
  63. process_dataset(args.input, args.output)