# Usage: python prep-for-FT.py --input input_file.json --output output_file.json
"""Clean a JSON dataset by decoding extra (doubled) escape sequences.

Every string value found in the JSON document(s) is re-read as a Python
string literal, so a literal backslash followed by ``n`` becomes a real
newline. Input may be a single JSON file or a directory tree of ``*.json``
files; the output mirrors the input layout.
"""
import ast
import json
import os
import warnings
from pathlib import Path
from typing import Any, Dict

# Triple-quote delimiters tried, in order, when wrapping a value as a literal.
_QUOTE_STYLES = ("'''", '"""')


def clean_json_string(json_str):
    """Return *json_str* with backslash escape sequences decoded.

    The value is wrapped in triple quotes and parsed with
    ``ast.literal_eval``. If it cannot be parsed safely, it is returned
    unchanged (best-effort cleaning).

    Args:
        json_str: The string to clean. Non-string values are returned as-is.

    Returns:
        The decoded string, or the original value when decoding is not
        possible.
    """
    if not isinstance(json_str, str):
        return json_str

    for quote in _QUOTE_STYLES:
        # A value containing the delimiter, or ending in its quote character,
        # would close the literal early and parse as something other than a
        # single string (e.g. a tuple or a concatenation). Try the other
        # delimiter instead.
        if quote in json_str or json_str.endswith(quote[0]):
            continue
        try:
            with warnings.catch_warnings():
                # Unknown escapes such as "\d" are kept verbatim but trigger a
                # warning at parse time; silence it so each value does not
                # produce noise.
                warnings.simplefilter("ignore")
                cleaned = ast.literal_eval(f"{quote}{json_str}{quote}")
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Malformed escape, trailing backslash, null byte, etc.
            return json_str
        # Defensive: never hand back a non-string parse result.
        return cleaned if isinstance(cleaned, str) else json_str

    # Both delimiters clash with the content; leave the value untouched.
    return json_str


def _clean_value(value: Any) -> Any:
    """Recursively clean strings inside dicts and lists; pass others through."""
    if isinstance(value, str):
        return clean_json_string(value)
    if isinstance(value, dict):
        return process_dict(value)
    if isinstance(value, list):
        return [_clean_value(item) for item in value]
    return value


def process_dict(data: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return a copy of *data* with every string value cleaned.

    Nested dicts and lists (at any depth) are processed recursively. Keys
    are kept as they are; only values are cleaned.

    Args:
        data: Mapping loaded from JSON.

    Returns:
        A new dict with the same keys and cleaned values.
    """
    return {key: _clean_value(value) for key, value in data.items()}


def process_dataset(input_path, output_path):
    """Clean one JSON file, or every ``*.json`` file under a directory.

    Args:
        input_path: Path to a JSON file or to a directory searched
            recursively for ``*.json`` files.
        output_path: Destination file (file input) or destination directory
            (directory input). Missing parent directories are created.

    Raises:
        FileNotFoundError: If *input_path* is neither an existing file nor an
            existing directory.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    if input_path.is_file():
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handles any JSON top-level value: object, array or scalar.
        cleaned_data = _clean_value(data)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
    elif input_path.is_dir():
        output_path.mkdir(parents=True, exist_ok=True)
        # Snapshot the matches before writing anything, so files created
        # during this run are not picked up if the output directory lives
        # under the input directory.
        for file_path in sorted(input_path.glob('**/*.json')):
            if not file_path.is_file():
                # A directory named "*.json": its files are matched by the
                # recursive glob on their own.
                continue
            relative_path = file_path.relative_to(input_path)
            process_dataset(file_path, output_path / relative_path)
    else:
        raise FileNotFoundError(
            f"Input path is not an existing file or directory: {input_path}"
        )


def main():
    """Parse command-line arguments and run the cleaning."""
    import argparse

    parser = argparse.ArgumentParser(description='Clean JSON dataset by removing extra escapes')
    parser.add_argument('--input', required=True, help='Path to input JSON file or directory')
    parser.add_argument('--output', required=True, help='Path for cleaned output')
    args = parser.parse_args()
    process_dataset(args.input, args.output)


if __name__ == "__main__":
    main()