"""Clean a JSON dataset by removing extra escapes from its string values.

Usage:
    python prep-for-FT.py --input input_file.json --output output_file.json
"""

import argparse
import ast
import json
import os
import re
from pathlib import Path
from typing import Any, Dict

# Matches either a backslash escape pair (kept untouched) or a bare single
# quote (which must be escaped before the text is wrapped in '''...''').
_ESCAPE_OR_QUOTE_RE = re.compile(r"\\.|'", re.DOTALL)


def _protect_quote(match):
    """Return the matched token, backslash-escaping it if it is a bare quote."""
    token = match.group(0)
    return "\\'" if token == "'" else token


def clean_json_string(json_str):
    """Interpret Python-style backslash escapes found in ``json_str``.

    The text is evaluated as the body of a triple-quoted Python string
    literal, so a two-character sequence such as backslash + ``n`` becomes
    a real newline.

    Args:
        json_str: The text to clean. Non-string values are converted with
            ``str()`` before evaluation.

    Returns:
        The unescaped string, or ``json_str`` unchanged when the text is
        not a valid string-literal body (for example a trailing lone
        backslash or a truncated ``\\x`` escape).
    """
    # Escape bare single quotes so the content can never terminate the
    # surrounding literal early. Without this, a value ending in a quote is
    # left uncleaned, and one containing ''' can be re-parsed as several
    # literals or even a tuple.
    literal_body = _ESCAPE_OR_QUOTE_RE.sub(_protect_quote, str(json_str))
    try:
        cleaned = ast.literal_eval(f"'''{literal_body}'''")
    except (SyntaxError, ValueError):
        # Best effort: text that is not a valid literal body stays as it was.
        return json_str
    # Defensive: only ever hand back a string.
    return cleaned if isinstance(cleaned, str) else json_str


def _clean_value(value: Any) -> Any:
    """Recursively clean every string inside a decoded JSON value."""
    if isinstance(value, str):
        return clean_json_string(value)
    if isinstance(value, dict):
        return process_dict(value)
    if isinstance(value, list):
        # Recurse so strings in lists nested inside lists are cleaned too.
        return [_clean_value(item) for item in value]
    return value


def process_dict(data: Dict[Any, Any]) -> Dict[Any, Any]:
    """Return a copy of ``data`` with all nested string values cleaned.

    Keys are kept as they are. Values are cleaned recursively through
    dicts and lists; any other value type is copied through unchanged.
    """
    return {key: _clean_value(value) for key, value in data.items()}


def process_dataset(input_path, output_path):
    """Clean one JSON file, or every ``*.json`` file under a directory.

    Args:
        input_path: Path to a JSON file or to a directory that is searched
            recursively for ``*.json`` files.
        output_path: Destination file (file input) or destination directory
            (directory input). For a directory, the relative layout of the
            input files is reproduced underneath it.

    Raises:
        FileNotFoundError: If ``input_path`` is neither an existing file
            nor an existing directory.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    if input_path.is_file():
        with open(input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handles dict, list and scalar top-level JSON values alike.
        cleaned_data = _clean_value(data)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

    elif input_path.is_dir():
        output_path.mkdir(parents=True, exist_ok=True)
        # Snapshot the matches before writing anything, so files produced
        # during this run are not picked up when the output directory lives
        # inside the input directory.
        for file_path in sorted(input_path.glob('**/*.json')):
            relative_path = file_path.relative_to(input_path)
            output_file = output_path / relative_path
            output_file.parent.mkdir(parents=True, exist_ok=True)
            process_dataset(str(file_path), str(output_file))

    else:
        raise FileNotFoundError(
            f"Input path is not an existing file or directory: {input_path}"
        )


def main():
    """Parse the command-line arguments and run the cleaner."""
    parser = argparse.ArgumentParser(description='Clean JSON dataset by removing extra escapes')
    parser.add_argument('--input', required=True, help='Path to input JSON file or directory')
    parser.add_argument('--output', required=True, help='Path for cleaned output')

    args = parser.parse_args()
    process_dataset(args.input, args.output)


if __name__ == "__main__":
    main()