raft.py 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import logging
  2. import os
  3. import argparse
  4. from raft_utils import generate_questions, add_chunk_to_dataset
  5. from format import DatasetConverter, datasetFormats, outputDatasetTypes
  6. from config import load_config
# Configure the root logger when the module is loaded: INFO level and above,
# each record rendered as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  8. def main(api_config):
  9. ds = None
  10. try:
  11. logging.info("Starting to generate question pair.")
  12. # Generate questions as list for each chunk
  13. chunk_questions_zip = generate_questions(api_config)
  14. if not chunk_questions_zip:
  15. logging.warning("No questions generated from text. Please check the api_config or model configuration.")
  16. return
  17. for chunk, questions in chunk_questions_zip:
  18. logging.info(f"Chunk: {chunk}, question length: {len(questions)}")
  19. for question in questions:
  20. logging.info(f"Question: {question}")
  21. logging.info(f"Successfully generated {sum([len(q) for c,q in chunk_questions_zip])} question/answer pairs.")
  22. ds = add_chunk_to_dataset(chunk_questions_zip,api_config,ds)
  23. ds.save_to_disk(args.output)
  24. logging.info(f"Data successfully written to {api_config['output']}. Process completed.")
  25. formatter = DatasetConverter()
  26. # Extract format specific params
  27. format_params = {}
  28. formatter.convert(ds=ds, format=args.output_format, output_path=args.output+"raft", output_type=args.output_type, params=format_params)
  29. except Exception as e:
  30. logging.error(f"An unexpected error occurred during the process: {e}",exc_info=True)
  31. def parse_arguments():
  32. # Define command line arguments for the script
  33. parser = argparse.ArgumentParser(
  34. description="Generate RAFT question/answer/context pairs from documentation."
  35. )
  36. parser.add_argument(
  37. "-t", "--questions_per_chunk",
  38. type=int,
  39. default=3,
  40. help="Specify the number of question pairs to generate per chunk."
  41. )
  42. parser.add_argument(
  43. "-m", "--model",
  44. default="meta-llama/Meta-Llama-3-70B-Instruct",
  45. help="Select the model to use for generation."
  46. )
  47. parser.add_argument(
  48. "-c", "--config_path",
  49. default="./raft.yaml",
  50. help="Set the configuration file path that has system prompt along with language, dataset path and number of questions."
  51. )
  52. parser.add_argument(
  53. "-u", "--endpoint_url",
  54. default="http://localhost:8001/v1",
  55. type=str,
  56. help="LLM API url for generating question/answer pairs."
  57. )
  58. parser.add_argument(
  59. "-k", "--api_key",
  60. default="EMPTY",
  61. type=str,
  62. help="LLM API key for generating question/answer pairs."
  63. )
  64. parser.add_argument("--chunk_size", type=int, default=1000, help="The size of each chunk in number of tokens")
  65. parser.add_argument("-o","--output", type=str, default="./output/", help="The path at which to save the dataset")
  66. parser.add_argument("--output-format", type=str, default="hf", help="Format to convert the dataset to. Defaults to hf.", choices=datasetFormats)
  67. parser.add_argument("--output-type", type=str, default="jsonl", help="Type to export the dataset to. Defaults to jsonl.", choices=outputDatasetTypes)
  68. return parser.parse_args()
  69. if __name__ == "__main__":
  70. logging.info("Initializing the process and loading configuration...")
  71. args = parse_arguments()
  72. api_config = load_config(args.config_path)
  73. api_config["questions_per_chunk"] = args.questions_per_chunk
  74. api_config["model"] = args.model
  75. api_config["chunk_size"] = args.chunk_size
  76. api_config["endpoint_url"] = args.endpoint_url
  77. api_config["output"] = args.output
  78. api_config["api_key"] = args.api_key
  79. # if OPENAI_API_KEY is defined in the system environment, use it as the API key
  80. if os.environ.get('API_KEY') is not None:
  81. api_config["api_key"] = os.environ["API_KEY"]
  82. logging.info(f"Configuration loaded. Generating {args.questions_per_chunk} question per chunk using model '{args.model}'.")
  83. logging.info(f"Chunk size: {args.chunk_size}.")
  84. logging.info(f"num_distract_docs: {api_config['num_distract_docs']}, oracle_p: {api_config['oracle_p']}")
  85. logging.info(f"Will use endpoint_url: {args.endpoint_url}.")
  86. logging.info(f"Output will be written to {args.output}.")
  87. main(api_config)