# setup_qdrant_collection.py
  1. """
  2. Script to set up a Qdrant collection with provided markdown files.
  3. To use this script, replace the file paths in the NEW_COLLECTIONS list with your own markdown files.
  4. Then, run the script using Python: `python setup_qdrant_collection.py`
  5. """
  6. from pathlib import Path
  7. from qdrant_client import QdrantClient, models
  8. from sentence_transformers import SentenceTransformer
  9. import uuid
  10. import re
  11. # Configuration - in case you want to create an online collection
  12. QDRANT_URL = "replace with your Qdrant URL"
  13. QDRANT_API_KEY = "replace with your qdrant API key"
  14. EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
  15. # New files to process
  16. # IMPORTANT: Added the configuration for readme_blogs_latest here
  17. NEW_COLLECTIONS = [
  18. {
  19. "file_path": "path/to/your/markdown/file1.txt",
  20. "collection_name": "example_collection_1"
  21. },
  22. {
  23. "file_path": "path/to/your/markdown/file2.txt",
  24. "collection_name": "example_collection_2"
  25. }
  26. ]
  27. def markdown_splitter(text, max_chunk=800):
  28. sections = re.split(r'(?=^#+ .*)', text, flags=re.MULTILINE)
  29. chunks = []
  30. current_chunk = []
  31. for section in sections:
  32. if len(''.join(current_chunk)) + len(section) > max_chunk:
  33. chunks.append(''.join(current_chunk))
  34. current_chunk = [section]
  35. else:
  36. current_chunk.append(section)
  37. if current_chunk:
  38. chunks.append(''.join(current_chunk))
  39. return [{"text": chunk, "header": f"section_{i}"} for i, chunk in enumerate(chunks)]
  40. def get_qdrant_client():
  41. return QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
  42. def get_embedding_model():
  43. return SentenceTransformer(EMBEDDING_MODEL)
  44. def process_file(config):
  45. client = get_qdrant_client()
  46. embedding_model = get_embedding_model()
  47. # Create collection if not exists
  48. if not client.collection_exists(config["collection_name"]):
  49. client.create_collection(
  50. collection_name=config["collection_name"],
  51. vectors_config=models.VectorParams(
  52. size=384,
  53. distance=models.Distance.COSINE
  54. )
  55. )
  56. # Process and store documents
  57. try:
  58. text = Path(config["file_path"]).read_text(encoding='utf-8')
  59. chunks = markdown_splitter(text)
  60. batch_size = 100
  61. for i in range(0, len(chunks), batch_size):
  62. batch = chunks[i:i+batch_size]
  63. points = []
  64. for chunk in batch:
  65. embedding = embedding_model.encode(chunk["text"]).tolist()
  66. points.append(
  67. models.PointStruct(
  68. id=str(uuid.uuid4()),
  69. vector=embedding,
  70. payload=chunk
  71. )
  72. )
  73. client.upsert(collection_name=config["collection_name"], points=points)
  74. print(f"Processed {len(chunks)} chunks for {config['collection_name']}")
  75. except FileNotFoundError:
  76. print(f"Error: The file at {config['file_path']} was not found. Skipping collection setup.")
  77. def setup_all_collections():
  78. for config in NEW_COLLECTIONS:
  79. process_file(config)
  80. print("All collections created and populated successfully!")
# Script entry point: populate every collection listed in NEW_COLLECTIONS.
if __name__ == "__main__":
    setup_all_collections()