- """
 
- Script to set up a Qdrant collection with provided markdown files.
 
- To use this script, replace the file paths in the NEW_COLLECTIONS list with your own markdown files.
 
- Then, run the script using Python: `python setup_qdrant_collection.py`
 
- """
 
- from pathlib import Path
 
- from qdrant_client import QdrantClient, models
 
- from sentence_transformers import SentenceTransformer
 
- import uuid
 
- import re
 
# Configuration - fill these in if you want to create the collections on a
# hosted Qdrant instance.
QDRANT_URL = "replace with your Qdrant URL"
QDRANT_API_KEY = "replace with your Qdrant API key"
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'  # produces 384-dimensional embeddings
 
# New files to process. Each entry maps a markdown file to the collection it
# should populate.
NEW_COLLECTIONS = [
    {
        "file_path": "path/to/your/markdown/file1.txt",
        "collection_name": "example_collection_1"
    },
    {
        "file_path": "path/to/your/markdown/file2.txt",
        "collection_name": "example_collection_2"
    }
]
 
def markdown_splitter(text, max_chunk=800):
    """Split markdown at headings, packing sections into ~max_chunk-char chunks."""
    # Split before every markdown heading (a line starting with one or more '#').
    sections = re.split(r'(?=^#+ .*)', text, flags=re.MULTILINE)
    chunks = []
    current_chunk = []

    for section in sections:
        # Start a new chunk once adding this section would exceed max_chunk.
        # The `current_chunk` guard avoids emitting an empty chunk when a
        # single section is itself longer than max_chunk.
        if current_chunk and len(''.join(current_chunk)) + len(section) > max_chunk:
            chunks.append(''.join(current_chunk))
            current_chunk = [section]
        else:
            current_chunk.append(section)

    if current_chunk:
        chunks.append(''.join(current_chunk))

    return [{"text": chunk, "header": f"section_{i}"} for i, chunk in enumerate(chunks)]
 
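# For illustration only (not part of the original script): with a small
# max_chunk, the splitter keeps each heading together with its body text.
# The exact output below assumes Python 3.7+ regex split behaviour:
#
#     markdown_splitter("# A\nbody\n# B\nbody", max_chunk=10)
#     -> [{"text": "# A\nbody\n", "header": "section_0"},
#         {"text": "# B\nbody", "header": "section_1"}]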
def get_qdrant_client():
    return QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)


def get_embedding_model():
    return SentenceTransformer(EMBEDDING_MODEL)
 
def process_file(config):
    client = get_qdrant_client()
    embedding_model = get_embedding_model()

    # Create the collection if it does not exist yet. all-MiniLM-L6-v2 outputs
    # 384-dimensional vectors, so the collection is sized to match.
    if not client.collection_exists(config["collection_name"]):
        client.create_collection(
            collection_name=config["collection_name"],
            vectors_config=models.VectorParams(
                size=384,
                distance=models.Distance.COSINE
            )
        )

    # Read the file, chunk it, embed each chunk, and upsert in batches.
    try:
        text = Path(config["file_path"]).read_text(encoding='utf-8')
        chunks = markdown_splitter(text)

        batch_size = 100
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i + batch_size]
            points = []
            for chunk in batch:
                embedding = embedding_model.encode(chunk["text"]).tolist()
                points.append(
                    models.PointStruct(
                        id=str(uuid.uuid4()),
                        vector=embedding,
                        payload=chunk
                    )
                )
            client.upsert(collection_name=config["collection_name"], points=points)

        print(f"Processed {len(chunks)} chunks for {config['collection_name']}")
    except FileNotFoundError:
        print(f"Error: The file at {config['file_path']} was not found. Skipping collection setup.")
 
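# Performance note (an addition, not from the original script):
# SentenceTransformer.encode also accepts a list of strings, so each batch
# could be embedded in a single call instead of one chunk at a time, e.g.:
#
#     texts = [chunk["text"] for chunk in batch]
#     embeddings = embedding_model.encode(texts)  # shape: (len(batch), 384)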
def setup_all_collections():
    for config in NEW_COLLECTIONS:
        process_file(config)
    print("Finished processing all collections.")


if __name__ == "__main__":
    setup_all_collections()
 
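# A minimal sketch (not part of the original script) of how a populated
# collection could be queried afterwards. "example_collection_1" and the
# query text are placeholders, and query_points needs qdrant-client >= 1.10:
#
#     client = get_qdrant_client()
#     model = get_embedding_model()
#     hits = client.query_points(
#         collection_name="example_collection_1",
#         query=model.encode("your search text").tolist(),
#         limit=3,
#     )
#     for point in hits.points:
#         print(point.payload["header"], point.score)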
 