# qdrant_setup_partial.py -- Qdrant collection setup script
- # qdrant_setup_partial.py
import os
import re
import uuid
from pathlib import Path

from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
# Configuration
# NOTE(security): credentials are read from the environment when available;
# the hard-coded literals remain only as a backward-compatible fallback and
# should be rotated and removed from version control.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://754e68dd-c297-4ab2-9833-c81cbfbfb75c.eu-west-2-0.aws.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get(
    "QDRANT_API_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.kRiEKHQ5s4KUWoYJqhQ29tbmbgfqFT2jAAfgrPTshSM",
)

# Sentence-transformers model used for all embeddings.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

# Source files to process -- one Qdrant collection is created per entry.
# IMPORTANT: Added the configuration for readme_blogs_latest here
NEW_COLLECTIONS = [
    {
        "file_path": "/home/ubuntu/nilesh-workspace-backup-20250707/Blog_generation/internal-llama-cookbook/end-to-end-use-cases/technical_blogger/Blog_generation/cookbook_metadata/mdfiles_latest.txt",
        "collection_name": "readme_blogs_latest",
    },
    {
        "file_path": "/home/ubuntu/nilesh-workspace-backup-20250707/Blog_generation/internal-llama-cookbook/end-to-end-use-cases/technical_blogger/Blog_generation/cookbook_metadata/3rd_party_integrations.txt",
        "collection_name": "3rd_party_integrations",
    },
    {
        "file_path": "/home/ubuntu/nilesh-workspace-backup-20250707/Blog_generation/internal-llama-cookbook/end-to-end-use-cases/technical_blogger/Blog_generation/cookbook_metadata/Getting_started_files.txt",
        "collection_name": "getting_started_files",
    },
]
def markdown_splitter(text, max_chunk=800):
    """Split markdown *text* into chunks of roughly *max_chunk* characters.

    The text is split at markdown header boundaries (lines starting with
    one or more ``#``) and consecutive sections are packed greedily into
    chunks until adding the next section would exceed *max_chunk*.  A single
    section longer than *max_chunk* becomes its own chunk.

    Returns a list of ``{"text": str, "header": "section_<i>"}`` dicts.
    Empty chunks are never emitted (fixes a bug where an oversized first
    section produced a leading ``""`` chunk).
    """
    # Lookahead split keeps each header line attached to its own section.
    sections = re.split(r'(?=^#+ .*)', text, flags=re.MULTILINE)
    chunks = []
    current = []
    current_len = 0  # running length; avoids re-joining on every iteration

    for section in sections:
        # Only flush a *non-empty* accumulator, so no empty chunk is emitted.
        if current and current_len + len(section) > max_chunk:
            chunks.append(''.join(current))
            current = [section]
            current_len = len(section)
        else:
            current.append(section)
            current_len += len(section)

    tail = ''.join(current)
    if tail:
        chunks.append(tail)

    return [{"text": chunk, "header": f"section_{i}"} for i, chunk in enumerate(chunks)]
def get_qdrant_client():
    """Build and return a Qdrant client for the configured cloud cluster."""
    client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    return client
def get_embedding_model():
    """Load and return the configured sentence-transformers embedding model."""
    model = SentenceTransformer(EMBEDDING_MODEL)
    return model
def process_file(config):
    """Create (if needed) and populate one Qdrant collection from a text file.

    Args:
        config: dict with ``"file_path"`` (source text file) and
            ``"collection_name"`` (target Qdrant collection) keys.

    Reads the file, chunks it with ``markdown_splitter``, embeds each chunk
    and upserts the points in batches of 100.  If the file is missing, the
    function prints an error and returns *without* creating the collection
    (the old code created an empty collection before noticing the file was
    absent, contradicting its own "Skipping collection setup" message).
    """
    # Read the source first so a missing file does not leave an empty collection.
    try:
        text = Path(config["file_path"]).read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"Error: The file at {config['file_path']} was not found. Skipping collection setup.")
        return

    client = get_qdrant_client()
    embedding_model = get_embedding_model()

    # Create collection if not exists
    if not client.collection_exists(config["collection_name"]):
        client.create_collection(
            collection_name=config["collection_name"],
            vectors_config=models.VectorParams(
                size=384,  # output dimension of all-MiniLM-L6-v2
                distance=models.Distance.COSINE,
            ),
        )

    chunks = markdown_splitter(text)

    batch_size = 100
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        # Encode the whole batch in one call -- sentence-transformers batches
        # internally, which is far faster than one encode() per chunk.
        embeddings = embedding_model.encode([chunk["text"] for chunk in batch])
        points = [
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding.tolist(),
                payload=chunk,
            )
            for chunk, embedding in zip(batch, embeddings)
        ]
        client.upsert(collection_name=config["collection_name"], points=points)

    print(f"Processed {len(chunks)} chunks for {config['collection_name']}")
def setup_all_collections():
    """Run the ingestion pipeline once for every configured collection."""
    for collection_config in NEW_COLLECTIONS:
        process_file(collection_config)
    print("All collections created and populated successfully!")
# Script entry point: ingest every configured file into its Qdrant collection.
if __name__ == "__main__":
    setup_all_collections()