"""Script to set up a Qdrant collection with provided markdown files.To use this script, replace the file paths in the NEW_COLLECTIONS list with your own markdown files.Then, run the script using Python: `python setup_qdrant_collection.py`"""from pathlib import Pathfrom qdrant_client import QdrantClient, modelsfrom sentence_transformers import SentenceTransformerimport uuidimport re# Configuration - in case you want to create an online collectionQDRANT_URL = "replace with your Qdrant URL"QDRANT_API_KEY = "replace with your qdrant API key"EMBEDDING_MODEL = 'all-MiniLM-L6-v2'# New files to process# IMPORTANT: Added the configuration for readme_blogs_latest hereNEW_COLLECTIONS = [    {        "file_path": "path/to/your/markdown/file1.txt",        "collection_name": "example_collection_1"    },    {        "file_path": "path/to/your/markdown/file2.txt",        "collection_name": "example_collection_2"    }]def markdown_splitter(text, max_chunk=800):    sections = re.split(r'(?=^#+ .*)', text, flags=re.MULTILINE)    chunks = []    current_chunk = []        for section in sections:        if len(''.join(current_chunk)) + len(section) > max_chunk:            chunks.append(''.join(current_chunk))            current_chunk = [section]        else:            current_chunk.append(section)        if current_chunk:        chunks.append(''.join(current_chunk))        return [{"text": chunk, "header": f"section_{i}"} for i, chunk in enumerate(chunks)]def get_qdrant_client():    return QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)def get_embedding_model():    return SentenceTransformer(EMBEDDING_MODEL)def process_file(config):    client = get_qdrant_client()    embedding_model = get_embedding_model()        # Create collection if not exists    if not client.collection_exists(config["collection_name"]):        client.create_collection(            collection_name=config["collection_name"],            vectors_config=models.VectorParams(                size=384,                distance=models.Distance.COSINE            )        )        # Process and store documents    try:        text = Path(config["file_path"]).read_text(encoding='utf-8')        chunks = markdown_splitter(text)                batch_size = 100        for i in range(0, len(chunks), batch_size):            batch = chunks[i:i+batch_size]            points = []            for chunk in batch:                embedding = embedding_model.encode(chunk["text"]).tolist()                points.append(                    models.PointStruct(                        id=str(uuid.uuid4()),                        vector=embedding,                        payload=chunk                    )                )            client.upsert(collection_name=config["collection_name"], points=points)                print(f"Processed {len(chunks)} chunks for {config['collection_name']}")    except FileNotFoundError:        print(f"Error: The file at {config['file_path']} was not found. Skipping collection setup.")def setup_all_collections():    for config in NEW_COLLECTIONS:        process_file(config)    print("All collections created and populated successfully!")if __name__ == "__main__":    setup_all_collections()
 |
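
# --- Optional: querying a populated collection ---------------------------------
# A minimal sketch of a semantic search against one of the collections created
# above, assuming a recent qdrant-client that exposes query_points(). The
# collection name, query text, and limit are placeholder values for
# illustration, not part of the setup script itself.

def example_query(collection_name="example_collection_1",
                  query_text="What does this document cover?",
                  limit=3):
    client = get_qdrant_client()
    model = get_embedding_model()
    # Queries must be embedded with the same model used at ingestion time.
    query_vector = model.encode(query_text).tolist()
    results = client.query_points(
        collection_name=collection_name,
        query=query_vector,
        limit=limit,
    )
    for point in results.points:
        print(f"score={point.score:.3f}  {point.payload['header']}")
        print(point.payload["text"][:200])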