123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778 |
- import pandas as pd
- from qdrant_client import QdrantClient, models
- from collections import defaultdict
- # Load the data
# Read the three ml-latest-small CSV files into module-level DataFrames,
# in the order tags -> movies -> ratings.
tags, movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/ml-latest-small/tags.csv",
        "./data/ml-latest-small/movies.csv",
        "./data/ml-latest-small/ratings.csv",
    )
)
- # Initialize Qdrant client and create collections
def init_qdrant():
    """Create an in-memory Qdrant client with an empty "movielens" collection.

    The collection is created without dense vectors and with a single sparse
    vector named "ratings" that uses default ``SparseVectorParams``.

    Returns:
        The ``QdrantClient`` on which the collection was created.
    """
    # ":memory:" keeps everything in-process, which is enough for this demo.
    client = QdrantClient(":memory:")
    sparse_config = {"ratings": models.SparseVectorParams()}
    client.create_collection(
        collection_name="movielens",
        vectors_config={},
        sparse_vectors_config=sparse_config,
    )
    return client
- # Load data and upload to Qdrant
def load_data(qdrant):
    """Normalize the module-level ``ratings`` and upload one point per user.

    Side effect: adds a ``normalized_rating`` column to ``ratings`` holding
    ``(rating - mean(rating)) / std(rating)``.

    Rows are then grouped by ``userId``; each group becomes a point in the
    "movielens" collection whose id is the user id and whose "ratings" sparse
    vector uses movie ids as indices and normalized ratings as values.

    Args:
        qdrant: Client object providing ``upload_points``.
    """
    rating_col = ratings.rating
    centered = rating_col - rating_col.mean(axis=0)
    ratings['normalized_rating'] = centered / rating_col.std()

    per_user = defaultdict(lambda: {"values": [], "indices": []})
    for user_id, movie_id, score in zip(
        ratings.userId, ratings.movieId, ratings.normalized_rating
    ):
        entry = per_user[user_id]
        entry["values"].append(score)
        entry["indices"].append(movie_id)

    # Build the points lazily so they are only created as the upload
    # consumes them.
    points = (
        models.PointStruct(id=user_id, vector={"ratings": sparse}, payload={})
        for user_id, sparse in per_user.items()
    )
    qdrant.upload_points("movielens", points)
- # Function to input and normalize ratings
def input_ratings(user_ratings, ratings):
    """Normalize a user's raw ratings against the dataset's rating distribution.

    Each rating is converted to ``(rating - mean) / std``, where ``mean`` and
    ``std`` are taken from the ``rating`` column of ``ratings``.

    Args:
        user_ratings: Mapping describing the user's ratings. Two shapes are
            accepted:

            * ``{anything: (movie_id, rating)}`` - the value carries the
              movie id (the only shape the previous implementation handled);
            * ``{movie_id: rating}`` - the key is the movie id. This shape
              used to raise ``TypeError`` on unpacking.
        ratings: Object with a ``rating`` attribute that provides ``mean()``
            and ``std()`` (e.g. a pandas DataFrame with a ``rating`` column).

    Returns:
        dict mapping movie id to the normalized rating.

    Raises:
        ValueError: If a value is iterable but does not hold exactly two items.
    """
    mean_rating = ratings.rating.mean()
    std_rating = ratings.rating.std()

    final_ratings = {}
    for key, entry in user_ratings.items():
        try:
            movie_id, user_rating = entry
        except TypeError:
            # The value is a bare (non-iterable) rating, so the key is the
            # movie id.
            movie_id, user_rating = key, entry
        final_ratings[movie_id] = (user_rating - mean_rating) / std_rating

    return final_ratings
- # Search and recommendation function
def recommend_movies(qdrant, movies, my_ratings, *, limit=20, top_k=5):
    """Recommend movie titles from the rating vectors of the closest matches.

    ``my_ratings`` is turned into a sparse query vector (movie ids as indices,
    ratings as values) and searched against the "ratings" sparse vector of the
    "movielens" collection. For every returned hit, the stored ratings of
    movies that are not in ``my_ratings`` are summed per movie; the movies
    with the highest totals are returned by title.

    Args:
        qdrant: Client object providing ``search``.
        movies: Table with ``movieId`` and ``title`` columns used to resolve
            titles (e.g. a pandas DataFrame).
        my_ratings: Mapping of movie id to rating for the querying user.
        limit: Maximum number of hits requested from the search
            (previously hard-coded to 20).
        top_k: Number of titles to return (previously hard-coded to 5);
            expected to be non-negative.

    Returns:
        list of up to ``top_k`` movie titles, highest accumulated score first.

    Raises:
        IndexError: If a scored movie id has no row in ``movies``.
    """
    query = models.SparseVector(
        indices=list(my_ratings.keys()),
        values=list(my_ratings.values()),
    )

    # NOTE(review): recent qdrant-client releases appear to deprecate
    # `search` in favour of `query_points` - confirm against the pinned
    # client version before upgrading.
    hits = qdrant.search(
        "movielens",
        query_vector=models.NamedSparseVector(name="ratings", vector=query),
        with_vectors=True,  # the hits' own ratings are needed for scoring below
        limit=limit,
    )

    movie_scores = defaultdict(int)
    for hit in hits:
        sparse = hit.vector["ratings"]
        for movie_id, rating in zip(sparse.indices, sparse.values):
            # Only score movies the querying user has not rated yet.
            if movie_id not in my_ratings:
                movie_scores[movie_id] += rating

    ranked = sorted(movie_scores.items(), key=lambda item: item[1], reverse=True)
    return [
        movies[movies.movieId == movie_id].title.values[0]
        for movie_id, _ in ranked[:top_k]
    ]
|