# main.py - MovieLens movie recommendations using Qdrant sparse vectors.

from collections import defaultdict

import pandas as pd
from qdrant_client import QdrantClient, models

  4. # Load the data
  5. tags = pd.read_csv("./data/ml-latest-small/tags.csv")
  6. movies = pd.read_csv("./data/ml-latest-small/movies.csv")
  7. ratings = pd.read_csv("./data/ml-latest-small/ratings.csv")
  8. # Initialize Qdrant client and create collections
  9. def init_qdrant():
  10. qdrant = QdrantClient(":memory:") # Use in-memory for simplicity
  11. qdrant.create_collection(
  12. "movielens", vectors_config={}, sparse_vectors_config={"ratings": models.SparseVectorParams()}
  13. )
  14. return qdrant
  15. # Load data and upload to Qdrant
  16. def load_data(qdrant):
  17. ratings['normalized_rating'] = (ratings.rating - ratings.rating.mean(axis=0)) / ratings.rating.std()
  18. user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
  19. for row in ratings.itertuples():
  20. user_sparse_vectors[row.userId]["values"].append(row.normalized_rating)
  21. user_sparse_vectors[row.userId]["indices"].append(row.movieId)
  22. def data_generator():
  23. for user_id, vector in user_sparse_vectors.items():
  24. yield models.PointStruct(
  25. id=user_id, vector={"ratings": vector}, payload={}
  26. )
  27. qdrant.upload_points("movielens", data_generator())
  28. # Function to input and normalize ratings
  29. def input_ratings(user_ratings, ratings):
  30. final_ratings = {}
  31. mean_rating = ratings.rating.mean()
  32. std_rating = ratings.rating.std()
  33. for movie_id, user_rating in user_ratings.values():
  34. normalized_input_rating = (user_rating - mean_rating) / std_rating
  35. final_ratings[movie_id] = normalized_input_rating
  36. return final_ratings
  37. # Search and recommendation function
  38. def recommend_movies(qdrant, movies, my_ratings):
  39. def to_vector(ratings):
  40. vector = models.SparseVector(values=[], indices=[])
  41. for movieId, rating in ratings.items():
  42. vector.values.append(rating)
  43. vector.indices.append(movieId)
  44. return vector
  45. user_vector = to_vector(my_ratings)
  46. results = qdrant.search(
  47. "movielens",
  48. query_vector=models.NamedSparseVector(name="ratings", vector=user_vector),
  49. with_vectors=True,
  50. limit=20,
  51. )
  52. movie_scores = defaultdict(lambda: 0)
  53. for user in results:
  54. user_scores = user.vector["ratings"]
  55. for idx, rating in zip(user_scores.indices, user_scores.values):
  56. if idx in my_ratings:
  57. continue
  58. movie_scores[idx] += rating
  59. top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
  60. recommended_movies = [movies[movies.movieId == movieId].title.values[0] for movieId, score in top_movies[:5]]
  61. return recommended_movies