rms.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. import pandas as pd
  2. from qdrant_client import QdrantClient, models
  3. from collections import defaultdict
  4. import numpy as np
  5. # Load the data
  6. tags = pd.read_csv("./data/ml-latest-small/tags.csv")
  7. movies = pd.read_csv("./data/ml-latest-small/movies.csv")
  8. ratings = pd.read_csv("./data/ml-latest-small/ratings.csv")
  9. # Initialize Qdrant client and create collections
  10. def init_qdrant():
  11. qdrant = QdrantClient(":memory:") # Use in-memory for simplicity
  12. qdrant.create_collection(
  13. "movielens", vectors_config={}, sparse_vectors_config={"ratings": models.SparseVectorParams()}
  14. )
  15. return qdrant
  16. # Load data and upload to Qdrant
  17. def load_data(qdrant):
  18. # Normalize ratings
  19. ratings['normalized_rating'] = (ratings.rating - ratings.rating.mean(axis=0)) / ratings.rating.std()
  20. # Sparse vector preparation
  21. user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
  22. for row in ratings.itertuples():
  23. user_sparse_vectors[row.userId]["values"].append(row.normalized_rating)
  24. user_sparse_vectors[row.userId]["indices"].append(row.movieId)
  25. # Calculate total number of key-value pairs and number of users
  26. total_key_value_pairs = sum(len(v['values']) for v in user_sparse_vectors.values())
  27. total_users = len(user_sparse_vectors)
  28. print(f"Total number of users: {total_users}")
  29. print(f"Total number of key-value pairs: {total_key_value_pairs}")
  30. # Upload data
  31. def data_generator():
  32. for user_id, vector in user_sparse_vectors.items():
  33. yield models.PointStruct(
  34. id=user_id, vector={"ratings": vector}, payload={}
  35. )
  36. qdrant.upload_points("movielens", data_generator())
  37. # Function to retrieve and print vectors from Qdrant
  38. def print_uploaded_vectors(qdrant):
  39. # Retrieve all points from the collection
  40. results = qdrant.scroll(collection_name="movielens", limit=10, with_vectors=True)
  41. print("Uploaded sparse vectors and their shapes from Qdrant:")
  42. for result in results[0]:
  43. user_id = result.id
  44. user_vector = result.vector["ratings"]
  45. # Accessing the values and indices directly from the SparseVector object
  46. vector_shape = (len(user_vector.values), len(user_vector.indices))
  47. print(f"User ID: {user_id}, Sparse Vector Shape: {vector_shape}, Sparse Vector: {{'values': {user_vector.values}, 'indices': {user_vector.indices}}}")
  48. # Function to input and normalize ratings
  49. def input_ratings(movies, ratings):
  50. print("Enter ratings for the movies (scale 0 to 5, e.g., 'Black Panther, 5'):")
  51. print("Type 'done' when you are finished.")
  52. final_ratings = {}
  53. mean_rating = ratings.rating.mean()
  54. std_rating = ratings.rating.std()
  55. while True:
  56. entry = input("Enter movie and rating or type 'done' to finish: ")
  57. if entry.lower() == "done":
  58. break
  59. try:
  60. movie_name, user_rating = entry.rsplit(",", 1)
  61. user_rating = float(user_rating.strip())
  62. movie_id = movies[movies.title.str.contains(movie_name.strip(), case=False)].movieId.iloc[0]
  63. # Normalize the user's rating
  64. normalized_input_rating = (user_rating - mean_rating) / std_rating
  65. print('normalized rating:', normalized_input_rating)
  66. final_ratings[movie_id] = normalized_input_rating
  67. except IndexError:
  68. print(f"Movie '{movie_name.strip()}' not found in the database. Please try again.")
  69. except ValueError:
  70. print("Please enter a valid rating between 0 and 5.")
  71. return final_ratings
  72. # Search and recommendation function
  73. def recommend_movies(qdrant, movies, my_ratings):
  74. def to_vector(ratings):
  75. vector = models.SparseVector(values=[], indices=[])
  76. for movieId, rating in ratings.items():
  77. vector.values.append(rating)
  78. vector.indices.append(movieId)
  79. return vector
  80. user_vector = to_vector(my_ratings)
  81. results = qdrant.search(
  82. "movielens",
  83. query_vector=models.NamedSparseVector(name="ratings", vector=user_vector),
  84. with_vectors=True,
  85. limit=20,
  86. )
  87. results = np.array(results)
  88. print(results.shape)
  89. #print(results)
  90. movie_scores = defaultdict(lambda: 0)
  91. for user in results:
  92. user_scores = user.vector["ratings"]
  93. for idx, rating in zip(user_scores.indices, user_scores.values):
  94. if idx in my_ratings:
  95. continue
  96. movie_scores[idx] += rating
  97. top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)
  98. print("Recommended movies for you:")
  99. for movieId, score in top_movies[:5]:
  100. print(movies[movies.movieId == movieId].title.values[0], score)
  101. # Main execution
  102. if __name__ == "__main__":
  103. qdrant = init_qdrant()
  104. load_data(qdrant) # This only uploads data, no return needed
  105. print_uploaded_vectors(qdrant)
  106. my_ratings = input_ratings(movies, ratings)
  107. recommend_movies(qdrant, movies, my_ratings)