From c11c1e17d387b7c27c6add3aa1a99c821aaf4805 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 9 Oct 2024 16:46:41 -0700 Subject: [PATCH 01/12] wip: first pass at working collaborative filtering in redisvl --- .../collaborative_filtering.ipynb | 1095 +++++++++++++++++ .../collaborative_filtering_schema.yaml | 40 + .../recommendation-systems/user_schema.yaml | 18 + 3 files changed, 1153 insertions(+) create mode 100644 python-recipes/recommendation-systems/collaborative_filtering.ipynb create mode 100644 python-recipes/recommendation-systems/collaborative_filtering_schema.yaml create mode 100644 python-recipes/recommendation-systems/user_schema.yaml diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb new file mode 100644 index 00000000..26a4de60 --- /dev/null +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -0,0 +1,1095 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Collaborative Filtering in RedisVL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recommendation systems are a common application of machine learning and serve many industries from e-commerce to music streaming platforms.\n", + "\n", + "There are many different architectures that can be followed to build a recommender system. \n", + "\n", + "In this notebook we'll demonstrate how to build a [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)\n", + "recommender and use the movies dataset as our example data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install scikit-surprise --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "## IMPORTS\n", + "import os\n", + "import requests\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from surprise import SVD\n", + "from surprise import Dataset, Reader\n", + "from surprise.model_selection import train_test_split\n", + "\n", + "\n", + "# Replace values below with your own if using Redis Cloud instance\n", + "REDIS_HOST = os.getenv(\"REDIS_HOST\", \"localhost\") # ex: \"redis-18374.c253.us-central1-1.gce.cloud.redislabs.com\"\n", + "REDIS_PORT = os.getenv(\"REDIS_PORT\", \"6379\") # ex: 18374\n", + "REDIS_PASSWORD = os.getenv(\"REDIS_PASSWORD\", \"\") # ex: \"1TNxTEdYRDgIDKM2gDfasupCADXXXX\"\n", + "\n", + "# If SSL is enabled on the endpoint, use rediss:// as the URL prefix\n", + "REDIS_URL = f\"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "## EVALUATE MOVE TO COLLABORATIVE FILTERING SO WE CAN SHOW BETTER NUMBERS\n", + "#let's see how well this works. we can choose some users, and based on their first watched movie we can recommend them some more.\n", + "#we can then look at the set intersection between our recommendations and the movies they actually watched (and rated highly) to see how well we did." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "## DONE\n", + "# clean up your index\n", + "\n", + "#while remaining := index.clear():\n", + "# print(f\"Deleted {remaining} keys\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# YOLO FTW" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To build a collaborative filtering example using the Surprise library and the Movies dataset, we need to first load the data, format it according to the requirements of Surprise, and then apply a collaborative filtering algorithm like SVD.\n", + "\n", + "Since you mentioned a modified version of the dataset hosted on Kaggle, I’ll show you how to structure the code, assuming you have the dataset ready.\n", + "\n", + "Here’s an example:\n", + "\n", + "Step-by-Step Guide\n", + "Install necessary libraries: Ensure you have installed the Surprise library if you haven’t already." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading and Preparing the Data: Let’s assume the dataset contains at least two relevant files: ratings.csv (user, movie, rating) and movies.csv (movieId, title).\n", + "\n", + "You’ll need to load the ratings data and prepare it for use with Surprise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def fetch_dataframe(file_name):\n", + " try:\n", + " df = pd.read_csv('datasets/collaborative_filtering/' + file_name)\n", + " except:\n", + " url = 'https://redis-ai-resources.s3.us-east-2.amazonaws.com/recommenders/datasets/collaborative-filtering/'\n", + " r = requests.get(url + file_name)\n", + " if not os.path.exists('datasets/collaborative_filtering'):\n", + " os.makedirs('datasets/collaborative_filtering')\n", + " with open('datasets/collaborative_filtering/' + file_name, 'wb') as f:\n", + " f.write(r.content)\n", + " df = pd.read_csv('datasets/collaborative_filtering/' + file_name)\n", + " return df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "ratings_file = 'ratings_small.csv'\n", + "\n", + "ratings_df = fetch_dataframe(ratings_file)\n", + "\n", + "# only keep the columns we need: userId, movieId, rating\n", + "ratings_df = ratings_df[['userId', 'movieId', 'rating']]\n", + "\n", + "reader = Reader(rating_scale=(0.0, 5.0))\n", + "\n", + "ratings_data = Dataset.load_from_df(ratings_df, reader)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training Our Model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# split the data into training and testing sets (80% train, 20% test)\n", + "train_set, test_set = train_test_split(ratings_data, test_size=0.2)\n", + "\n", + "# use SVD (Singular Value Decomposition) for collaborative filtering\n", + "svd_algo = SVD(biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", + "\n", + "# train the algorithm on the train_set\n", + 
"svd_algo.fit(train_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A lot happened in the cell above. We split our full data into train and test sets. We defined the collaborative filtering algorithm to use, which in this case is the Singular Value Decomposition (SVD) algorithm. Lastly, we fit our model to our data.\n", + "\n", + "It's worth going into more detail why we chose this algorithm and what it is computing in the `.fit(train_set)` method we're calling.\n", + "First, let's think about what data it's receiving - our ratings data. This only contains the user_ids, movie_ids, and the user's ratings of their watched movies on a scale of 1 to 5.\n", + "\n", + "We can put this data into a matrix with rows being users and columns being movies:\n", + "\n", + "| RATINGS| movie_1 | movie_2 | movie_3 | movie_4 | movie_5 | movie_6 | ....... |\n", + "| ----- | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |\n", + "| user_1 | 4 | 1 | | 4 | | 5 | |\n", + "| user_2 | | 5 | 5 | 2 | 1 | | |\n", + "| user_3 | | | | | 1 | | |\n", + "| user_4 | 4 | 1 | | 4 | | ? | |\n", + "| user_5 | | 4 | 5 | 2 | | | |\n", + "| ...... | | | | | | | |\n", + "\n", + "Our empty cells aren't zeros; they're missing ratings, so `user_1` has never rated `movie_3`. They may like it or hate it." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Unlike Content Filtering, here we're only considering the ratings that users assign. We don't know the plot or genre or release year of any of these films.\n", + "But we can still build a recommender by assuming that users have similar tastes to each other. As an intuitive example, we can see that `user_1` and `user_4` have very similar ratings on several movies, so we can assume that `user_4` will rate `movie_6` highly, just as `user_1` did. This is the idea behind collaborative filtering." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's the idea, but what about the math? Since we only have this matrix to work with, what we want to do is decompose it into two constituent matrices.\n", + "Let's call our ratings matrix `[R]`. We want to find two other matrices, a user matrix `[U]`, and a movies matrix `[M]` that fit the equation:\n", + "\n", + "`[U] * [M] = [R]`\n", + "\n", + "`[U]` will look like:\n", + "|user_1_feature_1 | user_1_feature_2 | user_1_feature_3 | user_1_feature_4 | ... | user_1_feature_k |\n", + "| ----- | --------- | --------- | --------- | --- | --------- |\n", + "|user_2_feature_1 | user_2_feature_2 | user_2_feature_3 | user_2_feature_4 | ... | user_2_feature_k |\n", + "|user_3_feature_1 | user_3_feature_2 | user_3_feature_3 | user_3_feature_4 | ... | user_3_feature_k |\n", + "| ... | . | . | . | ... | . |\n", + "|user_N_feature_1 | user_N_feature_2 | user_N_feature_3 | user_N_feature_4 | ... | user_N_feature_k |\n", + "\n", + "`[M]` will look like:\n", + "\n", + "| movie_1_feature_1 | movie_2_feature_1 | movie_3_feature_1 | ... | movie_M_feature_1 |\n", + "| --- | --- | --- | --- | --- |\n", + "| movie_1_feature_2 | movie_2_feature_2 | movie_3_feature_2 | ... | movie_M_feature_2 |\n", + "| movie_1_feature_3 | movie_2_feature_3 | movie_3_feature_3 | ... | movie_M_feature_3 |\n", + "| movie_1_feature_4 | movie_2_feature_4 | movie_3_feature_4 | ... | movie_M_feature_4 |\n", + "| ... | . | . | ... | . |\n", + "| movie_1_feature_k | movie_2_feature_k | movie_3_feature_k | ... | movie_M_feature_k |\n", + "\n", + "\n", + "These features are called the latent features and are the values we're trying to find when we call the `.fit(train_set)` method. The algorithm that computes these features from our ratings matrix is the SVD algorithm." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explanation:\n", + "Dataset Preparation: We load the ratings data and ensure it has the necessary format with userId, movieId, and rating columns.\n", + "Surprise Reader: This helps in specifying the format of the data.\n", + "SVD Algorithm: We use the SVD algorithm for collaborative filtering. It decomposes the user-item interaction matrix into the latent factors.\n", + "Accuracy: After training the model, we evaluate it using the RMSE (Root Mean Squared Error).\n", + "Next Steps:\n", + "You can experiment with different algorithms such as KNNBasic or NMF in the Surprise library.\n", + "If your dataset contains titles, you can join movies.csv to display movie names in recommendations.\n", + "Would you like more details on dataset preprocessing or any specific functionality in collaborative filtering?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To extract user and item (movie) vectors from an SVD model trained using Surprise and then store them in a Redis vector database, you'll need to:\n", + "\n", + "Extract the learned latent factors (user and item vectors) from the SVD model.\n", + "Use Redis-py (Python client for Redis) to store those vectors in Redis as vectors in a vector database (assuming you are using Redis with the RediSearch module).\n", + "Here’s how to do it:\n", + "\n", + "Step 1: Extract User and Item Vectors from the SVD Model\n", + "The Surprise SVD model stores user and item vectors (latent factors) in two attributes:\n", + "\n", + "`svd_algo.pu`: user factors matrix (a matrix where each row corresponds to the latent factors of a user).\n", + "`svd_algo.qi`: item factors matrix (a matrix where each row corresponds to the latent factors of an item/movie).\n", + "These matrices store the vectors in the latent space after training.\n", + "\n", + "Step 2: Save the Vectors in Redis\n", + "Redis stores vectors in vector databases, such as 
Redis' HNSW index for vector similarity search. You can store both user and movie vectors as hashes in Redis and then use them for similarity search or recommendations.\n", + "\n", + "Install Redis and Redis-py\n", + "Make sure you have Redis installed with vector support (RediSearch or RedisVL), and install the Redis-py package:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(671, 100)\n", + "(8405, 100)\n" + ] + } + ], + "source": [ + "# step 1: extract vectors\n", + "user_vectors = svd_algo.pu # user latent features (matrix)\n", + "movie_vectors = svd_algo.qi # movie latent features (matrix)\n", + "\n", + "print(user_vectors.shape)\n", + "print(movie_vectors.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explanation:\n", + "Extract Vectors:\n", + "\n", + "algo.pu gives you a matrix where each row corresponds to a user’s latent factors (user vector).\n", + "algo.qi gives you a matrix where each row corresponds to an item/movie’s latent factors (item vector).\n", + "Store in Redis:\n", + "\n", + "We store each vector under a unique Redis key (e.g., user:123, item:456).\n", + "The vector is stored as a hash in Redis with each dimension (dim_0, dim_1, etc.) 
being a field in the hash.\n", + "Step 3: Advanced Storage for Vector Similarity Search\n", + "If you want to store the vectors in a Redis vector search index (e.g., HNSW from RediSearch for vector similarity queries), you would follow the Redis commands for indexing:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user: 347 item: 5515 r_ui = None est = 1.42 {'was_impossible': False}\n", + "1.4150893670982523\n" + ] + }, + { + "data": { + "text/plain": [ + "Prediction(uid=347, iid=5515, r_ui=None, est=1.4150893670982523, details={'was_impossible': False})" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(svd_algo.predict(347, 5515))\n", + "\n", + "inner_uid = train_set.to_inner_uid(347)\n", + "inner_iid = train_set.to_inner_iid(5515)\n", + "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surprise casts userId and movieId to inner ids\n", + "svd_algo.predict(347, 5515)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "While our collaborative filtering algorithm was trained solely on users' ratings of movies, and doesn't require any data about the movies themselves - like the title, genres, or release year - we'll want that information stored as metadata.\n", + "\n", + "We can grab this data from our `movies_metadata.csv` file, clean it, and join it to our user ratings via the `movieId` column." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
belongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverviewpopularity...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ...21.946943...1995-10-3037355403381.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415
1NaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha...17.015539...1995-12-15262797249104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413
2{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be...11.712900...1995-12-220101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592
3NaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom...3.859495...1995-12-2281452156127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134
4{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ...8.387519...1995-02-1076578911106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173
\n", + "

5 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " belongs_to_collection budget \\\n", + "0 {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", + "1 NaN 65000000 \n", + "2 {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", + "3 NaN 16000000 \n", + "4 {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", + "\n", + " genres \\\n", + "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", + "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", + "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", + "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", + "4 [{'id': 35, 'name': 'Comedy'}] \n", + "\n", + " homepage id imdb_id original_language \\\n", + "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n", + "1 NaN 8844 tt0113497 en \n", + "2 NaN 15602 tt0113228 en \n", + "3 NaN 31357 tt0114885 en \n", + "4 NaN 11862 tt0113041 en \n", + "\n", + " original_title \\\n", + "0 Toy Story \n", + "1 Jumanji \n", + "2 Grumpier Old Men \n", + "3 Waiting to Exhale \n", + "4 Father of the Bride Part II \n", + "\n", + " overview popularity ... \\\n", + "0 Led by Woody, Andy's toys live happily in his ... 21.946943 ... \n", + "1 When siblings Judy and Peter discover an encha... 17.015539 ... \n", + "2 A family wedding reignites the ancient feud be... 11.712900 ... \n", + "3 Cheated on, mistreated and stepped on, the wom... 3.859495 ... \n", + "4 Just when George Banks has recovered from his ... 8.387519 ... \n", + "\n", + " release_date revenue runtime \\\n", + "0 1995-10-30 373554033 81.0 \n", + "1 1995-12-15 262797249 104.0 \n", + "2 1995-12-22 0 101.0 \n", + "3 1995-12-22 81452156 127.0 \n", + "4 1995-02-10 76578911 106.0 \n", + "\n", + " spoken_languages status \\\n", + "0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", + "1 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... 
Released \n", + "2 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", + "3 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", + "4 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", + "\n", + " tagline \\\n", + "0 NaN \n", + "1 Roll the dice and unleash the excitement! \n", + "2 Still Yelling. Still Fighting. Still Ready for... \n", + "3 Friends are the people who let you be yourself... \n", + "4 Just When His World Is Back To Normal... He's ... \n", + "\n", + " title video vote_average vote_count \n", + "0 Toy Story False 7.7 5415 \n", + "1 Jumanji False 6.9 2413 \n", + "2 Grumpier Old Men False 6.5 92 \n", + "3 Waiting to Exhale False 6.1 34 \n", + "4 Father of the Bride Part II False 5.7 173 \n", + "\n", + "[5 rows x 23 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "movies_df = fetch_dataframe('movies_metadata.csv')\n", + "movies_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "budget 0\n", + "genres 0\n", + "id 0\n", + "imdb_id 0\n", + "original_language 0\n", + "overview 0\n", + "popularity 0\n", + "release_date 0\n", + "revenue 0\n", + "runtime 0\n", + "status 0\n", + "tagline 0\n", + "title 0\n", + "vote_average 0\n", + "vote_count 0\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "import datetime\n", + "movies_df.drop(columns=['homepage', 'production_countries', 'production_companies', 'spoken_languages', 'video', 'original_title', 'video', 'poster_path', 'belongs_to_collection'], inplace=True)\n", + "\n", + "# drop rows that have missing values\n", + "movies_df.dropna(subset=['imdb_id'], inplace=True)\n", + "\n", + "movies_df['original_language'] = movies_df['original_language'].fillna('unknown')\n", + "movies_df['overview'] = movies_df['overview'].fillna('')\n", + "movies_df['popularity'] 
= movies_df['popularity'].fillna(0)\n", + "movies_df['release_date'] = movies_df['release_date'].fillna('1900-01-01').apply(lambda x: datetime.datetime.strptime(x, \"%Y-%m-%d\").timestamp())\n", + "movies_df['revenue'] = movies_df['revenue'].fillna(0) # fill with average?\n", + "movies_df['runtime'] = movies_df['runtime'].fillna(0) # fill with average?\n", + "movies_df['status'] = movies_df['status'].fillna('unknown')\n", + "movies_df['tagline'] = movies_df['tagline'].fillna('')\n", + "movies_df['title'] = movies_df['title'].fillna('')\n", + "movies_df['vote_average'] = movies_df['vote_average'].fillna(0)\n", + "movies_df['vote_count'] = movies_df['vote_count'].fillna(0)\n", + "movies_df['genres'] = movies_df['genres'].apply(lambda x: [g['name'] for g in eval(x)] if x != '' else []) # convert to a list of genre names\n", + "movies_df['imdb_id'] = movies_df['imdb_id'].apply(lambda x: x[2:] if str(x).startswith('tt') else x).astype(int) # remove leading 'tt' from imdb_id\n", + "\n", + "# make sure we've filled all missing values\n", + "movies_df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll eventually have to map these movies to their ratings, which we'll do so with the `links.csv` file that matches `movieId`, `imdbId`, and `tmdbId`.\n", + "\n", + "Let's do that now." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
budgetgenresidimdb_idoriginal_languageoverviewpopularityrelease_daterevenueruntimestatustaglinetitlevote_averagevote_countmovieIdimdbIdtmdbId
030000000[Animation, Comedy, Family]862114709enLed by Woody, Andy's toys live happily in his ...21.946943815040000.037355403381.0ReleasedToy Story7.754151114709862.0
165000000[Adventure, Fantasy, Family]8844113497enWhen siblings Judy and Peter discover an encha...17.015539819014400.0262797249104.0ReleasedRoll the dice and unleash the excitement!Jumanji6.9241321134978844.0
20[Romance, Comedy]15602113228enA family wedding reignites the ancient feud be...11.712900819619200.00101.0ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old Men6.592311322815602.0
316000000[Comedy, Drama, Romance]31357114885enCheated on, mistreated and stepped on, the wom...3.859495819619200.081452156127.0ReleasedFriends are the people who let you be yourself...Waiting to Exhale6.134411488531357.0
40[Comedy]11862113041enJust when George Banks has recovered from his ...8.387519792403200.076578911106.0ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part II5.7173511304111862.0
\n", + "
" + ], + "text/plain": [ + " budget genres id imdb_id original_language \\\n", + "0 30000000 [Animation, Comedy, Family] 862 114709 en \n", + "1 65000000 [Adventure, Fantasy, Family] 8844 113497 en \n", + "2 0 [Romance, Comedy] 15602 113228 en \n", + "3 16000000 [Comedy, Drama, Romance] 31357 114885 en \n", + "4 0 [Comedy] 11862 113041 en \n", + "\n", + " overview popularity \\\n", + "0 Led by Woody, Andy's toys live happily in his ... 21.946943 \n", + "1 When siblings Judy and Peter discover an encha... 17.015539 \n", + "2 A family wedding reignites the ancient feud be... 11.712900 \n", + "3 Cheated on, mistreated and stepped on, the wom... 3.859495 \n", + "4 Just when George Banks has recovered from his ... 8.387519 \n", + "\n", + " release_date revenue runtime status \\\n", + "0 815040000.0 373554033 81.0 Released \n", + "1 819014400.0 262797249 104.0 Released \n", + "2 819619200.0 0 101.0 Released \n", + "3 819619200.0 81452156 127.0 Released \n", + "4 792403200.0 76578911 106.0 Released \n", + "\n", + " tagline \\\n", + "0 \n", + "1 Roll the dice and unleash the excitement! \n", + "2 Still Yelling. Still Fighting. Still Ready for... \n", + "3 Friends are the people who let you be yourself... \n", + "4 Just When His World Is Back To Normal... He's ... 
\n", + "\n", + " title vote_average vote_count movieId imdbId \\\n", + "0 Toy Story 7.7 5415 1 114709 \n", + "1 Jumanji 6.9 2413 2 113497 \n", + "2 Grumpier Old Men 6.5 92 3 113228 \n", + "3 Waiting to Exhale 6.1 34 4 114885 \n", + "4 Father of the Bride Part II 5.7 173 5 113041 \n", + "\n", + " tmdbId \n", + "0 862.0 \n", + "1 8844.0 \n", + "2 15602.0 \n", + "3 31357.0 \n", + "4 11862.0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "links_df = fetch_dataframe('links_small.csv')\n", + "movies_df = movies_df.merge(links_df, left_on='imdb_id', right_on='imdbId', how='inner')\n", + "movies_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll want to move our SVD user vectors and movie vectors and their corresponding userId and movieId into 2 dataframes for later processing." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# build a dataframe out of the user vectors and their userIds\n", + "user_vectors_and_ids = {train_set.to_raw_uid(inner_id): user_vectors[inner_id].tolist() for inner_id in train_set.all_users()}\n", + "user_vector_df = pd.Series(user_vectors_and_ids).to_frame('user_vector')\n", + "\n", + "# now do the same for the movie vectors and their movieIds\n", + "movie_vectors_and_ids = {train_set.to_raw_iid(inner_id): movie_vectors[inner_id].tolist() for inner_id in train_set.all_items()}\n", + "movie_vector_df = pd.Series(movie_vectors_and_ids).to_frame('movie_vector')\n", + "\n", + "# merge the movie vector series with the movies dataframe using the movieId and id fields\n", + "movies_df = movies_df.merge(movie_vector_df, left_on='id', right_index=True, how='inner')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Querying Vectors\n", + "You can use Redis’ vector similarity search to find the most similar vectors once they’re stored.\n", + "\n", + "\n", 
+ "\n", + "Once you've stored your vectors in Redis, querying for vector similarity becomes straightforward, especially if you're using RediSearch with vector support (such as HNSW). I'll guide you through setting up and querying for vector similarity.\n", + "\n", + "Query Setup\n", + "We'll assume:\n", + "\n", + "You've already created a vector index using the HNSW algorithm (or another vector indexing mechanism).\n", + "You've stored your user or item vectors in Redis, either as fields in a Redis hash or as direct vector fields for vector similarity searches.\n", + "Step-by-Step Guide for Querying Vector Similarity\n", + "1. Create a Vector Index (If not already created)\n", + "Before you can perform similarity queries, you need to create a vector index using the FT.CREATE command. This defines how vectors are indexed in Redis.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from redis import Redis\n", + "from redisvl.schema import IndexSchema\n", + "from redisvl.index import SearchIndex\n", + "\n", + "client = Redis.from_url(REDIS_URL)\n", + "\n", + "movie_schema = IndexSchema.from_yaml(\"collaborative_filtering_schema.yaml\")\n", + "\n", + "movie_index = SearchIndex(movie_schema, redis_client=client)\n", + "movie_index.create(overwrite=True, drop=True)\n", + "\n", + "user_schema = IndexSchema.from_yaml(\"user_schema.yaml\")\n", + "\n", + "user_index = SearchIndex(user_schema, redis_client=client)\n", + "user_index.create(overwrite=True, drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "keys = movie_index.load(movies_df.to_dict(orient='records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1494\n", + "28386\n", + "1494\n", + "1482\n", + "9065\n" + ] + } + ], + "source": [ + "number_of_movies = 
len(movies_df.to_dict(orient='records'))\n", + "size_of_movie_df = movies_df.size\n", + "\n", + "print(number_of_movies)\n", + "print(size_of_movie_df)\n", + "unique_movie_ids = movies_df['id'].nunique()\n", + "print(unique_movie_ids)\n", + "unique_movie_titles = movies_df['title'].nunique()\n", + "print(unique_movie_titles)\n", + "\n", + "unique_movies_rated = ratings_df['movieId'].nunique()\n", + "print(unique_movies_rated)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Unlike in content filtering, where we want to compute vector similarity between items and we use cosine distance between items vectors to do so, in collaborative filtering we instead try to compute the predicted rating a user will give to a movie by taking the inner product of the user and movie vector.\n", + "\n", + "This is why in our `collaborative_filtering_schema.yaml` we use `ip` (inner product) as our distance metric.\n", + "\n", + "It's also why we'll use our user vector as the query vector when we do a vector query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'id': 'movie:9df0babc731549909e929885973aee58', 'vector_distance': '-4.31072711945', 'title': 'The Million Dollar Hotel', 'genres': '[\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:5d7079fac9534a0585608c9b0d01ba80', 'vector_distance': '-4.11799812317', 'title': \"Pandora's Box\", 'genres': '[\"Drama\",\"Thriller\",\"Romance\"]'}\n", + "{'id': 'movie:120d744065394c499f76589586051337', 'vector_distance': '-4.10946702957', 'title': \"Monsieur Hulot's Holiday\", 'genres': '[\"Comedy\",\"Family\"]'}\n", + "{'id': 'movie:ae34e2e4c64147c994f3c5c3c29f5190', 'vector_distance': '-4.01828145981', 'title': 'Scarface', 'genres': '[\"Action\",\"Crime\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:a1506eff623a4ba0a165a850e7250d3e', 'vector_distance': '-4.00052165985', 'title': 'The Thomas Crown Affair', 'genres': '[\"Romance\",\"Crime\",\"Thriller\",\"Drama\"]'}\n", + "{'id': 'movie:3dbfc6b11f374d649663c0365792a3d2', 'vector_distance': '-3.99500846863', 'title': 'Dead Man', 'genres': '[\"Drama\",\"Fantasy\",\"Western\"]'}\n", + "{'id': 'movie:4df54c2a6f674b85ab12ced9b6eddf3e', 'vector_distance': '-3.99404859543', 'title': 'True Romance', 'genres': '[\"Action\",\"Thriller\",\"Crime\",\"Romance\"]'}\n", + "{'id': 'movie:fef75d040f864cc4abeda72c3d7830b6', 'vector_distance': '-3.98610448837', 'title': 'Sunshine', 'genres': '[\"Science Fiction\",\"Thriller\"]'}\n", + "{'id': 'movie:e6b996ce1db7497da1595b1ffeba66c9', 'vector_distance': '-3.94609308243', 'title': 'The Sixth Sense', 'genres': '[\"Mystery\",\"Thriller\",\"Drama\"]'}\n", + "{'id': 'movie:38c0ee9c438242ed8347afefc2469ca5', 'vector_distance': '-3.94594812393', 'title': 'Zatoichi', 'genres': '[\"Adventure\",\"Drama\",\"Action\"]'}\n", + "{'id': 'movie:e6345ce2592a4807ae5929094f8871db', 'vector_distance': '-3.94536495209', 'title': 'Straw Dogs', 'genres': 
'[\"Crime\",\"Drama\",\"Thriller\",\"Mystery\"]'}\n", + "{'id': 'movie:679bbb2708e847dc8730a5bf87d10b95', 'vector_distance': '-3.93847608566', 'title': 'While You Were Sleeping', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:d88f066c1c1e4eedb4bbcad4be8e85cd', 'vector_distance': '-3.92860937119', 'title': 'Cold Mountain', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:67fa2af9660549a0bab428e2cb4c5801', 'vector_distance': '-3.92834091187', 'title': 'The Good Shepherd', 'genres': '[\"Drama\",\"Thriller\",\"History\"]'}\n", + "{'id': 'movie:5bc1a44fdea4421880acc9a050bdf8be', 'vector_distance': '-3.90582227707', 'title': \"Charlie's Angels\", 'genres': '[\"Action\",\"Adventure\",\"Comedy\",\"Crime\",\"Thriller\"]'}\n", + "{'id': 'movie:f1459b204f054f8daa7bb03e349d4bc1', 'vector_distance': '-3.90310573578', 'title': 'Gremlins 2: The New Batch', 'genres': '[\"Comedy\",\"Horror\",\"Fantasy\"]'}\n", + "{'id': 'movie:d0d08cb2caa44b42a3f21af5687fb7dc', 'vector_distance': '-3.90276098251', 'title': 'Ghost Rider', 'genres': '[\"Thriller\",\"Action\",\"Fantasy\",\"Horror\"]'}\n", + "{'id': 'movie:0c27eb9238744640a318c57104df4ddb', 'vector_distance': '-3.90011119843', 'title': 'Once Were Warriors', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:e0a34f0930fb44d5857dc8d75b05b985', 'vector_distance': '-3.89683389664', 'title': 'Hard Target', 'genres': '[\"Action\",\"Adventure\",\"Crime\",\"Thriller\"]'}\n", + "{'id': 'movie:0facbd81368840769a3b49dcbd479472', 'vector_distance': '-3.89002752304', 'title': 'Space Jam', 'genres': '[\"Animation\",\"Comedy\",\"Drama\",\"Family\",\"Fantasy\"]'}\n" + ] + } + ], + "source": [ + "from redisvl.query import RangeQuery, FilterQuery\n", + "from redisvl.query.filter import Tag, Num, Text\n", + "\n", + "user_1_vector = user_vectors[20].tolist()\n", + "\n", + "# the distance metric 'ip' inner product is computing \"score = 1 - u * v\" and returning the minimum, which corresponds to the max of \"u * v\"\n", + "# this is what we 
want. The predicted rating on a scale of 0 to 5 is then -(score - 1) == -score + 1\n", + "query = RangeQuery(vector=user_1_vector,\n", + " vector_field_name='movie_vector',\n", + " num_results=20,\n", + " return_score=True,\n", + " return_fields=['title', 'genres']\n", + " )\n", + "\n", + "results = movie_index.query(query)\n", + "\n", + "for r in results:\n", + " print(r)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "redis-ai-res", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml new file mode 100644 index 00000000..f10d686b --- /dev/null +++ b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml @@ -0,0 +1,40 @@ +index: + name: movies + prefix: movie + storage_type: json + +fields: + - name: genres + type: tag + - name: movie_id + type: tag + - name: original_language + type: tag + - name: overview + type: text + - name: popularity + type: numeric + - name: release_date + type: numeric + - name: revenue + type: numeric + - name: runtime + type: numeric + - name: status + type: tag + - name: tagline + type: text + - name: title + type: text + - name: vote_average + type: numeric + - name: vote_count + type: numeric + + - name: movie_vector + type: vector + attrs: + dims: 100 + distance_metric: ip + algorithm: flat + dtype: float32 \ No newline at end of file diff --git a/python-recipes/recommendation-systems/user_schema.yaml b/python-recipes/recommendation-systems/user_schema.yaml new file mode 100644 index 00000000..95511038 --- /dev/null +++ 
b/python-recipes/recommendation-systems/user_schema.yaml @@ -0,0 +1,18 @@ +index: + name: users + prefix: user + storage_type: json + +fields: + - name: usr_id + type: tag + name: ratings + type: numeric + + - name: user_vector + type: vector + attrs: + dims: 100 + distance_metric: ip + algorithm: flat + dtype: float32 \ No newline at end of file From b254c3391523daefb62b01e5a46fe6af85a849ba Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 9 Oct 2024 17:51:56 -0700 Subject: [PATCH 02/12] cleans up collab filtering notebook. updates user schema --- .../collaborative_filtering.ipynb | 307 ++++++++---------- .../recommendation-systems/user_schema.yaml | 4 +- 2 files changed, 143 insertions(+), 168 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 26a4de60..e4fdfdd3 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -13,10 +13,12 @@ "source": [ "Recommendation systems are a common application of machine learning and serve many industries from e-commerce to music streaming platforms.\n", "\n", - "There are many different architechtures that can be followed to build a recommender system. \n", + "There are many different architechtures that can be followed to build a recommendation system.\n", "\n", - "In this notebook we'll demonstrate how to build a [content filtering](https://en.wikipedia.org/wiki/Recommender_system#:~:text=of%20hybrid%20systems.-,Content%2Dbased%20filtering,-%5Bedit%5D)\n", - "recommender and use the movies dataset as our example data." 
+ "In this notebook we'll demonstrate how to build a [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)\n", + "recommendation system and use the large IMDB movies dataset as our example data.\n", + "\n", + "To generate our vectors we'll use the popular Python package [Surprise](https://surpriselib.com/)" ] }, { @@ -34,7 +36,6 @@ "metadata": {}, "outputs": [], "source": [ - "## IMPORTS\n", "import os\n", "import requests\n", "import pandas as pd\n", @@ -54,59 +55,11 @@ "REDIS_URL = f\"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}\"" ] }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "## EVALUATE MOVE TO COLLABORATIVE FILTERING SO WE CAN SHOW BETTER NUMBERS\n", - "#let's see how well this works. we can choose some users, and based on their first watched movie we can recommend them some more.\n", - "#we can then look at the set intersection between our recommendations and the movies they actually watched (and rated highly) to see how well we did." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "## DONE\n", - "# clean up your index\n", - "\n", - "#while remaining := index.clear():\n", - "# print(f\"Deleted {remaining} keys\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# YOLO FTW" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To build a collaborative filtering example using the Surprise library and the Movies dataset, we need to first load the data, format it according to the requirements of Surprise, and then apply a collaborative filtering algorithm like SVD.\n", - "\n", - "Since you mentioned a modified version of the dataset hosted on Kaggle, I’ll show you how to structure the code, assuming you have the dataset ready.\n", - "\n", - "Here’s an example:\n", - "\n", - "Step-by-Step Guide\n", - "Install necessary libraries: Ensure you have installed the Surprise library if you haven’t already." - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Loading and Preparing the Data: Let’s assume the dataset contains at least two relevant files: ratings.csv (user, movie, rating) and movies.csv (movieId, title).\n", - "\n", - "You’ll need to load the ratings data and prepare it for use with Surprise." + "To build a collaborative filtering example using the Surprise library and the Movies dataset, we need to first load the data, format it according to the requirements of Surprise, and then apply a collaborative filtering algorithm like SVD." 
] }, { @@ -151,41 +104,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Training Our Model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# split the data into training and testing sets (80% train, 20% test)\n", - "train_set, test_set = train_test_split(ratings_data, test_size=0.2)\n", - "\n", - "# use SVD (Singular Value Decomposition) for collaborative filtering\n", - "svd_algo = SVD(biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", - "\n", - "# train the algorithm on the train_set\n", - "svd_algo.fit(train_set)" + "# What is Collaborative Filtering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A lot happened in the cell above. We split our full data into train and test sets. We defined the collaborative filtering algorithm to use, which in this case is the Singular Value Decomposition (SVD) algorithm. lastly, we fit our model to our data.\n", + "A lot is going to happen in the code cell below. We split our full data into train and test sets. We defined the collaborative filtering algorithm to use, which in this case is the Singular Value Decomposition (SVD) algorithm. lastly, we fit our model to our data.\n", "\n", "It's worth going into more detail why we chose this algorithm and what it is computing in the `.fit(train_set)` method we're calling.\n", "First, let's think about what data it's receiving - our ratings data. This only contains the user_ids, movie_ids, and the user's ratings of their watched movies on a scale of 1 to 5.\n", @@ -201,7 +127,7 @@ "| user_5 | | 4 | 5 | 2 | | | |\n", "| ...... | | | | | | | |\n", "\n", - "Our empty cells aren't zero's their missing ratings, so `user_1` has never rated `movie_3`. They may like it or hate it." 
+ "Our empty cells aren't zeros; they're missing ratings, so `user_1` has never rated `movie_3`. They may like it or hate it." ] }, { @@ -209,14 +135,14 @@ "metadata": {}, "source": [ "Unlike Content Filtering, here we're only considering the ratings that users assign. We don't know the plot or genre or release year of any of these films.\n", - "But we can still build recommender by assuming that users have similar tastes to each other. As an intuitive example, we can see that `user_1` and `user_4` have very similar ratings on several movies, so we can assume that `user_4` will rate `movie_6` highly, just as `user_1` did. This is the idea behind collaborative filtering." + "But we can still build recommender by assuming that users have similar tastes to each other. As an intuitive example, we can see that `user_1` and `user_4` have very similar ratings on several movies, so we will assume that `user_4` will rate `movie_6` highly, just as `user_1` did. This is the idea behind collaborative filtering." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's the idea, but what about the math? Since we only have this matrix to work with what we want to do is decompose it into two constituent matrices.\n", + "That's the intuition, but what about the math? Since we only have this matrix to work with what we want to do is decompose it into two constituent matrices.\n", "Lets call our ratings matrix `[R]`. We want to find two other matrices, a user matrix `[U]`, and a movies matrix `[M]` that fit the equation:\n", "\n", "`[U] * [M] = [R]`\n", @@ -240,92 +166,116 @@ "| movie_1_feature_k | movie_2_feature_k | movie_3_feature_k | ... | movie_M_feature_k |\n", "\n", "\n", - "these features are called the latent features and are the values we're trying to find when we call the `.fit(training_data)` method. The algorithm that computes these features from our ratings matrix is the SVD algorithm." 
+ "these features are called the latent features (or latent factors) and are the values we're trying to find when we call the `svd_algo.fit(training_data)` method. The algorithm that computes these features from our ratings matrix is the SVD algorithm. The number of users and movies is set by our data. The size of the latent feature vectors `k` is a parameter we choose. We'll keep it at the default 100 for this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# split the data into training and testing sets (80% train, 20% test)\n", + "train_set, test_set = train_test_split(ratings_data, test_size=0.2)\n", + "\n", + "# use SVD (Singular Value Decomposition) for collaborative filtering\n", + "svd_algo = SVD(n_factors=100, biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", + "\n", + "# train the algorithm on the train_set\n", + "svd_algo.fit(train_set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Explanation:\n", - "Dataset Preparation: We load the ratings data and ensure it has the necessary format with userId, movieId, and rating columns.\n", - "Surprise Reader: This helps in specifying the format of the data.\n", - "SVD Algorithm: We use the SVD algorithm for collaborative filtering. 
It decomposes the user-item interaction matrix into the latent factors.\n", - "Accuracy: After training the model, we evaluate it using the RMSE (Root Mean Squared Error).\n", - "Next Steps:\n", - "You can experiment with different algorithms such as KNNBasic or NMF in the Surprise library.\n", - "If your dataset contains titles, you can join movies.csv to display movie names in recommendations.\n", - "Would you like more details on dataset preprocessing or any specific functionality in collaborative filtering?" + "## Extracting The User and Movie Vectors" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To extract user and item (movie) vectors from an SVD model trained using Surprise and then store them in a Redis vector database, you'll need to:\n", - "\n", - "Extract the learned latent factors (user and item vectors) from the SVD model.\n", - "Use Redis-py (Python client for Redis) to store those vectors in Redis as vectors in a vector database (assuming you are using Redis with the RedisVector or RedisSearch modules).\n", - "Here’s how to do it:\n", - "\n", - "Step 1: Extract User and Item Vectors from the SVD Model\n", - "The Surprise SVD model stores user and item vectors (latent factors) in two attributes:\n", + "Now that the SVD algorithm has computed our `[U]` and `[M]` matrices - which are both really just lists of vectors - we can load them into our Redis instance.\n", "\n", - "algo.pu: user factors matrix (a matrix where each row corresponds to the latent factors of a user).\n", - "algo.qi: item factors matrix (a matrix where each row corresponds to the latent factors of an item/movie).\n", - "These matrices store the vectors in the latent space after training.\n", + "The Surprise SVD model stores user and movie vectors in two attributes:\n", "\n", - "Step 2: Save the Vectors in Redis\n", - "Redis stores vectors in vector databases, such as Redis' HNSW index for vector similarity search. 
You can store both user and movie vectors as hashes in Redis and then use them for similarity search or recommendations.\n", + "`algo.pu`: user features matrix (a matrix where each row corresponds to the latent features of a user).\n", + "`algo.qi`: item features matrix (a matrix where each row corresponds to the latent features of an item/movie).\n", "\n", - "Install Redis and Redis-py\n", - "Make sure you have Redis installed with vector support (RediSearch or RedisVL), and install the Redis-py package:" + "It's worth noting that the matrix `algo.qi` is the transpose of the matrix `[M]` we defined above. This way each row corresponds to one movie" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(671, 100)\n", - "(8405, 100)\n" + "we have 671 users with feature vectors of size 100\n", + "we have 8405 movies with feature vectors of size 100\n" ] } ], "source": [ - "# step 1: extract vectors\n", "user_vectors = svd_algo.pu # user latent features (matrix)\n", "movie_vectors = svd_algo.qi # movie latent features (matrix)\n", "\n", - "print(user_vectors.shape)\n", - "print(movie_vectors.shape)" + "print(f'we have {user_vectors.shape[0]} users with feature vectors of size {user_vectors.shape[1]}')\n", + "print(f'we have {movie_vectors.shape[0]} movies with feature vectors of size {movie_vectors.shape[1]}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Explanation:\n", - "Extract Vectors:\n", + "# Predicting User Ratings\n", + "The great thing about collaborative filtering is that using our user and movie vectors we can predict the rating any user will give to any movie in our dataset.\n", + "And unlike content filtering, there is no assumption that all the movies a user will be recommended are similar to each other. 
A user can be recommended dark horror films and light-hearted animations.\n", "\n", - "algo.pu gives you a matrix where each row corresponds to a user’s latent factors (user vector).\n", - "algo.qi gives you a matrix where each row corresponds to an item/movie’s latent factors (item vector).\n", - "Store in Redis:\n", + "Looking back at our SVD algorithm the equation is [User_features] * [Movie_features].transpose = [Ratings]\n", + "So to get a prediction of what a user will rate a movie they haven't seen yet we just need to take the dot product of that user's feature vector and a movie's feature vector." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the predicted rating of user 347 on movie 5515 is 1.4150893670982523\n" + ] + } + ], + "source": [ + "# predict one user's rating of one film\n", + "# surprise casts userId and movieId to inner ids\n", "\n", - "We store each vector under a unique Redis key (e.g., user:123, item:456).\n", - "The vector is stored as a hash in Redis with each dimension (dim_0, dim_1, etc.) 
being a field in the hash.\n", - "Step 3: Advanced Storage for Vector Similarity Search\n", - "If you want to store the vectors in a Redis vector search index (e.g., HNSW from RedisSearch for vector similarity queries), you would follow the Redis commands for indexing:\n", - "\n" + "inner_uid = train_set.to_inner_uid(347) # user_id\n", + "inner_iid = train_set.to_inner_iid(5515) # movie_id\n", + "print(f'the predicted rating of user {347} on movie {5515} is {np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])}')\n" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -342,17 +292,18 @@ "Prediction(uid=347, iid=5515, r_ui=None, est=1.4150893670982523, details={'was_impossible': False})" ] }, - "execution_count": 9, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# sanity check my math matches Surprise package math\n", "print(svd_algo.predict(347, 5515))\n", "\n", "inner_uid = train_set.to_inner_uid(347)\n", "inner_iid = train_set.to_inner_iid(5515)\n", - "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surpirse casts userId and movieId to inner ids\n", + "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surprise casts userId and movieId to inner ids\n", "svd_algo.predict(347, 5515)" ] }, @@ -924,22 +875,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Querying Vectors\n", - "You can use Redis’ vector similarity search to find the most similar vectors once they’re stored.\n", - "\n", - "\n", - "\n", - "Once you've stored your vectors in Redis, querying for vector similarity becomes straightforward, especially if you're using RediSearch with vector support (such as HNSW). 
I'll guide you through setting up and querying for vector similarity.\n", + "## RedisVL Handles the Scale\n", "\n", - "Query Setup\n", - "We'll assume:\n", - "\n", - "You've already created a vector index using the HNSW algorithm (or another vector indexing mechanism).\n", - "You've stored your user or item vectors in Redis, either as fields in a Redis hash or as direct vector fields for vector similarity searches.\n", - "Step-by-Step Guide for Querying Vector Similarity\n", - "1. Create a Vector Index (If not already created)\n", - "Before you can perform similarity queries, you need to create a vector index using the FT.CREATE command. This defines how vectors are indexed in Redis.\n", - "\n" + "Especially for large datasets like the 45,000 movie catalog we're dealing with, you'll want Redis to do the heavy lifting of vector search.\n", + "All that's needed is to define the search index and load our data we've cleaned and merged with our vectors.\n" ] }, { @@ -992,6 +931,7 @@ } ], "source": [ + "# sanity check I merged all my dataframes properly and have the right sizes of movies, users, vectors, ids, etc.\n", "number_of_movies = len(movies_df.to_dict(orient='records'))\n", "size_of_movie_df = movies_df.size\n", "\n", @@ -1019,33 +959,33 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:9df0babc731549909e929885973aee58', 'vector_distance': '-4.31072711945', 'title': 'The Million Dollar Hotel', 'genres': '[\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:5d7079fac9534a0585608c9b0d01ba80', 'vector_distance': '-4.11799812317', 'title': \"Pandora's Box\", 'genres': '[\"Drama\",\"Thriller\",\"Romance\"]'}\n", - "{'id': 'movie:120d744065394c499f76589586051337', 'vector_distance': '-4.10946702957', 'title': \"Monsieur Hulot's Holiday\", 'genres': '[\"Comedy\",\"Family\"]'}\n", - "{'id': 'movie:ae34e2e4c64147c994f3c5c3c29f5190', 
'vector_distance': '-4.01828145981', 'title': 'Scarface', 'genres': '[\"Action\",\"Crime\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:a1506eff623a4ba0a165a850e7250d3e', 'vector_distance': '-4.00052165985', 'title': 'The Thomas Crown Affair', 'genres': '[\"Romance\",\"Crime\",\"Thriller\",\"Drama\"]'}\n", - "{'id': 'movie:3dbfc6b11f374d649663c0365792a3d2', 'vector_distance': '-3.99500846863', 'title': 'Dead Man', 'genres': '[\"Drama\",\"Fantasy\",\"Western\"]'}\n", - "{'id': 'movie:4df54c2a6f674b85ab12ced9b6eddf3e', 'vector_distance': '-3.99404859543', 'title': 'True Romance', 'genres': '[\"Action\",\"Thriller\",\"Crime\",\"Romance\"]'}\n", - "{'id': 'movie:fef75d040f864cc4abeda72c3d7830b6', 'vector_distance': '-3.98610448837', 'title': 'Sunshine', 'genres': '[\"Science Fiction\",\"Thriller\"]'}\n", - "{'id': 'movie:e6b996ce1db7497da1595b1ffeba66c9', 'vector_distance': '-3.94609308243', 'title': 'The Sixth Sense', 'genres': '[\"Mystery\",\"Thriller\",\"Drama\"]'}\n", - "{'id': 'movie:38c0ee9c438242ed8347afefc2469ca5', 'vector_distance': '-3.94594812393', 'title': 'Zatoichi', 'genres': '[\"Adventure\",\"Drama\",\"Action\"]'}\n", - "{'id': 'movie:e6345ce2592a4807ae5929094f8871db', 'vector_distance': '-3.94536495209', 'title': 'Straw Dogs', 'genres': '[\"Crime\",\"Drama\",\"Thriller\",\"Mystery\"]'}\n", - "{'id': 'movie:679bbb2708e847dc8730a5bf87d10b95', 'vector_distance': '-3.93847608566', 'title': 'While You Were Sleeping', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:d88f066c1c1e4eedb4bbcad4be8e85cd', 'vector_distance': '-3.92860937119', 'title': 'Cold Mountain', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:67fa2af9660549a0bab428e2cb4c5801', 'vector_distance': '-3.92834091187', 'title': 'The Good Shepherd', 'genres': '[\"Drama\",\"Thriller\",\"History\"]'}\n", - "{'id': 'movie:5bc1a44fdea4421880acc9a050bdf8be', 'vector_distance': '-3.90582227707', 'title': \"Charlie's Angels\", 'genres': 
'[\"Action\",\"Adventure\",\"Comedy\",\"Crime\",\"Thriller\"]'}\n", - "{'id': 'movie:f1459b204f054f8daa7bb03e349d4bc1', 'vector_distance': '-3.90310573578', 'title': 'Gremlins 2: The New Batch', 'genres': '[\"Comedy\",\"Horror\",\"Fantasy\"]'}\n", - "{'id': 'movie:d0d08cb2caa44b42a3f21af5687fb7dc', 'vector_distance': '-3.90276098251', 'title': 'Ghost Rider', 'genres': '[\"Thriller\",\"Action\",\"Fantasy\",\"Horror\"]'}\n", - "{'id': 'movie:0c27eb9238744640a318c57104df4ddb', 'vector_distance': '-3.90011119843', 'title': 'Once Were Warriors', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:e0a34f0930fb44d5857dc8d75b05b985', 'vector_distance': '-3.89683389664', 'title': 'Hard Target', 'genres': '[\"Action\",\"Adventure\",\"Crime\",\"Thriller\"]'}\n", - "{'id': 'movie:0facbd81368840769a3b49dcbd479472', 'vector_distance': '-3.89002752304', 'title': 'Space Jam', 'genres': '[\"Animation\",\"Comedy\",\"Drama\",\"Family\",\"Fantasy\"]'}\n" + "{'id': 'movie:9df0babc731549909e929885973aee58', 'vector_distance': '-3.6087179184', 'title': 'The Million Dollar Hotel', 'genres': '[\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:ad9142a30fd045cfbb1e5ae0b051f4c8', 'vector_distance': '-3.56296348572', 'title': 'Terminator 3: Rise of the Machines', 'genres': '[\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", + "{'id': 'movie:bf3354ae719b44c782876272c5f95ce1', 'vector_distance': '-3.52630567551', 'title': 'Beverly Hills Cop III', 'genres': '[\"Action\",\"Comedy\",\"Crime\"]'}\n", + "{'id': 'movie:88ce7f7738104c539b004135f306e9ec', 'vector_distance': '-3.46648168564', 'title': 'Backdraft', 'genres': '[\"Action\",\"Crime\",\"Drama\",\"Mystery\",\"Thriller\"]'}\n", + "{'id': 'movie:9fe2e20d887a4263a540f3945f10751d', 'vector_distance': '-3.40900659561', 'title': 'Boogie Nights', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:9cd57bacb5de437a88279a17c2161ce2', 'vector_distance': '-3.38699388504', 'title': 'The Good Thief', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 
'movie:5d7079fac9534a0585608c9b0d01ba80', 'vector_distance': '-3.3247923851', 'title': \"Pandora's Box\", 'genres': '[\"Drama\",\"Thriller\",\"Romance\"]'}\n", + "{'id': 'movie:fa9cf76285c348078bb9814fa6b9357f', 'vector_distance': '-3.31738758087', 'title': 'Dawn of the Dead', 'genres': '[\"Horror\"]'}\n", + "{'id': 'movie:0c27eb9238744640a318c57104df4ddb', 'vector_distance': '-3.309486866', 'title': 'Once Were Warriors', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:f490c86a71164bf3956d92be5de8ba05', 'vector_distance': '-3.30277919769', 'title': 'Light of Day', 'genres': '[\"Music\",\"Drama\"]'}\n", + "{'id': 'movie:e8ef474819814eaea6cb757449d3eded', 'vector_distance': '-3.28701210022', 'title': 'Beetlejuice', 'genres': '[\"Fantasy\",\"Comedy\"]'}\n", + "{'id': 'movie:0c596d8911e0498a854b4e6d5faae545', 'vector_distance': '-3.27935218811', 'title': 'Enough', 'genres': '[\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:d0d08cb2caa44b42a3f21af5687fb7dc', 'vector_distance': '-3.26998329163', 'title': 'Ghost Rider', 'genres': '[\"Thriller\",\"Action\",\"Fantasy\",\"Horror\"]'}\n", + "{'id': 'movie:b5bcd6dba0474b709ccb3b10c3e2fb14', 'vector_distance': '-3.26271867752', 'title': 'Cousin, Cousine', 'genres': '[\"Romance\",\"Comedy\"]'}\n", + "{'id': 'movie:5726b3476a2d450db8792217298b7b57', 'vector_distance': '-3.25473356247', 'title': \"We're No Angels\", 'genres': '[\"Comedy\",\"Crime\",\"Drama\"]'}\n", + "{'id': 'movie:f1459b204f054f8daa7bb03e349d4bc1', 'vector_distance': '-3.25318956375', 'title': 'Gremlins 2: The New Batch', 'genres': '[\"Comedy\",\"Horror\",\"Fantasy\"]'}\n", + "{'id': 'movie:1cb9f1fb5d3f45a3861607aca03dfd4d', 'vector_distance': '-3.20173215866', 'title': 'Sleepless in Seattle', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:a567595136ef488b90637b77afb10664', 'vector_distance': '-3.19421386719', 'title': 'Point Break', 'genres': '[\"Action\",\"Thriller\",\"Crime\"]'}\n", + "{'id': 'movie:776e7891798048629d2dfa532ace8ff5', 
'vector_distance': '-3.1903834343', 'title': \"My Best Friend's Wedding\", 'genres': '[\"Comedy\",\"Romance\"]'}\n", + "{'id': 'movie:6bcc04c814d24da6926161c9e0c10a76', 'vector_distance': '-3.17692661285', 'title': 'Cool Hand Luke', 'genres': '[\"Crime\",\"Drama\"]'}\n" ] } ], @@ -1053,11 +993,11 @@ "from redisvl.query import RangeQuery, FilterQuery\n", "from redisvl.query.filter import Tag, Num, Text\n", "\n", - "user_1_vector = user_vectors[20].tolist()\n", + "user_vector = user_vectors[352].tolist()\n", "\n", "# the distance metric 'ip' inner product is computing \"score = 1 - u * v\" and returning the minimum, which corresponds to the max of \"u * v\"\n", "# this is what we want. The predicted rating on a scale of 0 to 5 is then -(score - 1) == -score + 1\n", - "query = RangeQuery(vector=user_1_vector,\n", + "query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", " num_results=20,\n", " return_score=True,\n", @@ -1069,6 +1009,39 @@ "for r in results:\n", " print(r)" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "## EVALUATE MOVE TO COLLABORATIVE FILTERING SO WE CAN SHOW BETTER NUMBERS\n", + "#let's see how well this works. we can choose some users, and based on their first watched movie we can recommend them some more.\n", + "#we can then look at the set intersection between our recommendations and the movies they actually watched (and rated highly) to see how well we did." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO use bloom filter and/or cuckoo filter with the recommendations and user's watched_list in their index to filter out movies they already watched" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# clean up your index\n", + "\n", + "while remaining := movie_index.clear():\n", + " print(f\"Deleted {remaining} keys\")" + ] } ], "metadata": { diff --git a/python-recipes/recommendation-systems/user_schema.yaml b/python-recipes/recommendation-systems/user_schema.yaml index 95511038..e89bd6a0 100644 --- a/python-recipes/recommendation-systems/user_schema.yaml +++ b/python-recipes/recommendation-systems/user_schema.yaml @@ -6,8 +6,10 @@ index: fields: - name: usr_id type: tag - name: ratings + - name: ratings type: numeric + name: watched_list + type: text - name: user_vector type: vector From 9778b0abd6e2693f76452d7ebcc166a1e8d5fe22 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Thu, 10 Oct 2024 20:58:44 -0700 Subject: [PATCH 03/12] fixes dataframe merging on movie ids --- .../collaborative_filtering.ipynb | 398 +++++++++--------- .../recommendation-systems/user_schema.yaml | 2 +- 2 files changed, 211 insertions(+), 189 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index e4fdfdd3..9977d2ef 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -4,7 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Collaborative Filtering in RedisVL" + "![Redis](https://redis.io/wp-content/uploads/2024/04/Logotype.svg?auto=webp&quality=85,75&width=120)\n", + "\n", + "# Collaborative Filtering in RedisVL\n", + "\n", + "\"Open" ] }, { @@ -23,16 +27,17 @@ }, { 
"cell_type": "code", - "execution_count": 1, + "execution_count": 274, "metadata": {}, "outputs": [], "source": [ + "# NBVAL_SKIP\n", "!pip install scikit-surprise --quiet" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 275, "metadata": {}, "outputs": [], "source": [ @@ -64,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 276, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 277, "metadata": {}, "outputs": [], "source": [ @@ -113,7 +118,7 @@ "source": [ "A lot is going to happen in the code cell below. We split our full data into train and test sets. We defined the collaborative filtering algorithm to use, which in this case is the Singular Value Decomposition (SVD) algorithm. lastly, we fit our model to our data.\n", "\n", - "It's worth going into more detail why we chose this algorithm and what it is computing in the `.fit(train_set)` method we're calling.\n", + "It's worth going into more detail why we chose this algorithm and what it is computing in the `svd.fit(train_set)` method we're calling.\n", "First, let's think about what data it's receiving - our ratings data. This only contains the user_ids, movie_ids, and the user's ratings of their watched movies on a scale of 1 to 5.\n", "\n", "We can put this data into a matrix with rows being users and columns being movies\n", @@ -134,15 +139,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Unlike Content Filtering, here we're only considering the ratings that users assign. We don't know the plot or genre or release year of any of these films.\n", - "But we can still build recommender by assuming that users have similar tastes to each other. As an intuitive example, we can see that `user_1` and `user_4` have very similar ratings on several movies, so we will assume that `user_4` will rate `movie_6` highly, just as `user_1` did. 
This is the idea behind collaborative filtering." + "Unlike Content Filtering, here we're only considering the ratings that users assign. We don't know the plot or genre or release year of any of these films. We don't even know the title.\n", + "But we can still build a recommender by assuming that users have similar tastes to each other. As an intuitive example, we can see that `user_1` and `user_4` have very similar ratings on several movies, so we will assume that `user_4` will rate `movie_6` highly, just as `user_1` did. This is the idea behind collaborative filtering." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's the intuition, but what about the math? Since we only have this matrix to work with what we want to do is decompose it into two constituent matrices.\n", + "That's the intuition, but what about the math? Since we only have this matrix to work with, what we want to do is decompose it into two constituent matrices.\n", "Lets call our ratings matrix `[R]`. We want to find two other matrices, a user matrix `[U]`, and a movies matrix `[M]` that fit the equation:\n", "\n", "`[U] * [M] = [R]`\n", @@ -166,21 +171,21 @@ "| movie_1_feature_k | movie_2_feature_k | movie_3_feature_k | ... | movie_M_feature_k |\n", "\n", "\n", - "these features are called the latent features (or latent factors) and are the values we're trying to find when we call the `svd_algo.fit(training_data)` method. The algorithm that computes these features from our ratings matrix is the SVD algorithm. The number of users and movies is set by our data. The size of the latent feature vectors `k` is a parameter we choose. We'll keep it at the default 100 for this notebook." + "these features are called the latent features (or latent factors) and are the values we're trying to find when we call the `svd.fit(training_data)` method. The algorithm that computes these features from our ratings matrix is the SVD algorithm. The number of users and movies is set by our data. 
The size of the latent feature vectors `k` is a parameter we choose. We'll keep it at the default 100 for this notebook." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 278, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 7, + "execution_count": 278, "metadata": {}, "output_type": "execute_result" } @@ -190,10 +195,10 @@ "train_set, test_set = train_test_split(ratings_data, test_size=0.2)\n", "\n", "# use SVD (Singular Value Decomposition) for collaborative filtering\n", - "svd_algo = SVD(n_factors=100, biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", + "svd = SVD(n_factors=100, biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", "\n", "# train the algorithm on the train_set\n", - "svd_algo.fit(train_set)" + "svd.fit(train_set)" ] }, { @@ -211,15 +216,15 @@ "\n", "The Surprise SVD model stores user and movie vectors in two attributes:\n", "\n", - "`algo.pu`: user features matrix (a matrix where each row corresponds to the latent features of a user).\n", - "`algo.qi`: item features matrix (a matrix where each row corresponds to the latent features of an item/movie).\n", + "`svd.pu`: user features matrix (a matrix where each row corresponds to the latent features of a user).\n", + "`svd.qi`: item features matrix (a matrix where each row corresponds to the latent features of an item/movie).\n", "\n", - "It's worth noting that the matrix `algo.qi` is the transpose of the matrix `[M]` we defined above. This way each row corresponds to one movie" + "It's worth noting that the matrix `svd.qi` is the transpose of the matrix `[M]` we defined above. This way each row corresponds to one movie." 
] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 279, "metadata": {}, "outputs": [ { @@ -227,13 +232,13 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8405 movies with feature vectors of size 100\n" + "we have 8435 movies with feature vectors of size 100\n" ] } ], "source": [ - "user_vectors = svd_algo.pu # user latent features (matrix)\n", - "movie_vectors = svd_algo.qi # movie latent features (matrix)\n", + "user_vectors = svd.pu # user latent features (matrix)\n", + "movie_vectors = svd.qi # movie latent features (matrix)\n", "\n", "print(f'we have {user_vectors.shape[0]} users with feature vectors of size {user_vectors.shape[1]}')\n", "print(f'we have {movie_vectors.shape[0]} movies with feature vectors of size {movie_vectors.shape[1]}')" @@ -253,64 +258,55 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 280, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.4150893670982523\n" + "the predicted rating of user 347 on movie 5515 is 1.83012299501901\n" ] } ], "source": [ - "# predict one user's rating of one film\n", - "# surprise casts userId and movieId to inner ids\n", - "\n", + "# surprise casts userId and movieId to inner ids, so we have to use their mapping to now which rows to use\n", "inner_uid = train_set.to_inner_uid(347) # user_id\n", "inner_iid = train_set.to_inner_iid(5515) # movie_id\n", - "print(f'the predicted rating of user {347} on movie {5515} is {np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])}')\n" + "\n", + "# predict one user's rating of one film\n", + "predicted_rating = np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])\n", + "print(f'the predicted rating of user {347} on movie {5515} is {predicted_rating}')" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 281, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ - "user: 347 item: 5515 r_ui = None est = 1.42 {'was_impossible': False}\n", - "1.4150893670982523\n" + "user: 347 item: 5515 r_ui = None est = 1.83 {'was_impossible': False}\n", + "1.83012299501901\n" ] - }, - { - "data": { - "text/plain": [ - "Prediction(uid=347, iid=5515, r_ui=None, est=1.4150893670982523, details={'was_impossible': False})" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "# sanity check my math matches Surprise package math\n", - "print(svd_algo.predict(347, 5515))\n", + "print(svd.predict(347, 5515))\n", "\n", "inner_uid = train_set.to_inner_uid(347)\n", "inner_iid = train_set.to_inner_iid(5515)\n", - "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surprise casts userId and movieId to inner ids\n", - "svd_algo.predict(347, 5515)" + "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surprise casts userId and movieId to inner ids" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## Adding Movie Data\n", "while our collaborative filtering algorithm was trained solely on user's ratings of movies, and doesn't require any data about the movies themselves - like the title, genres, or release year - we'll want that information stored as metadata.\n", "\n", "We can grab this data from our `movies_metadata.csv` file, clean it, and join it to our user ratings via the `movieId` column" @@ -318,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 282, "metadata": {}, "outputs": [ { @@ -558,7 +554,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 10, + "execution_count": 282, "metadata": {}, "output_type": "execute_result" } @@ -570,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 283, "metadata": {}, "outputs": [ { @@ -594,7 +590,7 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 283, "metadata": {}, "output_type": 
"execute_result" } @@ -629,16 +625,113 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We'll eventually have to map these movies to their ratings, which we'll do so with the `links.csv` file that matches `movieId`, `imdbId`, and `tmdbId`.\n", - "\n", + "We'll have to map these movies to their ratings, which we'll do so with the `links.csv` file that matches `movieId`, `imdbId`, and `tmdbId`.\n", "Let's do that now." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 284, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "links_df = fetch_dataframe('links_small.csv')\n", + "movies_df = movies_df.merge(links_df, left_on='imdb_id', right_on='imdbId', how='inner')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll want to move our SVD user vectors and movie vectors and their corresponding userId and movieId into 2 dataframes for later processing." + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "metadata": {}, + "outputs": [], + "source": [ + "# build a dataframe out of the user vectors and their userIds\n", + "user_vectors_and_ids = {train_set.to_raw_uid(inner_id): user_vectors[inner_id].tolist() for inner_id in train_set.all_users()}\n", + "user_vector_df = pd.Series(user_vectors_and_ids).to_frame('user_vector')\n", + "\n", + "# now do the same for the movie vectors and their movieIds\n", + "movie_vectors_and_ids = {train_set.to_raw_iid(inner_id): movie_vectors[inner_id].tolist() for inner_id in train_set.all_items()}\n", + "movie_vector_df = pd.Series(movie_vectors_and_ids).to_frame('movie_vector')\n", + "\n", + "# merge the movie vector series with the movies dataframe using the movieId and id fields\n", + "movies_df = movies_df.merge(movie_vector_df, left_on='movieId', right_index=True, how='inner')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RedisVL Handles the Scale\n", + "\n", + "Especially for large datasets like the 45,000 movie 
catalog we're dealing with, you'll want Redis to do the heavy lifting of vector search.\n", + "All that's needed is to define the search index and load our data we've cleaned and merged with our vectors.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 286, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20:56:23 redisvl.index.index INFO Index already exists, overwriting.\n", + "20:56:23 redisvl.index.index INFO Index already exists, overwriting.\n" + ] + } + ], + "source": [ + "from redis import Redis\n", + "from redisvl.schema import IndexSchema\n", + "from redisvl.index import SearchIndex\n", + "\n", + "client = Redis.from_url(REDIS_URL)\n", + "\n", + "movie_schema = IndexSchema.from_yaml(\"collaborative_filtering_schema.yaml\")\n", + "\n", + "movie_index = SearchIndex(movie_schema, redis_client=client)\n", + "movie_index.create(overwrite=True, drop=True)\n", + "\n", + "user_schema = IndexSchema.from_yaml(\"user_schema.yaml\")\n", + "\n", + "user_index = SearchIndex(user_schema, redis_client=client)\n", + "user_index.create(overwrite=True, drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 287, + "metadata": {}, + "outputs": [], + "source": [ + "keys = movie_index.load(movies_df.to_dict(orient='records'))" + ] + }, + { + "cell_type": "code", + "execution_count": 288, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "number of movies 8387\n", + "size of movie df 8387\n", + "unique movie ids 8381\n", + "unique movie titles 8150\n", + "unique movies rated 9065\n" + ] + }, { "data": { "text/html": [ @@ -678,6 +771,7 @@ " movieId\n", " imdbId\n", " tmdbId\n", + " movie_vector\n", " \n", " \n", " \n", @@ -701,6 +795,7 @@ " 1\n", " 114709\n", " 862.0\n", + " [-0.023792249725276562, 0.1785839516922377, -0...\n", " \n", " \n", " 1\n", @@ -722,6 +817,7 @@ " 2\n", " 113497\n", " 8844.0\n", + " [0.2793838607565979, -0.21744939596620874, 0.1...\n", " 
\n", " \n", " 2\n", @@ -743,6 +839,7 @@ " 3\n", " 113228\n", " 15602.0\n", + " [-0.020947681442077554, 0.20694515937091487, 0...\n", " \n", " \n", " 3\n", @@ -764,6 +861,7 @@ " 4\n", " 114885\n", " 31357.0\n", + " [0.04080238290985722, 0.07032878736373183, -0....\n", " \n", " \n", " 4\n", @@ -785,6 +883,7 @@ " 5\n", " 113041\n", " 11862.0\n", + " [-0.004196795084205664, -0.04584846941882623, ...\n", " \n", " \n", "\n", @@ -826,124 +925,36 @@ "3 Waiting to Exhale 6.1 34 4 114885 \n", "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", - " tmdbId \n", - "0 862.0 \n", - "1 8844.0 \n", - "2 15602.0 \n", - "3 31357.0 \n", - "4 11862.0 " + " tmdbId movie_vector \n", + "0 862.0 [-0.023792249725276562, 0.1785839516922377, -0... \n", + "1 8844.0 [0.2793838607565979, -0.21744939596620874, 0.1... \n", + "2 15602.0 [-0.020947681442077554, 0.20694515937091487, 0... \n", + "3 31357.0 [0.04080238290985722, 0.07032878736373183, -0.... \n", + "4 11862.0 [-0.004196795084205664, -0.04584846941882623, ... " ] }, - "execution_count": 12, + "execution_count": 288, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n", - "links_df = fetch_dataframe('links_small.csv')\n", - "movies_df = movies_df.merge(links_df, left_on='imdb_id', right_on='imdbId', how='inner')\n", - "movies_df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll want to move our SVD user vectors and movie vectors and their corresponding userId and movieId into 2 dataframes for later processing." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# build a dataframe out of the user vectors and their userIds\n", - "user_vectors_and_ids = {train_set.to_raw_uid(inner_id): user_vectors[inner_id].tolist() for inner_id in train_set.all_users()}\n", - "user_vector_df = pd.Series(user_vectors_and_ids).to_frame('user_vector')\n", - "\n", - "# now do the same for the movie vectors and their movieIds\n", - "movie_vectors_and_ids = {train_set.to_raw_iid(inner_id): movie_vectors[inner_id].tolist() for inner_id in train_set.all_items()}\n", - "movie_vector_df = pd.Series(movie_vectors_and_ids).to_frame('movie_vector')\n", - "\n", - "# merge the movie vector series with the movies dataframe using the movieId and id fields\n", - "movies_df = movies_df.merge(movie_vector_df, left_on='id', right_index=True, how='inner')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## RedisVL Handles the Scale\n", - "\n", - "Especially for large datasets like the 45,000 movie catalog we're dealing with, you'll want Redis to do the heavy lifting of vector search.\n", - "All that's needed is to define the search index and load our data we've cleaned and merged with our vectors.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "from redis import Redis\n", - "from redisvl.schema import IndexSchema\n", - "from redisvl.index import SearchIndex\n", - "\n", - "client = Redis.from_url(REDIS_URL)\n", - "\n", - "movie_schema = IndexSchema.from_yaml(\"collaborative_filtering_schema.yaml\")\n", - "\n", - "movie_index = SearchIndex(movie_schema, redis_client=client)\n", - "movie_index.create(overwrite=True, drop=True)\n", - "\n", - "user_schema = IndexSchema.from_yaml(\"user_schema.yaml\")\n", - "\n", - "user_index = SearchIndex(user_schema, redis_client=client)\n", - "user_index.create(overwrite=True, drop=True)" - ] - }, - { - "cell_type": "code", 
- "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "keys = movie_index.load(movies_df.to_dict(orient='records'))" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1494\n", - "28386\n", - "1494\n", - "1482\n", - "9065\n" - ] - } - ], - "source": [ - "# sanity check I merged all my dataframes properly and have the right sizes of moives, users, vectors, ids, etc.\n", + "# sanity check we merged all my dataframes properly and have the right sizes of moives, users, vectors, ids, etc.\n", "number_of_movies = len(movies_df.to_dict(orient='records'))\n", - "size_of_movie_df = movies_df.size\n", + "size_of_movie_df = movies_df.shape[0]\n", + "\n", + "print('number of movies', number_of_movies)\n", + "print('size of movie df', size_of_movie_df)\n", "\n", - "print(number_of_movies)\n", - "print(size_of_movie_df)\n", "unique_movie_ids = movies_df['id'].nunique()\n", - "print(unique_movie_ids)\n", + "print('unique movie ids', unique_movie_ids)\n", + "\n", "unique_movie_titles = movies_df['title'].nunique()\n", - "print(unique_movie_titles)\n", + "print('unique movie titles', unique_movie_titles)\n", "\n", "unique_movies_rated = ratings_df['movieId'].nunique()\n", - "print(unique_movies_rated)" + "print('unique movies rated', unique_movies_rated)\n", + "movies_df.head()" ] }, { @@ -954,38 +965,38 @@ "\n", "This is why in our `collaborative_filtering_schema.yaml` we use `ip` (inner product) as our distance metric.\n", "\n", - "It's also why we'll use our user vector as the query vector when we do a vector query." + "It's also why we'll use our user vector as the query vector when we do a query." 
] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 289, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:9df0babc731549909e929885973aee58', 'vector_distance': '-3.6087179184', 'title': 'The Million Dollar Hotel', 'genres': '[\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:ad9142a30fd045cfbb1e5ae0b051f4c8', 'vector_distance': '-3.56296348572', 'title': 'Terminator 3: Rise of the Machines', 'genres': '[\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", - "{'id': 'movie:bf3354ae719b44c782876272c5f95ce1', 'vector_distance': '-3.52630567551', 'title': 'Beverly Hills Cop III', 'genres': '[\"Action\",\"Comedy\",\"Crime\"]'}\n", - "{'id': 'movie:88ce7f7738104c539b004135f306e9ec', 'vector_distance': '-3.46648168564', 'title': 'Backdraft', 'genres': '[\"Action\",\"Crime\",\"Drama\",\"Mystery\",\"Thriller\"]'}\n", - "{'id': 'movie:9fe2e20d887a4263a540f3945f10751d', 'vector_distance': '-3.40900659561', 'title': 'Boogie Nights', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:9cd57bacb5de437a88279a17c2161ce2', 'vector_distance': '-3.38699388504', 'title': 'The Good Thief', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:5d7079fac9534a0585608c9b0d01ba80', 'vector_distance': '-3.3247923851', 'title': \"Pandora's Box\", 'genres': '[\"Drama\",\"Thriller\",\"Romance\"]'}\n", - "{'id': 'movie:fa9cf76285c348078bb9814fa6b9357f', 'vector_distance': '-3.31738758087', 'title': 'Dawn of the Dead', 'genres': '[\"Horror\"]'}\n", - "{'id': 'movie:0c27eb9238744640a318c57104df4ddb', 'vector_distance': '-3.309486866', 'title': 'Once Were Warriors', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:f490c86a71164bf3956d92be5de8ba05', 'vector_distance': '-3.30277919769', 'title': 'Light of Day', 'genres': '[\"Music\",\"Drama\"]'}\n", - "{'id': 'movie:e8ef474819814eaea6cb757449d3eded', 'vector_distance': '-3.28701210022', 'title': 'Beetlejuice', 'genres': '[\"Fantasy\",\"Comedy\"]'}\n", - "{'id': 
'movie:0c596d8911e0498a854b4e6d5faae545', 'vector_distance': '-3.27935218811', 'title': 'Enough', 'genres': '[\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:d0d08cb2caa44b42a3f21af5687fb7dc', 'vector_distance': '-3.26998329163', 'title': 'Ghost Rider', 'genres': '[\"Thriller\",\"Action\",\"Fantasy\",\"Horror\"]'}\n", - "{'id': 'movie:b5bcd6dba0474b709ccb3b10c3e2fb14', 'vector_distance': '-3.26271867752', 'title': 'Cousin, Cousine', 'genres': '[\"Romance\",\"Comedy\"]'}\n", - "{'id': 'movie:5726b3476a2d450db8792217298b7b57', 'vector_distance': '-3.25473356247', 'title': \"We're No Angels\", 'genres': '[\"Comedy\",\"Crime\",\"Drama\"]'}\n", - "{'id': 'movie:f1459b204f054f8daa7bb03e349d4bc1', 'vector_distance': '-3.25318956375', 'title': 'Gremlins 2: The New Batch', 'genres': '[\"Comedy\",\"Horror\",\"Fantasy\"]'}\n", - "{'id': 'movie:1cb9f1fb5d3f45a3861607aca03dfd4d', 'vector_distance': '-3.20173215866', 'title': 'Sleepless in Seattle', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:a567595136ef488b90637b77afb10664', 'vector_distance': '-3.19421386719', 'title': 'Point Break', 'genres': '[\"Action\",\"Thriller\",\"Crime\"]'}\n", - "{'id': 'movie:776e7891798048629d2dfa532ace8ff5', 'vector_distance': '-3.1903834343', 'title': \"My Best Friend's Wedding\", 'genres': '[\"Comedy\",\"Romance\"]'}\n", - "{'id': 'movie:6bcc04c814d24da6926161c9e0c10a76', 'vector_distance': '-3.17692661285', 'title': 'Cool Hand Luke', 'genres': '[\"Crime\",\"Drama\"]'}\n" + "{'id': 'movie:ea1eb7855f474e1190fc997697717bce', 'vector_distance': '-3.16834640503', 'title': 'Terminator 2: Judgment Day', 'genres': '[\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", + "{'id': 'movie:882a7c1429d24473abb22ad8a7294a74', 'vector_distance': '-3.07514286041', 'title': 'Pearl Harbor', 'genres': '[\"History\",\"Romance\",\"War\"]'}\n", + "{'id': 'movie:df31a1ffc0a3432fb1d6952c74a14bdc', 'vector_distance': '-3.04155731201', 'title': 'Girl, Interrupted', 'genres': '[\"Drama\"]'}\n", + 
"{'id': 'movie:ea1778357c934222a84099ab488503e7', 'vector_distance': '-2.99355602264', 'title': 'Cruel Intentions', 'genres': '[\"Drama\",\"Romance\",\"Thriller\"]'}\n", + "{'id': 'movie:df965ebd8dfa482dbcf184e9a4234f0a', 'vector_distance': '-2.97833967209', 'title': 'Remember the Titans', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:5bd9e34ff7394e82b2508087543008be', 'vector_distance': '-2.91204404831', 'title': 'The Quiet American', 'genres': '[\"Drama\",\"Action\",\"Thriller\",\"Romance\"]'}\n", + "{'id': 'movie:ca5dcbe4ce844f74989294c4fc3ed61f', 'vector_distance': '-2.90972471237', 'title': 'The Departed', 'genres': '[\"Drama\",\"Thriller\",\"Crime\"]'}\n", + "{'id': 'movie:5efbb2346520455aaef3ae51fb9de029', 'vector_distance': '-2.9093708992', 'title': 'Run Lola Run', 'genres': '[\"Action\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:3b51e695ae084eb0ace87c2f48235fe8', 'vector_distance': '-2.89018774033', 'title': 'Gladiator', 'genres': '[\"Action\",\"Drama\",\"Adventure\"]'}\n", + "{'id': 'movie:207756fd3d2f4e5599b8d149bf5c6b59', 'vector_distance': '-2.88189530373', 'title': 'Big Fish', 'genres': '[\"Adventure\",\"Fantasy\",\"Drama\"]'}\n", + "{'id': 'movie:1cb9cfa6294f48e8942becbe29af4765', 'vector_distance': '-2.87604212761', 'title': 'The Wrong Trousers', 'genres': '[\"Animation\",\"Comedy\",\"Family\"]'}\n", + "{'id': 'movie:ef35f7f4b72646b49e377e06f8dc7dbf', 'vector_distance': '-2.86843323708', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", + "{'id': 'movie:c1f1b760faed43f09100a725b7420510', 'vector_distance': '-2.8342666626', 'title': 'Crumb', 'genres': '[\"Documentary\"]'}\n", + "{'id': 'movie:cdf6b6911dcd4378a69fa9ce2979b7a6', 'vector_distance': '-2.83272624016', 'title': 'Heathers', 'genres': '[\"Thriller\",\"Comedy\",\"Drama\"]'}\n", + "{'id': 'movie:5569250c1da8423ba4644c899a0523b5', 'vector_distance': '-2.83206033707', 'title': 'Murder in the First', 'genres': '[\"Crime\",\"Drama\"]'}\n", + "{'id': 
'movie:a297851d857242f4b7cd8679c8d2c2f9', 'vector_distance': '-2.82705926895', 'title': 'Angels and Insects', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:ff792c59ab7045038f84e1ceefba1652', 'vector_distance': '-2.81982970238', 'title': 'Beauty and the Beast', 'genres': '[\"Romance\",\"Family\",\"Animation\",\"Fantasy\",\"Music\"]'}\n", + "{'id': 'movie:e6a67cfe875e452c8f0239f5d484fe08', 'vector_distance': '-2.81556129456', 'title': 'Gladiator 1992', 'genres': '[\"Action\",\"Drama\"]'}\n", + "{'id': 'movie:9f7d1fbd89a8477db90147d382d7f53c', 'vector_distance': '-2.80134963989', 'title': 'Band of Brothers', 'genres': '[\"Action\",\"Drama\",\"War\"]'}\n", + "{'id': 'movie:bb9113c4eb0a4e798eea4ccbb3cdf448', 'vector_distance': '-2.79896092415', 'title': 'Life Is Beautiful', 'genres': '[\"Comedy\",\"Drama\"]'}\n" ] } ], @@ -998,11 +1009,11 @@ "# the distance metric 'ip' inner product is computing \"score = 1 - u * v\" and returning the minimum, which corresponds to the max of \"u * v\"\n", "# this is what we want. 
The predicted rating on a scale of 0 to 5 is then -(score - 1) == -score + 1\n", "query = RangeQuery(vector=user_vector,\n", - " vector_field_name='movie_vector',\n", - " num_results=20,\n", - " return_score=True,\n", - " return_fields=['title', 'genres']\n", - " )\n", + " vector_field_name='movie_vector',\n", + " num_results=20,\n", + " return_score=True,\n", + " return_fields=['title', 'genres']\n", + " )\n", "\n", "results = movie_index.query(query)\n", "\n", @@ -1012,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 290, "metadata": {}, "outputs": [], "source": [ @@ -1024,7 +1035,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 291, "metadata": {}, "outputs": [], "source": [ @@ -1033,12 +1044,23 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 292, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Deleted 4387 keys\n", + "Deleted 2000 keys\n", + "Deleted 1000 keys\n", + "Deleted 500 keys\n", + "Deleted 500 keys\n" + ] + } + ], "source": [ "# clean up your index\n", - "\n", "while remaining := movie_index.clear():\n", " print(f\"Deleted {remaining} keys\")" ] diff --git a/python-recipes/recommendation-systems/user_schema.yaml b/python-recipes/recommendation-systems/user_schema.yaml index e89bd6a0..6d5c9ebd 100644 --- a/python-recipes/recommendation-systems/user_schema.yaml +++ b/python-recipes/recommendation-systems/user_schema.yaml @@ -4,7 +4,7 @@ index: storage_type: json fields: - - name: usr_id + - name: user_id type: tag - name: ratings type: numeric From ff02e4933715be559c4f7341c16cfcce360a6fff Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Sat, 12 Oct 2024 19:06:16 -0700 Subject: [PATCH 04/12] wip: adding bloom filtering to collaborative notebook --- .../collaborative_filtering.ipynb | 506 +++++++++++++++--- 1 file changed, 430 insertions(+), 76 deletions(-) diff --git 
a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 9977d2ef..01e97b33 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 274, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 275, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 276, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 277, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +119,7 @@ "A lot is going to happen in the code cell below. We split our full data into train and test sets. We defined the collaborative filtering algorithm to use, which in this case is the Singular Value Decomposition (SVD) algorithm. lastly, we fit our model to our data.\n", "\n", "It's worth going into more detail why we chose this algorithm and what it is computing in the `svd.fit(train_set)` method we're calling.\n", - "First, let's think about what data it's receiving - our ratings data. This only contains the user_ids, movie_ids, and the user's ratings of their watched movies on a scale of 1 to 5.\n", + "First, let's think about what data it's receiving - our ratings data. 
This only contains the userIds, movieIds, and the user's ratings of their watched movies on a scale of 1 to 5.\n", "\n", "We can put this data into a matrix with rows being users and columns being movies\n", "\n", @@ -176,16 +176,16 @@ }, { "cell_type": "code", - "execution_count": 278, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 278, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 279, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -232,7 +232,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8435 movies with feature vectors of size 100\n" + "we have 8398 movies with feature vectors of size 100\n" ] } ], @@ -258,21 +258,21 @@ }, { "cell_type": "code", - "execution_count": 280, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.83012299501901\n" + "the predicted rating of user 347 on movie 5515 is 1.2662407571780765\n" ] } ], "source": [ "# surprise casts userId and movieId to inner ids, so we have to use their mapping to now which rows to use\n", - "inner_uid = train_set.to_inner_uid(347) # user_id\n", - "inner_iid = train_set.to_inner_iid(5515) # movie_id\n", + "inner_uid = train_set.to_inner_uid(347) # userId\n", + "inner_iid = train_set.to_inner_iid(5515) # movieId\n", "\n", "# predict one user's rating of one film\n", "predicted_rating = np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])\n", @@ -281,15 +281,15 @@ }, { "cell_type": "code", - "execution_count": 281, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "user: 347 item: 5515 r_ui = None est = 1.83 {'was_impossible': False}\n", - "1.83012299501901\n" + "user: 347 item: 5515 r_ui = None est = 
1.27 {'was_impossible': False}\n", + "1.2662407571780765\n" ] } ], @@ -314,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 282, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -554,7 +554,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 282, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -566,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 283, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -590,7 +590,7 @@ "dtype: int64" ] }, - "execution_count": 283, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -631,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 284, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -649,7 +649,7 @@ }, { "cell_type": "code", - "execution_count": 285, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -677,15 +677,15 @@ }, { "cell_type": "code", - "execution_count": 286, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "20:56:23 redisvl.index.index INFO Index already exists, overwriting.\n", - "20:56:23 redisvl.index.index INFO Index already exists, overwriting.\n" + "18:57:45 redisvl.index.index INFO Index already exists, overwriting.\n", + "18:57:45 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -709,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": 287, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -718,17 +718,17 @@ }, { "cell_type": "code", - "execution_count": 288, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8387\n", - "size of movie df 8387\n", - "unique movie ids 8381\n", - "unique movie titles 8150\n", + "number of movies 8351\n", + "size of movie df 8351\n", + "unique movie ids 8347\n", + "unique movie titles 8104\n", "unique movies rated 9065\n" ] }, @@ -795,7 +795,7 
@@ " 1\n", " 114709\n", " 862.0\n", - " [-0.023792249725276562, 0.1785839516922377, -0...\n", + " [0.003070617914363312, -0.2183623175004815, -0...\n", " \n", " \n", " 1\n", @@ -817,7 +817,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [0.2793838607565979, -0.21744939596620874, 0.1...\n", + " [0.013404150790652358, -0.1920666231028718, -0...\n", " \n", " \n", " 2\n", @@ -839,7 +839,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [-0.020947681442077554, 0.20694515937091487, 0...\n", + " [0.17041991275371088, -0.14362645391937717, -0...\n", " \n", " \n", " 3\n", @@ -861,7 +861,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [0.04080238290985722, 0.07032878736373183, -0....\n", + " [0.029246177676017816, -0.19591132539475606, -...\n", " \n", " \n", " 4\n", @@ -883,7 +883,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [-0.004196795084205664, -0.04584846941882623, ...\n", + " [-0.03755917677168938, -0.17405036529466641, 0...\n", " \n", " \n", "\n", @@ -926,14 +926,14 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [-0.023792249725276562, 0.1785839516922377, -0... \n", - "1 8844.0 [0.2793838607565979, -0.21744939596620874, 0.1... \n", - "2 15602.0 [-0.020947681442077554, 0.20694515937091487, 0... \n", - "3 31357.0 [0.04080238290985722, 0.07032878736373183, -0.... \n", - "4 11862.0 [-0.004196795084205664, -0.04584846941882623, ... " + "0 862.0 [0.003070617914363312, -0.2183623175004815, -0... \n", + "1 8844.0 [0.013404150790652358, -0.1920666231028718, -0... \n", + "2 15602.0 [0.17041991275371088, -0.14362645391937717, -0... \n", + "3 31357.0 [0.029246177676017816, -0.19591132539475606, -... \n", + "4 11862.0 [-0.03755917677168938, -0.17405036529466641, 0... 
" ] }, - "execution_count": 288, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -965,44 +965,43 @@ "\n", "This is why in our `collaborative_filtering_schema.yaml` we use `ip` (inner product) as our distance metric.\n", "\n", - "It's also why we'll use our user vector as the query vector when we do a query." + "It's also why we'll use our user vector as the query vector when we do a query. Let's pick a random user and their corresponding user vector to see what this looks like." ] }, { "cell_type": "code", - "execution_count": 289, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:ea1eb7855f474e1190fc997697717bce', 'vector_distance': '-3.16834640503', 'title': 'Terminator 2: Judgment Day', 'genres': '[\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", - "{'id': 'movie:882a7c1429d24473abb22ad8a7294a74', 'vector_distance': '-3.07514286041', 'title': 'Pearl Harbor', 'genres': '[\"History\",\"Romance\",\"War\"]'}\n", - "{'id': 'movie:df31a1ffc0a3432fb1d6952c74a14bdc', 'vector_distance': '-3.04155731201', 'title': 'Girl, Interrupted', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:ea1778357c934222a84099ab488503e7', 'vector_distance': '-2.99355602264', 'title': 'Cruel Intentions', 'genres': '[\"Drama\",\"Romance\",\"Thriller\"]'}\n", - "{'id': 'movie:df965ebd8dfa482dbcf184e9a4234f0a', 'vector_distance': '-2.97833967209', 'title': 'Remember the Titans', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:5bd9e34ff7394e82b2508087543008be', 'vector_distance': '-2.91204404831', 'title': 'The Quiet American', 'genres': '[\"Drama\",\"Action\",\"Thriller\",\"Romance\"]'}\n", - "{'id': 'movie:ca5dcbe4ce844f74989294c4fc3ed61f', 'vector_distance': '-2.90972471237', 'title': 'The Departed', 'genres': '[\"Drama\",\"Thriller\",\"Crime\"]'}\n", - "{'id': 'movie:5efbb2346520455aaef3ae51fb9de029', 'vector_distance': '-2.9093708992', 'title': 'Run Lola Run', 'genres': 
'[\"Action\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:3b51e695ae084eb0ace87c2f48235fe8', 'vector_distance': '-2.89018774033', 'title': 'Gladiator', 'genres': '[\"Action\",\"Drama\",\"Adventure\"]'}\n", - "{'id': 'movie:207756fd3d2f4e5599b8d149bf5c6b59', 'vector_distance': '-2.88189530373', 'title': 'Big Fish', 'genres': '[\"Adventure\",\"Fantasy\",\"Drama\"]'}\n", - "{'id': 'movie:1cb9cfa6294f48e8942becbe29af4765', 'vector_distance': '-2.87604212761', 'title': 'The Wrong Trousers', 'genres': '[\"Animation\",\"Comedy\",\"Family\"]'}\n", - "{'id': 'movie:ef35f7f4b72646b49e377e06f8dc7dbf', 'vector_distance': '-2.86843323708', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", - "{'id': 'movie:c1f1b760faed43f09100a725b7420510', 'vector_distance': '-2.8342666626', 'title': 'Crumb', 'genres': '[\"Documentary\"]'}\n", - "{'id': 'movie:cdf6b6911dcd4378a69fa9ce2979b7a6', 'vector_distance': '-2.83272624016', 'title': 'Heathers', 'genres': '[\"Thriller\",\"Comedy\",\"Drama\"]'}\n", - "{'id': 'movie:5569250c1da8423ba4644c899a0523b5', 'vector_distance': '-2.83206033707', 'title': 'Murder in the First', 'genres': '[\"Crime\",\"Drama\"]'}\n", - "{'id': 'movie:a297851d857242f4b7cd8679c8d2c2f9', 'vector_distance': '-2.82705926895', 'title': 'Angels and Insects', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:ff792c59ab7045038f84e1ceefba1652', 'vector_distance': '-2.81982970238', 'title': 'Beauty and the Beast', 'genres': '[\"Romance\",\"Family\",\"Animation\",\"Fantasy\",\"Music\"]'}\n", - "{'id': 'movie:e6a67cfe875e452c8f0239f5d484fe08', 'vector_distance': '-2.81556129456', 'title': 'Gladiator 1992', 'genres': '[\"Action\",\"Drama\"]'}\n", - "{'id': 'movie:9f7d1fbd89a8477db90147d382d7f53c', 'vector_distance': '-2.80134963989', 'title': 'Band of Brothers', 'genres': '[\"Action\",\"Drama\",\"War\"]'}\n", - "{'id': 'movie:bb9113c4eb0a4e798eea4ccbb3cdf448', 'vector_distance': '-2.79896092415', 'title': 'Life Is Beautiful', 'genres': 
'[\"Comedy\",\"Drama\"]'}\n" + "{'id': 'movie:9a77231d27154ea1a678907d8e2c31ee', 'vector_distance': '-3.15922021866', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:51a38fb8f13f4726a8019a8d66f2b05b', 'vector_distance': '-3.15213918686', 'title': 'Cool Hand Luke', 'genres': '[\"Crime\",\"Drama\"]'}\n", + "{'id': 'movie:2e487266815945b0b8b857a848292e3e', 'vector_distance': '-3.07703495026', 'title': 'The Shawshank Redemption', 'genres': '[\"Drama\",\"Crime\"]'}\n", + "{'id': 'movie:c048199f45d340e282e37ff1c54089ca', 'vector_distance': '-3.04389858246', 'title': 'Lock, Stock and Two Smoking Barrels', 'genres': '[\"Comedy\",\"Crime\"]'}\n", + "{'id': 'movie:0996165a33924a79beb757bcefc17ed3', 'vector_distance': '-3.03677082062', 'title': 'Return of the Jedi', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:00e1ad78f47745b6a0e41b101eb98ff0', 'vector_distance': '-3.01881790161', 'title': 'In the Line of Fire', 'genres': '[\"Action\",\"Drama\",\"Thriller\",\"Crime\",\"Mystery\"]'}\n", + "{'id': 'movie:8fd381ca8404447caf53b63a901c69eb', 'vector_distance': '-3.01792764664', 'title': \"Mr. 
Holland's Opus\", 'genres': '[\"Music\",\"Drama\",\"Family\"]'}\n", + "{'id': 'movie:139f8da6282541a3a5881c7c457dd5ea', 'vector_distance': '-3.00384759903', 'title': 'Lifeboat', 'genres': '[\"Drama\",\"War\"]'}\n", + "{'id': 'movie:600e4f66decf454587888263b3fd7c6c', 'vector_distance': '-3.00213766098', 'title': 'Fargo', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:657859724d944786afa80a74a8ba7590', 'vector_distance': '-3.00026941299', 'title': 'Crumb', 'genres': '[\"Documentary\"]'}\n", + "{'id': 'movie:a8a6ce2be2e547d7be27cfcd3c4b412e', 'vector_distance': '-2.98649430275', 'title': 'Much Ado About Nothing', 'genres': '[\"Drama\",\"Comedy\",\"Romance\"]'}\n", + "{'id': 'movie:b77ab676009448b8b31f05cfad877d78', 'vector_distance': '-2.96709799767', 'title': 'Dead Man Walking', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:203d662a71db49b7afc86e101ed3b61c', 'vector_distance': '-2.96438765526', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:4355e3c0948943db8b9c1774e3c616a2', 'vector_distance': '-2.94901204109', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:81dc7920656a4d389767fd13c2cb24b3', 'vector_distance': '-2.92887830734', 'title': 'Sunset Boulevard', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:42d61bf4d1fc42a498fdb2985cd402dd', 'vector_distance': '-2.92803192139', 'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:0673b1bb33a144b78397fff8939ec758', 'vector_distance': '-2.92664003372', 'title': 'Eat Drink Man Woman', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:8bbac427a25643d48a0582a411f2ce71', 'vector_distance': '-2.92609977722', 'title': \"Ocean's Eleven\", 'genres': '[\"Thriller\",\"Crime\"]'}\n", + "{'id': 'movie:ead41675cf414eca86c3e995f398b09e', 'vector_distance': '-2.92416906357', 'title': 'Three Colors: Red', 'genres': 
'[\"Drama\",\"Mystery\",\"Romance\"]'}\n", + "{'id': 'movie:9843a33a6dd44f46ad475808cebb06fe', 'vector_distance': '-2.91586065292', 'title': 'Ponyo', 'genres': '[\"Animation\",\"Family\"]'}\n" ] } ], "source": [ - "from redisvl.query import RangeQuery, FilterQuery\n", - "from redisvl.query.filter import Tag, Num, Text\n", + "from redisvl.query import RangeQuery\n", "\n", "user_vector = user_vectors[352].tolist()\n", "\n", @@ -1021,49 +1020,404 @@ " print(r)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding All the Bells & Whistles\n", + "Vector search handles the bulk of our collaborative filtering recommendation system and is a great approach to generating personalized recommendations that are unique to each user.\n", + "\n", + "To up our RecSys game even further we can leverage RedisVL filter logic to give more control to what users are shown. Why have only one feed of recommended movies when you can have several, each with its own theme and personalized to each user?" + ] + }, { "cell_type": "code", - "execution_count": 290, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "\n", - "## EVALUATE MOVE TO COLLABORATIVE FILTERING SO WE CAN SHOW BETTER NUMBERS\n", - "#let's see how well this works. we can choose some users, and based on their first watched movie we can recommend them some more.\n", - "#we can then look at the set intersection between our recommendations and the movies they actually watched (and rated highly) to see how well we did." 
+ "from redisvl.query.filter import Tag, Num, Text\n", + "\n", + "def get_recommendations(user_id, filters=None, num_results=10):\n", + " user_vector = user_vectors[user_id].tolist()\n", + " query = RangeQuery(vector=user_vector,\n", + " vector_field_name='movie_vector',\n", + " num_results=num_results,\n", + " filter_expression=filters,\n", + " return_fields=['title', 'overview', 'genres'])\n", + "\n", + " results = movie_index.query(query)\n", + "\n", + " return [(r['title'], r['overview'], r['genres'], r['vector_distance']) for r in results]\n", + "\n", + "Top_picks_for_you = get_recommendations(user_id=42) # general SVD results, no filter\n", + "\n", + "block_buster_filter = Num('revenue') > 30_000_000\n", + "block_buster_hits = get_recommendations(user_id=42, filters=block_buster_filter)\n", + "\n", + "classics_filter = Num('release_date') < datetime.datetime(1990, 1, 1).timestamp()\n", + "classics = get_recommendations(user_id=42, filters=classics_filter)\n", + "\n", + "popular_filter = (Num('popularity') > 50) & (Num('vote_average') > 7)\n", + "Whats_popular = get_recommendations(user_id=42, filters=popular_filter)\n", + "\n", + "indie_filter = (Num('revenue') < 1_000_000) & (Num('popularity') > 10)\n", + "indie_hits = get_recommendations(user_id=42, filters=indie_filter)\n", + "\n", + "fruity = Text('title') % 'apple|orange|peach|banana|grape|pineapple'\n", + "fruity_films = get_recommendations(user_id=42, filters=fruity)\n" ] }, { "cell_type": "code", - "execution_count": 291, + "execution_count": 25, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
top picksblock bustersclassicswhat's popularindie hitsfruity films
0The GraduateThe GraduateThe GraduatePulp FictionAll About EveThe Grapes of Wrath
1Das BootDas BootDas BootThe Shawshank RedemptionThe PostmanWhat's Eating Gilbert Grape
2AmadeusAmadeusAmadeusGone GirlBicycle ThievesA Clockwork Orange
3FargoFargoDr. Strangelove or: How I Learned to Stop Worr...Dawn of the Planet of the ApesMy Neighbor TotoroBananas
4Dr. Strangelove or: How I Learned to Stop Worr...Shakespeare in LoveCinema ParadisoFight ClubThe Wild BunchPineapple Express
5Cinema ParadisoThe Last EmperorTake the Money and RunBlade RunnerMJames and the Giant Peach
6Take the Money and RunThe Color PurpleThe Last EmperorWhiplashRebel Without a CauseThe Apple Dumpling Gang
7Shakespeare in LoveManhattanRaging BullBig Hero 6Withnail & IAdam's Apples
8The Last EmperorAnnie HallThe Color PurpleGuardians of the GalaxyMeet John DoeOrange County
9Raging BullThe PianoNorth by NorthwestCaptain America: Civil WarOnce Upon a Time in AmericaHerbie Goes Bananas
\n", + "
" + ], + "text/plain": [ + " top picks block busters \\\n", + "0 The Graduate The Graduate \n", + "1 Das Boot Das Boot \n", + "2 Amadeus Amadeus \n", + "3 Fargo Fargo \n", + "4 Dr. Strangelove or: How I Learned to Stop Worr... Shakespeare in Love \n", + "5 Cinema Paradiso The Last Emperor \n", + "6 Take the Money and Run The Color Purple \n", + "7 Shakespeare in Love Manhattan \n", + "8 The Last Emperor Annie Hall \n", + "9 Raging Bull The Piano \n", + "\n", + " classics \\\n", + "0 The Graduate \n", + "1 Das Boot \n", + "2 Amadeus \n", + "3 Dr. Strangelove or: How I Learned to Stop Worr... \n", + "4 Cinema Paradiso \n", + "5 Take the Money and Run \n", + "6 The Last Emperor \n", + "7 Raging Bull \n", + "8 The Color Purple \n", + "9 North by Northwest \n", + "\n", + " what's popular indie hits \\\n", + "0 Pulp Fiction All About Eve \n", + "1 The Shawshank Redemption The Postman \n", + "2 Gone Girl Bicycle Thieves \n", + "3 Dawn of the Planet of the Apes My Neighbor Totoro \n", + "4 Fight Club The Wild Bunch \n", + "5 Blade Runner M \n", + "6 Whiplash Rebel Without a Cause \n", + "7 Big Hero 6 Withnail & I \n", + "8 Guardians of the Galaxy Meet John Doe \n", + "9 Captain America: Civil War Once Upon a Time in America \n", + "\n", + " fruity films \n", + "0 The Grapes of Wrath \n", + "1 What's Eating Gilbert Grape \n", + "2 A Clockwork Orange \n", + "3 Bananas \n", + "4 Pineapple Express \n", + "5 James and the Giant Peach \n", + "6 The Apple Dumpling Gang \n", + "7 Adam's Apples \n", + "8 Orange County \n", + "9 Herbie Goes Bananas " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# put all these titles into a single pandas dataframe , where each column is one category\n", + "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\", \"fruity films\"])\n", + "all_recommendations[\"top picks\"] = [m[0] for m in Top_picks_for_you]\n", + 
"all_recommendations[\"block busters\"] = [m[0] for m in block_buster_hits]\n", + "all_recommendations[\"classics\"] = [m[0] for m in classics]\n", + "all_recommendations[\"what's popular\"] = [m[0] for m in Whats_popular]\n", + "all_recommendations[\"indie hits\"] = [m[0] for m in indie_hits]\n", + "all_recommendations[\"fruity films\"] = [m[0] for m in fruity_films]\n", + "\n", + "all_recommendations.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keeping Things Fresh\n", + "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. And it's more than likely that at least some of the recommendations we're expecting to be highly rated by a given user are ones they've already watched and rated highly.\n", + "\n", + "Luckily Redis offers an easy answer to keeping recommendations new and interesting, and that answer is Bloom Filters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'bool' object has no attribute 'add'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[34], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mfilter\u001b[39m \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mbf()\u001b[38;5;241m.\u001b[39mcreate(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124muser_watched_list:\u001b[39m\u001b[38;5;132;01m{user_id}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m0.01\u001b[39m, \u001b[38;5;241m1000\u001b[39m)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m movie_id \u001b[38;5;129;01min\u001b[39;00m watched_movies:\n\u001b[0;32m---> 40\u001b[0m \u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madd\u001b[49m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00muser_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmovie_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 42\u001b[0m Top_picks_for_you \u001b[38;5;241m=\u001b[39m get_unique_recommendations(user_id\u001b[38;5;241m=\u001b[39muser_id) \u001b[38;5;66;03m# general SVD results, no filter\u001b[39;00m\n\u001b[1;32m 43\u001b[0m block_buster_hits \u001b[38;5;241m=\u001b[39m get_unique_recommendations(user_id\u001b[38;5;241m=\u001b[39muser_id, filters\u001b[38;5;241m=\u001b[39mblock_buster_filter)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'bool' object has no attribute 'add'" + ] + } + ], + "source": [ + "\n", + "# create a bloom filter for a given user and add their watched list to it\n", + "def create_bloom_filter(user_id, watched_movies):\n", + " if not 
client.bf().exists(f\"user_watched_list\"):\n", + " filter = client.bf().create(f\"user_watched_list\", 0.01, 1000)\n", + " for movie_id in watched_movies:\n", + " client.bf().add(f\"user_watched_list\", f\"{user_id}:{movie_id}\")\n", + " return filter\n", + "\n", + "# rewrite the get_recommendations() function to use a bloom filter and apply it before we return results\n", + "def get_unique_recommendations(user_id, filters=None, num_results=10):\n", + " user_vector = user_vectors[user_id].tolist()\n", + " bloom_filter_name = f\"user:{user_id}:watched\"\n", + "\n", + " query = RangeQuery(vector=user_vector,\n", + " vector_field_name='movie_vector',\n", + " num_results=num_results * 2, # fetch more results to filter out watched movies\n", + " filter_expression=filters,\n", + " #return_fields=['title', 'overview', 'genres', 'movie_id'])\n", + " return_fields=['title', 'movieId'])\n", + "\n", + " results = movie_index.query(query)\n", + "\n", + " # filter out movies that the user has already watched\n", + " recommendations = []\n", + " for r in results:\n", + " print(r)\n", + " if not bloom_client.bfExists(bloom_filter_name, r['movieId']):\n", + " recommendations.append((r['title'], r['overview'], r['genres'], r['vector_distance']))\n", + " if len(recommendations) >= num_results:\n", + " break\n", + "\n", + " return recommendations\n", + "\n", + "# example usage\n", + "user_id = 42\n", + "watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", + "\n", + "filter = client.bf().create('user_watched_list:{user_id}', 0.01, 1000)\n", + "for movie_id in watched_movies:\n", + " filter.add(f'{user_id}:{movie_id}')\n", + "\n", + "Top_picks_for_you = get_unique_recommendations(user_id=user_id) # general SVD results, no filter\n", + "block_buster_hits = get_unique_recommendations(user_id=user_id, filters=block_buster_filter)\n", + "classics = get_unique_recommendations(user_id=user_id, filters=classics_filter)\n", + "Whats_popular = 
get_unique_recommendations(user_id=user_id, filters=popular_filter)\n", + "indie_hits = get_unique_recommendations(user_id=user_id, filters=indie_filter)\n", + "fruity_films = get_unique_recommendations(user_id=user_id, filters=fruity)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "ruby" + } + }, "outputs": [], "source": [ - "# TODO use bloom filter and/or cuckoo filter with the recommendations and user's watched_list in their index to filter out movies they already watched" + "# put all these titles into a single pandas dataframe , where each column is one category\n", + "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\", \"fruity films\"])\n", + "all_recommendations[\"top picks\"] = [m[0] for m in Top_picks_for_you]\n", + "all_recommendations[\"block busters\"] = [m[0] for m in block_buster_hits]\n", + "all_recommendations[\"classics\"] = [m[0] for m in classics]\n", + "all_recommendations[\"what's popular\"] = [m[0] for m in Whats_popular]\n", + "all_recommendations[\"indie hits\"] = [m[0] for m in indie_hits]\n", + "all_recommendations[\"fruity films\"] = [m[0] for m in fruity_films]\n", + "\n", + "all_recommendations.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "That's it! 
That's all it takes to build a highly scalable, personalized, customizable collaborative filtering recommendation system with Redis and RedisVL.\n" ] }, { "cell_type": "code", - "execution_count": 292, + "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4387 keys\n", + "Deleted 4351 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n", "Deleted 500 keys\n" ] + }, + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "# clean up your index\n", "while remaining := movie_index.clear():\n", - " print(f\"Deleted {remaining} keys\")" + " print(f\"Deleted {remaining} keys\")\n", + "\n", + "while remaining := user_index.clear():\n", + " print(f\"Deleeted {remaining} keys\")\n", + "\n", + "client.delete(\"user_watched_list\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From c17af34aa27fee38daa417695dbb165f473b9d61 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Mon, 14 Oct 2024 12:33:10 -0700 Subject: [PATCH 05/12] fully working collaborative filtering with bloomfilter notebook --- .../collaborative_filtering.ipynb | 553 +++++++++++------- 1 file changed, 348 insertions(+), 205 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 01e97b33..99044c02 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { 
"cell_type": "code", - "execution_count": 10, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -89,7 +89,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -176,16 +176,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -224,7 +224,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -232,7 +232,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8398 movies with feature vectors of size 100\n" + "we have 8393 movies with feature vectors of size 100\n" ] } ], @@ -258,14 +258,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.2662407571780765\n" + "the predicted rating of user 347 on movie 5515 is 0.965787539953316\n" ] } ], @@ -281,15 +281,15 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "user: 347 item: 5515 r_ui = None est = 1.27 {'was_impossible': False}\n", - "1.2662407571780765\n" + "user: 347 item: 5515 r_ui = None est = 0.97 {'was_impossible': False}\n", + "0.965787539953316\n" ] } ], @@ -314,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -554,7 +554,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 16, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -566,7 +566,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": {}, 
"outputs": [ { @@ -590,7 +590,7 @@ "dtype: int64" ] }, - "execution_count": 17, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -631,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -649,7 +649,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -677,15 +677,15 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "18:57:45 redisvl.index.index INFO Index already exists, overwriting.\n", - "18:57:45 redisvl.index.index INFO Index already exists, overwriting.\n" + "12:30:31 redisvl.index.index INFO Index already exists, overwriting.\n", + "12:30:31 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -709,7 +709,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -718,17 +718,17 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8351\n", - "size of movie df 8351\n", - "unique movie ids 8347\n", - "unique movie titles 8104\n", + "number of movies 8348\n", + "size of movie df 8348\n", + "unique movie ids 8342\n", + "unique movie titles 8109\n", "unique movies rated 9065\n" ] }, @@ -795,7 +795,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [0.003070617914363312, -0.2183623175004815, -0...\n", + " [0.16217072665688012, 0.245026260806211, -0.14...\n", " \n", " \n", " 1\n", @@ -817,7 +817,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [0.013404150790652358, -0.1920666231028718, -0...\n", + " [-0.0495065883180616, 0.017243236163025016, -0...\n", " \n", " \n", " 2\n", @@ -839,7 +839,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [0.17041991275371088, 
-0.14362645391937717, -0...\n", + " [0.07067590986084793, 0.20963299716890343, 0.2...\n", " \n", " \n", " 3\n", @@ -861,7 +861,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [0.029246177676017816, -0.19591132539475606, -...\n", + " [-0.023481240586441465, 0.1194581665494643, -0...\n", " \n", " \n", " 4\n", @@ -883,7 +883,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [-0.03755917677168938, -0.17405036529466641, 0...\n", + " [0.07510781660794685, 0.19069717883675757, -0....\n", " \n", " \n", "\n", @@ -926,14 +926,14 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [0.003070617914363312, -0.2183623175004815, -0... \n", - "1 8844.0 [0.013404150790652358, -0.1920666231028718, -0... \n", - "2 15602.0 [0.17041991275371088, -0.14362645391937717, -0... \n", - "3 31357.0 [0.029246177676017816, -0.19591132539475606, -... \n", - "4 11862.0 [-0.03755917677168938, -0.17405036529466641, 0... " + "0 862.0 [0.16217072665688012, 0.245026260806211, -0.14... \n", + "1 8844.0 [-0.0495065883180616, 0.017243236163025016, -0... \n", + "2 15602.0 [0.07067590986084793, 0.20963299716890343, 0.2... \n", + "3 31357.0 [-0.023481240586441465, 0.1194581665494643, -0... \n", + "4 11862.0 [0.07510781660794685, 0.19069717883675757, -0.... 
" ] }, - "execution_count": 22, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -970,33 +970,33 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:9a77231d27154ea1a678907d8e2c31ee', 'vector_distance': '-3.15922021866', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:51a38fb8f13f4726a8019a8d66f2b05b', 'vector_distance': '-3.15213918686', 'title': 'Cool Hand Luke', 'genres': '[\"Crime\",\"Drama\"]'}\n", - "{'id': 'movie:2e487266815945b0b8b857a848292e3e', 'vector_distance': '-3.07703495026', 'title': 'The Shawshank Redemption', 'genres': '[\"Drama\",\"Crime\"]'}\n", - "{'id': 'movie:c048199f45d340e282e37ff1c54089ca', 'vector_distance': '-3.04389858246', 'title': 'Lock, Stock and Two Smoking Barrels', 'genres': '[\"Comedy\",\"Crime\"]'}\n", - "{'id': 'movie:0996165a33924a79beb757bcefc17ed3', 'vector_distance': '-3.03677082062', 'title': 'Return of the Jedi', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:00e1ad78f47745b6a0e41b101eb98ff0', 'vector_distance': '-3.01881790161', 'title': 'In the Line of Fire', 'genres': '[\"Action\",\"Drama\",\"Thriller\",\"Crime\",\"Mystery\"]'}\n", - "{'id': 'movie:8fd381ca8404447caf53b63a901c69eb', 'vector_distance': '-3.01792764664', 'title': \"Mr. 
Holland's Opus\", 'genres': '[\"Music\",\"Drama\",\"Family\"]'}\n", - "{'id': 'movie:139f8da6282541a3a5881c7c457dd5ea', 'vector_distance': '-3.00384759903', 'title': 'Lifeboat', 'genres': '[\"Drama\",\"War\"]'}\n", - "{'id': 'movie:600e4f66decf454587888263b3fd7c6c', 'vector_distance': '-3.00213766098', 'title': 'Fargo', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:657859724d944786afa80a74a8ba7590', 'vector_distance': '-3.00026941299', 'title': 'Crumb', 'genres': '[\"Documentary\"]'}\n", - "{'id': 'movie:a8a6ce2be2e547d7be27cfcd3c4b412e', 'vector_distance': '-2.98649430275', 'title': 'Much Ado About Nothing', 'genres': '[\"Drama\",\"Comedy\",\"Romance\"]'}\n", - "{'id': 'movie:b77ab676009448b8b31f05cfad877d78', 'vector_distance': '-2.96709799767', 'title': 'Dead Man Walking', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:203d662a71db49b7afc86e101ed3b61c', 'vector_distance': '-2.96438765526', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:4355e3c0948943db8b9c1774e3c616a2', 'vector_distance': '-2.94901204109', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:81dc7920656a4d389767fd13c2cb24b3', 'vector_distance': '-2.92887830734', 'title': 'Sunset Boulevard', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:42d61bf4d1fc42a498fdb2985cd402dd', 'vector_distance': '-2.92803192139', 'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:0673b1bb33a144b78397fff8939ec758', 'vector_distance': '-2.92664003372', 'title': 'Eat Drink Man Woman', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:8bbac427a25643d48a0582a411f2ce71', 'vector_distance': '-2.92609977722', 'title': \"Ocean's Eleven\", 'genres': '[\"Thriller\",\"Crime\"]'}\n", - "{'id': 'movie:ead41675cf414eca86c3e995f398b09e', 'vector_distance': '-2.92416906357', 'title': 'Three Colors: Red', 'genres': 
'[\"Drama\",\"Mystery\",\"Romance\"]'}\n", - "{'id': 'movie:9843a33a6dd44f46ad475808cebb06fe', 'vector_distance': '-2.91586065292', 'title': 'Ponyo', 'genres': '[\"Animation\",\"Family\"]'}\n" + "{'id': 'movie:b5b8331ab3044a35bed03e7208dd7079', 'vector_distance': '-3.64372396469', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:a6e4d98e9bd34503aa0ece6924cdb4c9', 'vector_distance': '-3.59878540039', 'title': 'The Dark Knight', 'genres': '[\"Drama\",\"Action\",\"Crime\",\"Thriller\"]'}\n", + "{'id': 'movie:51e5e1fbd6d940d894122e98fc638e85', 'vector_distance': '-3.59825658798', 'title': '12 Angry Men', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:876f2cc13807471ba1dbaedbd92303c8', 'vector_distance': '-3.59230089188', 'title': 'Leon: The Professional', 'genres': '[\"Thriller\",\"Crime\",\"Drama\"]'}\n", + "{'id': 'movie:05a21bd00b8b4ed6938fc52cb532a601', 'vector_distance': '-3.54890108109', 'title': 'The Matrix', 'genres': '[\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:4644b98a4a21435981928b2d2871089d', 'vector_distance': '-3.4690322876', 'title': 'Band of Brothers', 'genres': '[\"Action\",\"Drama\",\"War\"]'}\n", + "{'id': 'movie:b096f81674a24f00a24cd9289d777913', 'vector_distance': '-3.46432924271', 'title': 'Memento', 'genres': '[\"Mystery\",\"Thriller\"]'}\n", + "{'id': 'movie:6861752bd166469aa89ab1ca69950ee8', 'vector_distance': '-3.4593539238', 'title': 'The Princess Bride', 'genres': '[\"Adventure\",\"Family\",\"Fantasy\",\"Comedy\",\"Romance\"]'}\n", + "{'id': 'movie:44911da20d8e4ad3a93a8452a3438feb', 'vector_distance': '-3.44543361664', 'title': 'American History X', 'genres': '[\"Drama\"]'}\n", + "{'id': 'movie:6f04f57431894700b9d23b60ebf8fac0', 'vector_distance': '-3.44274091721', 'title': 'Interstellar', 'genres': '[\"Adventure\",\"Drama\",\"Science Fiction\"]'}\n", + "{'id': 'movie:b2f480a4c97d404d8a5caa248b59a0e3', 'vector_distance': '-3.43494272232', 
'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:367a0ad625564683b8818edc82b3878b', 'vector_distance': '-3.42747116089', 'title': 'The Prestige', 'genres': '[\"Drama\",\"Mystery\",\"Thriller\"]'}\n", + "{'id': 'movie:ec5ccd6e24fa470eae864f1ed3f7c566', 'vector_distance': '-3.42468452454', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", + "{'id': 'movie:415b17e8f1a64ec0b7819068ae0ebc2d', 'vector_distance': '-3.4210562706', 'title': 'Happiness', 'genres': '[\"Comedy\",\"Drama\"]'}\n", + "{'id': 'movie:81802cdb06684b7fb3c26e754cb0bc50', 'vector_distance': '-3.41307687759', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:ce087387efd84691bab6a4228289ca47', 'vector_distance': '-3.40901231766', 'title': 'Thank You for Smoking', 'genres': '[\"Comedy\",\"Drama\"]'}\n", + "{'id': 'movie:846793c4d43e4f3a9c7536682e169789', 'vector_distance': '-3.38840723038', 'title': 'A Close Shave', 'genres': '[\"Family\",\"Animation\",\"Comedy\"]'}\n", + "{'id': 'movie:8e370a42277a43859a19cef8223549b2', 'vector_distance': '-3.38426446915', 'title': 'Up', 'genres': '[\"Animation\",\"Comedy\",\"Family\",\"Adventure\"]'}\n", + "{'id': 'movie:01c1de9dd23046f1ba630daf295b91a1', 'vector_distance': '-3.36946439743', 'title': 'Sin City', 'genres': '[\"Action\",\"Thriller\",\"Crime\"]'}\n", + "{'id': 'movie:a60d16b71e874eb99fad5461dc48b034', 'vector_distance': '-3.363966465', 'title': 'The Departed', 'genres': '[\"Drama\",\"Thriller\",\"Crime\"]'}\n" ] } ], @@ -1032,7 +1032,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -1071,7 +1071,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1106,92 +1106,92 @@ " \n", " \n", " 0\n", - " The Graduate\n", - " The Graduate\n", - " The Graduate\n", - " Pulp Fiction\n", - " All About Eve\n", - " The 
Grapes of Wrath\n", + " The Professional\n", + " One Flew Over the Cuckoo's Nest\n", + " The Professional\n", + " The Dark Knight\n", + " The Professional\n", + " A Clockwork Orange\n", " \n", " \n", " 1\n", - " Das Boot\n", - " Das Boot\n", - " Das Boot\n", + " One Flew Over the Cuckoo's Nest\n", + " The Godfather\n", + " One Flew Over the Cuckoo's Nest\n", " The Shawshank Redemption\n", - " The Postman\n", - " What's Eating Gilbert Grape\n", + " Seven Samurai\n", + " Pineapple Express\n", " \n", " \n", " 2\n", - " Amadeus\n", - " Amadeus\n", - " Amadeus\n", - " Gone Girl\n", - " Bicycle Thieves\n", - " A Clockwork Orange\n", + " The Godfather\n", + " Leon: The Professional\n", + " The Godfather\n", + " Pulp Fiction\n", + " Akira\n", + " James and the Giant Peach\n", " \n", " \n", " 3\n", - " Fargo\n", - " Fargo\n", - " Dr. Strangelove or: How I Learned to Stop Worr...\n", - " Dawn of the Planet of the Apes\n", - " My Neighbor Totoro\n", - " Bananas\n", + " Leon: The Professional\n", + " The Godfather: Part II\n", + " The Godfather: Part II\n", + " Fight Club\n", + " The Postman\n", + " The Grapes of Wrath\n", " \n", " \n", " 4\n", - " Dr. 
Strangelove or: How I Learned to Stop Worr...\n", - " Shakespeare in Love\n", - " Cinema Paradiso\n", - " Fight Club\n", - " The Wild Bunch\n", - " Pineapple Express\n", + " The Godfather: Part II\n", + " The Dark Knight\n", + " A Clockwork Orange\n", + " Big Hero 6\n", + " Cube\n", + " Bananas\n", " \n", " \n", " 5\n", - " Cinema Paradiso\n", - " The Last Emperor\n", - " Take the Money and Run\n", + " A Clockwork Orange\n", + " Fargo\n", + " The African Queen\n", " Blade Runner\n", - " M\n", - " James and the Giant Peach\n", + " Castle in the Sky\n", + " What's Eating Gilbert Grape\n", " \n", " \n", " 6\n", - " Take the Money and Run\n", - " The Color Purple\n", - " The Last Emperor\n", - " Whiplash\n", - " Rebel Without a Cause\n", + " Let the Right One In\n", + " The Graduate\n", + " Cool Hand Luke\n", + " Gone Girl\n", + " M\n", " The Apple Dumpling Gang\n", " \n", " \n", " 7\n", - " Shakespeare in Love\n", - " Manhattan\n", - " Raging Bull\n", - " Big Hero 6\n", - " Withnail & I\n", - " Adam's Apples\n", + " The African Queen\n", + " Eternal Sunshine of the Spotless Mind\n", + " The Graduate\n", + " Whiplash\n", + " Once Upon a Time in America\n", + " Soldier of Orange\n", " \n", " \n", " 8\n", - " The Last Emperor\n", - " Annie Hall\n", - " The Color Purple\n", - " Guardians of the Galaxy\n", - " Meet John Doe\n", + " Cool Hand Luke\n", + " Inception\n", + " 12 Angry Men\n", + " The Avengers\n", + " All Quiet on the Western Front\n", " Orange County\n", " \n", " \n", " 9\n", - " Raging Bull\n", - " The Piano\n", - " North by Northwest\n", - " Captain America: Civil War\n", - " Once Upon a Time in America\n", + " The Dark Knight\n", + " Boyz n the Hood\n", + " The Treasure of the Sierra Madre\n", + " Guardians of the Galaxy\n", + " Aguirre: The Wrath of God\n", " Herbie Goes Bananas\n", " \n", " \n", @@ -1199,56 +1199,44 @@ "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Graduate The Graduate \n", - "1 Das Boot Das Boot \n", - "2 Amadeus 
Amadeus \n", - "3 Fargo Fargo \n", - "4 Dr. Strangelove or: How I Learned to Stop Worr... Shakespeare in Love \n", - "5 Cinema Paradiso The Last Emperor \n", - "6 Take the Money and Run The Color Purple \n", - "7 Shakespeare in Love Manhattan \n", - "8 The Last Emperor Annie Hall \n", - "9 Raging Bull The Piano \n", - "\n", - " classics \\\n", - "0 The Graduate \n", - "1 Das Boot \n", - "2 Amadeus \n", - "3 Dr. Strangelove or: How I Learned to Stop Worr... \n", - "4 Cinema Paradiso \n", - "5 Take the Money and Run \n", - "6 The Last Emperor \n", - "7 Raging Bull \n", - "8 The Color Purple \n", - "9 North by Northwest \n", + " top picks block busters \\\n", + "0 The Professional One Flew Over the Cuckoo's Nest \n", + "1 One Flew Over the Cuckoo's Nest The Godfather \n", + "2 The Godfather Leon: The Professional \n", + "3 Leon: The Professional The Godfather: Part II \n", + "4 The Godfather: Part II The Dark Knight \n", + "5 A Clockwork Orange Fargo \n", + "6 Let the Right One In The Graduate \n", + "7 The African Queen Eternal Sunshine of the Spotless Mind \n", + "8 Cool Hand Luke Inception \n", + "9 The Dark Knight Boyz n the Hood \n", "\n", - " what's popular indie hits \\\n", - "0 Pulp Fiction All About Eve \n", - "1 The Shawshank Redemption The Postman \n", - "2 Gone Girl Bicycle Thieves \n", - "3 Dawn of the Planet of the Apes My Neighbor Totoro \n", - "4 Fight Club The Wild Bunch \n", - "5 Blade Runner M \n", - "6 Whiplash Rebel Without a Cause \n", - "7 Big Hero 6 Withnail & I \n", - "8 Guardians of the Galaxy Meet John Doe \n", - "9 Captain America: Civil War Once Upon a Time in America \n", + " classics what's popular \\\n", + "0 The Professional The Dark Knight \n", + "1 One Flew Over the Cuckoo's Nest The Shawshank Redemption \n", + "2 The Godfather Pulp Fiction \n", + "3 The Godfather: Part II Fight Club \n", + "4 A Clockwork Orange Big Hero 6 \n", + "5 The African Queen Blade Runner \n", + "6 Cool Hand Luke Gone Girl \n", + "7 The Graduate Whiplash \n", 
+ "8 12 Angry Men The Avengers \n", + "9 The Treasure of the Sierra Madre Guardians of the Galaxy \n", "\n", - " fruity films \n", - "0 The Grapes of Wrath \n", - "1 What's Eating Gilbert Grape \n", - "2 A Clockwork Orange \n", - "3 Bananas \n", - "4 Pineapple Express \n", - "5 James and the Giant Peach \n", - "6 The Apple Dumpling Gang \n", - "7 Adam's Apples \n", - "8 Orange County \n", - "9 Herbie Goes Bananas " + " indie hits fruity films \n", + "0 The Professional A Clockwork Orange \n", + "1 Seven Samurai Pineapple Express \n", + "2 Akira James and the Giant Peach \n", + "3 The Postman The Grapes of Wrath \n", + "4 Cube Bananas \n", + "5 Castle in the Sky What's Eating Gilbert Grape \n", + "6 M The Apple Dumpling Gang \n", + "7 Once Upon a Time in America Soldier of Orange \n", + "8 All Quiet on the Western Front Orange County \n", + "9 Aguirre: The Wrath of God Herbie Goes Bananas " ] }, - "execution_count": 25, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1271,95 +1259,250 @@ "metadata": {}, "source": [ "## Keeping Things Fresh\n", - "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. And it's more that likely that at least some of the recommendations we're expecting to be highly rated by a given user is one they've already watched and rated highly.\n", + "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. 
And it's more that likely that at least some of the recommendations we're expecting to be highly rated by a given user are ones they've already watched and rated highly.\n", "\n", - "Luckily Redis offers an easy anwer to keeping recommendations new and interesting, and that answer is Bloom Filters." + "Luckily Redis offers an easy answer to keeping recommendations new and interesting, and that answer is Bloom Filters." ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 19, "metadata": {}, "outputs": [ { - "ename": "AttributeError", - "evalue": "'bool' object has no attribute 'add'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[34], line 40\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mfilter\u001b[39m \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mbf()\u001b[38;5;241m.\u001b[39mcreate(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124muser_watched_list:\u001b[39m\u001b[38;5;132;01m{user_id}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;241m0.01\u001b[39m, \u001b[38;5;241m1000\u001b[39m)\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m movie_id \u001b[38;5;129;01min\u001b[39;00m watched_movies:\n\u001b[0;32m---> 40\u001b[0m \u001b[38;5;28;43mfilter\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43madd\u001b[49m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00muser_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmovie_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 42\u001b[0m Top_picks_for_you \u001b[38;5;241m=\u001b[39m get_unique_recommendations(user_id\u001b[38;5;241m=\u001b[39muser_id) \u001b[38;5;66;03m# general SVD results, no filter\u001b[39;00m\n\u001b[1;32m 43\u001b[0m block_buster_hits 
\u001b[38;5;241m=\u001b[39m get_unique_recommendations(user_id\u001b[38;5;241m=\u001b[39muser_id, filters\u001b[38;5;241m=\u001b[39mblock_buster_filter)\n", - "\u001b[0;31mAttributeError\u001b[0m: 'bool' object has no attribute 'add'" + "name": "stdout", + "output_type": "stream", + "text": [ + "BFCommands.exists() missing 1 required positional argument: 'item'\n" ] } ], "source": [ - "\n", - "# create a bloom filter for a given user and add their watched list to it\n", - "def create_bloom_filter(user_id, watched_movies):\n", - " if not client.bf().exists(f\"user_watched_list\"):\n", - " filter = client.bf().create(f\"user_watched_list\", 0.01, 1000)\n", - " for movie_id in watched_movies:\n", - " client.bf().add(f\"user_watched_list\", f\"{user_id}:{movie_id}\")\n", - " return filter\n", - "\n", "# rewrite the get_recommendations() function to use a bloom filter and apply it before we return results\n", "def get_unique_recommendations(user_id, filters=None, num_results=10):\n", " user_vector = user_vectors[user_id].tolist()\n", - " bloom_filter_name = f\"user:{user_id}:watched\"\n", + " watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", + "\n", + " client.bf().insert('user_watched_list', [f\"{user_id}:{movie_id}\" for movie_id in watched_movies])\n", "\n", " query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", " num_results=num_results * 2, # fetch more results to filter out watched movies\n", " filter_expression=filters,\n", - " #return_fields=['title', 'overview', 'genres', 'movie_id'])\n", - " return_fields=['title', 'movieId'])\n", - "\n", + " return_fields=['title', '$.movie_id', '$.movieId', 'imdb_id', 'imdbId','overview', 'genres'],\n", + " ) # TODO figure out why i need to add '$.' 
to some fields, but not others\n", " results = movie_index.query(query)\n", "\n", " # filter out movies that the user has already watched\n", " recommendations = []\n", " for r in results:\n", - " print(r)\n", - " if not bloom_client.bfExists(bloom_filter_name, r['movieId']):\n", + " if not client.bf().exists('user_watched_list', r['$.movieId']):\n", " recommendations.append((r['title'], r['overview'], r['genres'], r['vector_distance']))\n", " if len(recommendations) >= num_results:\n", " break\n", - "\n", " return recommendations\n", "\n", "# example usage\n", - "user_id = 42\n", - "watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", + "# create a bloom filter for this user\n", + "try:\n", + " client.bf().exists(f\"user_watched_list\")\n", + "except Exception as e:\n", + " print(e)\n", + " pass\n", "\n", - "filter = client.bf().create('user_watched_list:{user_id}', 0.01, 1000)\n", - "for movie_id in watched_movies:\n", - " filter.add(f'{user_id}:{movie_id}')\n", + "user_id = 42\n", "\n", - "Top_picks_for_you = get_unique_recommendations(user_id=user_id) # general SVD results, no filter\n", + "top_picks_for_you = get_unique_recommendations(user_id=user_id) # general SVD results, no filter\n", "block_buster_hits = get_unique_recommendations(user_id=user_id, filters=block_buster_filter)\n", "classics = get_unique_recommendations(user_id=user_id, filters=classics_filter)\n", - "Whats_popular = get_unique_recommendations(user_id=user_id, filters=popular_filter)\n", + "whats_popular = get_unique_recommendations(user_id=user_id, filters=popular_filter)\n", "indie_hits = get_unique_recommendations(user_id=user_id, filters=indie_filter)\n", "fruity_films = get_unique_recommendations(user_id=user_id, filters=fruity)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "vscode": { "languageId": "ruby" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
top picksblock bustersclassicswhat's popularindie hitsfruity films
0The ProfessionalOne Flew Over the Cuckoo's NestThe ProfessionalThe Dark KnightThe ProfessionalA Clockwork Orange
1One Flew Over the Cuckoo's NestThe GodfatherOne Flew Over the Cuckoo's NestThe Shawshank RedemptionSeven SamuraiPineapple Express
2The GodfatherLeon: The ProfessionalThe GodfatherPulp FictionAkiraJames and the Giant Peach
3Leon: The ProfessionalThe Godfather: Part IIThe Godfather: Part IIFight ClubThe PostmanThe Grapes of Wrath
4The Godfather: Part IIThe Dark KnightA Clockwork OrangeBig Hero 6CubeBananas
5A Clockwork OrangeFargoThe African QueenBlade RunnerCastle in the SkyWhat's Eating Gilbert Grape
6Let the Right One InThe GraduateCool Hand LukeGone GirlMThe Apple Dumpling Gang
7The African QueenEternal Sunshine of the Spotless MindThe GraduateWhiplashOnce Upon a Time in AmericaSoldier of Orange
8Cool Hand LukeInception12 Angry MenThe AvengersAll Quiet on the Western FrontOrange County
9The Dark KnightBoyz n the HoodThe Treasure of the Sierra MadreGuardians of the GalaxyAguirre: The Wrath of GodHerbie Goes Bananas
\n", + "
" + ], + "text/plain": [ + " top picks block busters \\\n", + "0 The Professional One Flew Over the Cuckoo's Nest \n", + "1 One Flew Over the Cuckoo's Nest The Godfather \n", + "2 The Godfather Leon: The Professional \n", + "3 Leon: The Professional The Godfather: Part II \n", + "4 The Godfather: Part II The Dark Knight \n", + "5 A Clockwork Orange Fargo \n", + "6 Let the Right One In The Graduate \n", + "7 The African Queen Eternal Sunshine of the Spotless Mind \n", + "8 Cool Hand Luke Inception \n", + "9 The Dark Knight Boyz n the Hood \n", + "\n", + " classics what's popular \\\n", + "0 The Professional The Dark Knight \n", + "1 One Flew Over the Cuckoo's Nest The Shawshank Redemption \n", + "2 The Godfather Pulp Fiction \n", + "3 The Godfather: Part II Fight Club \n", + "4 A Clockwork Orange Big Hero 6 \n", + "5 The African Queen Blade Runner \n", + "6 Cool Hand Luke Gone Girl \n", + "7 The Graduate Whiplash \n", + "8 12 Angry Men The Avengers \n", + "9 The Treasure of the Sierra Madre Guardians of the Galaxy \n", + "\n", + " indie hits fruity films \n", + "0 The Professional A Clockwork Orange \n", + "1 Seven Samurai Pineapple Express \n", + "2 Akira James and the Giant Peach \n", + "3 The Postman The Grapes of Wrath \n", + "4 Cube Bananas \n", + "5 Castle in the Sky What's Eating Gilbert Grape \n", + "6 M The Apple Dumpling Gang \n", + "7 Once Upon a Time in America Soldier of Orange \n", + "8 All Quiet on the Western Front Orange County \n", + "9 Aguirre: The Wrath of God Herbie Goes Bananas " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# put all these titles into a single pandas dataframe , where each column is one category\n", "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\", \"fruity films\"])\n", - "all_recommendations[\"top picks\"] = [m[0] for m in Top_picks_for_you]\n", + "all_recommendations[\"top picks\"] = 
[m[0] for m in top_picks_for_you]\n", "all_recommendations[\"block busters\"] = [m[0] for m in block_buster_hits]\n", "all_recommendations[\"classics\"] = [m[0] for m in classics]\n", - "all_recommendations[\"what's popular\"] = [m[0] for m in Whats_popular]\n", + "all_recommendations[\"what's popular\"] = [m[0] for m in whats_popular]\n", "all_recommendations[\"indie hits\"] = [m[0] for m in indie_hits]\n", "all_recommendations[\"fruity films\"] = [m[0] for m in fruity_films]\n", "\n", @@ -1376,14 +1519,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4351 keys\n", + "Deleted 4348 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n", @@ -1396,7 +1539,7 @@ "1" ] }, - "execution_count": 27, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } From a36f4bbb185d25288e2051daf987c19c167a823c Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Mon, 14 Oct 2024 15:11:39 -0700 Subject: [PATCH 06/12] cleans up schema and notebook cells --- .../collaborative_filtering.ipynb | 714 ++++++++++-------- .../collaborative_filtering_schema.yaml | 4 +- .../recommendation-systems/user_schema.yaml | 20 - 3 files changed, 398 insertions(+), 340 deletions(-) delete mode 100644 python-recipes/recommendation-systems/user_schema.yaml diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 99044c02..afa93fe4 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -93,9 +93,7 @@ "metadata": {}, "outputs": [], "source": [ - "ratings_file = 'ratings_small.csv'\n", - "\n", - "ratings_df = fetch_dataframe(ratings_file)\n", + "ratings_df = fetch_dataframe('ratings_small.csv') # for a larger example use 'ratings.csv' 
instead\n", "\n", "# only keep the columns we need: userId, movieId, rating\n", "ratings_df = ratings_df[['userId', 'movieId', 'rating']]\n", @@ -164,9 +162,9 @@ "\n", "| movie_1_feature_1 | movie_2_feature_1 | movie_3_feature_1 | ... | movie_M_feature_1 |\n", "| --- | --- | --- | --- | --- |\n", - "| movie_1_feature_2 | movie_2_feature_2 | movie_3_feature_2 | ... | movie_M_feature_1 |\n", - "| movie_1_feature_3 | movie_2_feature_3 | movie_3_feature_3 | ... | movie_M_feature_1 |\n", - "| movie_1_feature_4 | movie_2_feature_4 | movie_3_feature_4 | ... | movie_M_feature_1 |\n", + "| movie_1_feature_2 | movie_2_feature_2 | movie_3_feature_2 | ... | movie_M_feature_2 |\n", + "| movie_1_feature_3 | movie_2_feature_3 | movie_3_feature_3 | ... | movie_M_feature_3 |\n", + "| movie_1_feature_4 | movie_2_feature_4 | movie_3_feature_4 | ... | movie_M_feature_4 |\n", "| ... | . | . | ... | . |\n", "| movie_1_feature_k | movie_2_feature_k | movie_3_feature_k | ... | movie_M_feature_k |\n", "\n", @@ -182,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -195,7 +193,7 @@ "train_set, test_set = train_test_split(ratings_data, test_size=0.2)\n", "\n", "# use SVD (Singular Value Decomposition) for collaborative filtering\n", - "svd = SVD(n_factors=100, biased=False) # We'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", + "svd = SVD(n_factors=100, biased=False) # we'll set biased to False so that predictions are of the form \"rating_prediction = user_vector dot item_vector\"\n", "\n", "# train the algorithm on the train_set\n", "svd.fit(train_set)" @@ -232,7 +230,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8393 movies with feature vectors of size 100\n" + "we have 8376 movies with feature vectors of size 100\n" ] } ], @@ -265,7 +263,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on 
movie 5515 is 0.965787539953316\n" + "the predicted rating of user 347 on movie 5515 is 1.2554222750662518\n" ] } ], @@ -279,29 +277,6 @@ "print(f'the predicted rating of user {347} on movie {5515} is {predicted_rating}')" ] }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "user: 347 item: 5515 r_ui = None est = 0.97 {'was_impossible': False}\n", - "0.965787539953316\n" - ] - } - ], - "source": [ - "# sanity check my math matches Surprise package math\n", - "print(svd.predict(347, 5515))\n", - "\n", - "inner_uid = train_set.to_inner_uid(347)\n", - "inner_iid = train_set.to_inner_iid(5515)\n", - "print(np.dot(user_vectors[inner_uid], movie_vectors[inner_iid])) # surprise casts userId and movieId to inner ids" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -635,8 +610,8 @@ "metadata": {}, "outputs": [], "source": [ + "links_df = fetch_dataframe('links_small.csv') # for a larger example use 'links.csv' instead\n", "\n", - "links_df = fetch_dataframe('links_small.csv')\n", "movies_df = movies_df.merge(links_df, left_on='imdb_id', right_on='imdbId', how='inner')" ] }, @@ -651,7 +626,213 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
budgetgenresidimdb_idoriginal_languageoverviewpopularityrelease_daterevenueruntimestatustaglinetitlevote_averagevote_countmovieIdimdbIdtmdbIdmovie_vector
030000000[Animation, Comedy, Family]862114709enLed by Woody, Andy's toys live happily in his ...21.946943815040000.037355403381.0ReleasedToy Story7.754151114709862.0[-0.09139158006123944, 0.3113782797006747, -0....
165000000[Adventure, Fantasy, Family]8844113497enWhen siblings Judy and Peter discover an encha...17.015539819014400.0262797249104.0ReleasedRoll the dice and unleash the excitement!Jumanji6.9241321134978844.0[-0.5145776514053282, 0.18805717045856102, 0.0...
20[Romance, Comedy]15602113228enA family wedding reignites the ancient feud be...11.712900819619200.00101.0ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old Men6.592311322815602.0[-0.09342489820078766, 0.1563727417086737, -0....
316000000[Comedy, Drama, Romance]31357114885enCheated on, mistreated and stepped on, the wom...3.859495819619200.081452156127.0ReleasedFriends are the people who let you be yourself...Waiting to Exhale6.134411488531357.0[-0.033617228695296826, 0.20003386580703916, 0...
40[Comedy]11862113041enJust when George Banks has recovered from his ...8.387519792403200.076578911106.0ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part II5.7173511304111862.0[0.03270775039139693, 0.16435040013526048, 0.0...
\n", + "
" + ], + "text/plain": [ + " budget genres id imdb_id original_language \\\n", + "0 30000000 [Animation, Comedy, Family] 862 114709 en \n", + "1 65000000 [Adventure, Fantasy, Family] 8844 113497 en \n", + "2 0 [Romance, Comedy] 15602 113228 en \n", + "3 16000000 [Comedy, Drama, Romance] 31357 114885 en \n", + "4 0 [Comedy] 11862 113041 en \n", + "\n", + " overview popularity \\\n", + "0 Led by Woody, Andy's toys live happily in his ... 21.946943 \n", + "1 When siblings Judy and Peter discover an encha... 17.015539 \n", + "2 A family wedding reignites the ancient feud be... 11.712900 \n", + "3 Cheated on, mistreated and stepped on, the wom... 3.859495 \n", + "4 Just when George Banks has recovered from his ... 8.387519 \n", + "\n", + " release_date revenue runtime status \\\n", + "0 815040000.0 373554033 81.0 Released \n", + "1 819014400.0 262797249 104.0 Released \n", + "2 819619200.0 0 101.0 Released \n", + "3 819619200.0 81452156 127.0 Released \n", + "4 792403200.0 76578911 106.0 Released \n", + "\n", + " tagline \\\n", + "0 \n", + "1 Roll the dice and unleash the excitement! \n", + "2 Still Yelling. Still Fighting. Still Ready for... \n", + "3 Friends are the people who let you be yourself... \n", + "4 Just When His World Is Back To Normal... He's ... \n", + "\n", + " title vote_average vote_count movieId imdbId \\\n", + "0 Toy Story 7.7 5415 1 114709 \n", + "1 Jumanji 6.9 2413 2 113497 \n", + "2 Grumpier Old Men 6.5 92 3 113228 \n", + "3 Waiting to Exhale 6.1 34 4 114885 \n", + "4 Father of the Bride Part II 5.7 173 5 113041 \n", + "\n", + " tmdbId movie_vector \n", + "0 862.0 [-0.09139158006123944, 0.3113782797006747, -0.... \n", + "1 8844.0 [-0.5145776514053282, 0.18805717045856102, 0.0... \n", + "2 15602.0 [-0.09342489820078766, 0.1563727417086737, -0.... \n", + "3 31357.0 [-0.033617228695296826, 0.20003386580703916, 0... \n", + "4 11862.0 [0.03270775039139693, 0.16435040013526048, 0.0... 
" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# build a dataframe out of the user vectors and their userIds\n", "user_vectors_and_ids = {train_set.to_raw_uid(inner_id): user_vectors[inner_id].tolist() for inner_id in train_set.all_users()}\n", @@ -662,7 +843,9 @@ "movie_vector_df = pd.Series(movie_vectors_and_ids).to_frame('movie_vector')\n", "\n", "# merge the movie vector series with the movies dataframe using the movieId and id fields\n", - "movies_df = movies_df.merge(movie_vector_df, left_on='movieId', right_index=True, how='inner')\n" + "movies_df = movies_df.merge(movie_vector_df, left_on='movieId', right_index=True, how='inner')\n", + "movies_df['movieId'] = movies_df['movieId'].apply(lambda x: str(x)) # need to cast to a string as this is a tag field in our search schema\n", + "movies_df.head()" ] }, { @@ -684,8 +867,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "12:30:31 redisvl.index.index INFO Index already exists, overwriting.\n", - "12:30:31 redisvl.index.index INFO Index already exists, overwriting.\n" + "15:08:18 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -699,12 +881,7 @@ "movie_schema = IndexSchema.from_yaml(\"collaborative_filtering_schema.yaml\")\n", "\n", "movie_index = SearchIndex(movie_schema, redis_client=client)\n", - "movie_index.create(overwrite=True, drop=True)\n", - "\n", - "user_schema = IndexSchema.from_yaml(\"user_schema.yaml\")\n", - "\n", - "user_index = SearchIndex(user_schema, redis_client=client)\n", - "user_index.create(overwrite=True, drop=True)" + "movie_index.create(overwrite=True, drop=True)" ] }, { @@ -725,10 +902,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8348\n", - "size of movie df 8348\n", - "unique movie ids 8342\n", - "unique movie titles 8109\n", + "number of movies 8334\n", + "size of movie df 8334\n", + "unique movie ids 8327\n", + "unique movie titles 8082\n", "unique 
movies rated 9065\n" ] }, @@ -795,7 +972,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [0.16217072665688012, 0.245026260806211, -0.14...\n", + " [-0.09139158006123944, 0.3113782797006747, -0....\n", " \n", " \n", " 1\n", @@ -817,7 +994,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.0495065883180616, 0.017243236163025016, -0...\n", + " [-0.5145776514053282, 0.18805717045856102, 0.0...\n", " \n", " \n", " 2\n", @@ -839,7 +1016,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [0.07067590986084793, 0.20963299716890343, 0.2...\n", + " [-0.09342489820078766, 0.1563727417086737, -0....\n", " \n", " \n", " 3\n", @@ -861,7 +1038,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [-0.023481240586441465, 0.1194581665494643, -0...\n", + " [-0.033617228695296826, 0.20003386580703916, 0...\n", " \n", " \n", " 4\n", @@ -883,7 +1060,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [0.07510781660794685, 0.19069717883675757, -0....\n", + " [0.03270775039139693, 0.16435040013526048, 0.0...\n", " \n", " \n", "\n", @@ -918,19 +1095,19 @@ "3 Friends are the people who let you be yourself... \n", "4 Just When His World Is Back To Normal... He's ... \n", "\n", - " title vote_average vote_count movieId imdbId \\\n", - "0 Toy Story 7.7 5415 1 114709 \n", - "1 Jumanji 6.9 2413 2 113497 \n", - "2 Grumpier Old Men 6.5 92 3 113228 \n", - "3 Waiting to Exhale 6.1 34 4 114885 \n", - "4 Father of the Bride Part II 5.7 173 5 113041 \n", + " title vote_average vote_count movieId imdbId \\\n", + "0 Toy Story 7.7 5415 1 114709 \n", + "1 Jumanji 6.9 2413 2 113497 \n", + "2 Grumpier Old Men 6.5 92 3 113228 \n", + "3 Waiting to Exhale 6.1 34 4 114885 \n", + "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [0.16217072665688012, 0.245026260806211, -0.14... \n", - "1 8844.0 [-0.0495065883180616, 0.017243236163025016, -0... \n", - "2 15602.0 [0.07067590986084793, 0.20963299716890343, 0.2... \n", - "3 31357.0 [-0.023481240586441465, 0.1194581665494643, -0... 
\n", - "4 11862.0 [0.07510781660794685, 0.19069717883675757, -0.... " + "0 862.0 [-0.09139158006123944, 0.3113782797006747, -0.... \n", + "1 8844.0 [-0.5145776514053282, 0.18805717045856102, 0.0... \n", + "2 15602.0 [-0.09342489820078766, 0.1563727417086737, -0.... \n", + "3 31357.0 [-0.033617228695296826, 0.20003386580703916, 0... \n", + "4 11862.0 [0.03270775039139693, 0.16435040013526048, 0.0... " ] }, "execution_count": 15, @@ -939,7 +1116,7 @@ } ], "source": [ - "# sanity check we merged all my dataframes properly and have the right sizes of moives, users, vectors, ids, etc.\n", + "# sanity check we merged all dataframes properly and have the right sizes of movies, users, vectors, ids, etc.\n", "number_of_movies = len(movies_df.to_dict(orient='records'))\n", "size_of_movie_df = movies_df.shape[0]\n", "\n", @@ -977,26 +1154,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:b5b8331ab3044a35bed03e7208dd7079', 'vector_distance': '-3.64372396469', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:a6e4d98e9bd34503aa0ece6924cdb4c9', 'vector_distance': '-3.59878540039', 'title': 'The Dark Knight', 'genres': '[\"Drama\",\"Action\",\"Crime\",\"Thriller\"]'}\n", - "{'id': 'movie:51e5e1fbd6d940d894122e98fc638e85', 'vector_distance': '-3.59825658798', 'title': '12 Angry Men', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:876f2cc13807471ba1dbaedbd92303c8', 'vector_distance': '-3.59230089188', 'title': 'Leon: The Professional', 'genres': '[\"Thriller\",\"Crime\",\"Drama\"]'}\n", - "{'id': 'movie:05a21bd00b8b4ed6938fc52cb532a601', 'vector_distance': '-3.54890108109', 'title': 'The Matrix', 'genres': '[\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:4644b98a4a21435981928b2d2871089d', 'vector_distance': '-3.4690322876', 'title': 'Band of Brothers', 'genres': '[\"Action\",\"Drama\",\"War\"]'}\n", - "{'id': 'movie:b096f81674a24f00a24cd9289d777913', 
'vector_distance': '-3.46432924271', 'title': 'Memento', 'genres': '[\"Mystery\",\"Thriller\"]'}\n", - "{'id': 'movie:6861752bd166469aa89ab1ca69950ee8', 'vector_distance': '-3.4593539238', 'title': 'The Princess Bride', 'genres': '[\"Adventure\",\"Family\",\"Fantasy\",\"Comedy\",\"Romance\"]'}\n", - "{'id': 'movie:44911da20d8e4ad3a93a8452a3438feb', 'vector_distance': '-3.44543361664', 'title': 'American History X', 'genres': '[\"Drama\"]'}\n", - "{'id': 'movie:6f04f57431894700b9d23b60ebf8fac0', 'vector_distance': '-3.44274091721', 'title': 'Interstellar', 'genres': '[\"Adventure\",\"Drama\",\"Science Fiction\"]'}\n", - "{'id': 'movie:b2f480a4c97d404d8a5caa248b59a0e3', 'vector_distance': '-3.43494272232', 'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:367a0ad625564683b8818edc82b3878b', 'vector_distance': '-3.42747116089', 'title': 'The Prestige', 'genres': '[\"Drama\",\"Mystery\",\"Thriller\"]'}\n", - "{'id': 'movie:ec5ccd6e24fa470eae864f1ed3f7c566', 'vector_distance': '-3.42468452454', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", - "{'id': 'movie:415b17e8f1a64ec0b7819068ae0ebc2d', 'vector_distance': '-3.4210562706', 'title': 'Happiness', 'genres': '[\"Comedy\",\"Drama\"]'}\n", - "{'id': 'movie:81802cdb06684b7fb3c26e754cb0bc50', 'vector_distance': '-3.41307687759', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:ce087387efd84691bab6a4228289ca47', 'vector_distance': '-3.40901231766', 'title': 'Thank You for Smoking', 'genres': '[\"Comedy\",\"Drama\"]'}\n", - "{'id': 'movie:846793c4d43e4f3a9c7536682e169789', 'vector_distance': '-3.38840723038', 'title': 'A Close Shave', 'genres': '[\"Family\",\"Animation\",\"Comedy\"]'}\n", - "{'id': 'movie:8e370a42277a43859a19cef8223549b2', 'vector_distance': '-3.38426446915', 'title': 'Up', 'genres': '[\"Animation\",\"Comedy\",\"Family\",\"Adventure\"]'}\n", - "{'id': 'movie:01c1de9dd23046f1ba630daf295b91a1', 
'vector_distance': '-3.36946439743', 'title': 'Sin City', 'genres': '[\"Action\",\"Thriller\",\"Crime\"]'}\n", - "{'id': 'movie:a60d16b71e874eb99fad5461dc48b034', 'vector_distance': '-3.363966465', 'title': 'The Departed', 'genres': '[\"Drama\",\"Thriller\",\"Crime\"]'}\n" + "{'id': 'movie:879bc7ef3bac4639a76b8d39eb22ae25', 'vector_distance': '-3.82712745667', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:db33b0943cf942a5b410e71669c2f47f', 'vector_distance': '-3.75959968567', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:994ee37edbfe42efaf023ba750e43b08', 'vector_distance': '-3.74698734283', 'title': 'The Lord of the Rings: The Two Towers', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:1e0c3f95110e48d4bacefcb38817448f', 'vector_distance': '-3.736120224', 'title': 'The Empire Strikes Back', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:43b966cec4454bc4a59949f037ab5d80', 'vector_distance': '-3.55685997009', 'title': 'The Shawshank Redemption', 'genres': '[\"Drama\",\"Crime\"]'}\n", + "{'id': 'movie:1a42cf127f7c422e9542934c560dccf3', 'vector_distance': '-3.4610490799', 'title': 'The Lord of the Rings: The Return of the King', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:720bb022230c4755ac5e039043ea63a5', 'vector_distance': '-3.3770198822', 'title': 'Return of the Jedi', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:0300dfccc16d48e69ff6b2a8f89f3307', 'vector_distance': '-3.32906913757', 'title': 'Shrek', 'genres': '[\"Adventure\",\"Animation\",\"Comedy\",\"Family\",\"Fantasy\"]'}\n", + "{'id': 'movie:1658113a6de2434ab52cdd0050ac87d9', 'vector_distance': '-3.3269367218', 'title': 'The Sixth Sense', 'genres': '[\"Mystery\",\"Thriller\",\"Drama\"]'}\n", + "{'id': 'movie:eb1d3a76e17447558ba10b331acb3f93', 
'vector_distance': '-3.3075504303', 'title': 'Raiders of the Lost Ark', 'genres': '[\"Adventure\",\"Action\"]'}\n", + "{'id': 'movie:fca0a88760ba4dcbb852d13d1c862aa8', 'vector_distance': '-3.29939317703', 'title': 'The Silence of the Lambs', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:a930a67907ef449eaa5dda12af79f5af', 'vector_distance': '-3.25780773163', 'title': 'Lock, Stock and Two Smoking Barrels', 'genres': '[\"Comedy\",\"Crime\"]'}\n" ] } ], @@ -1009,7 +1178,7 @@ "# this is what we want. The predicted rating on a scale of 0 to 5 is then -(score - 1) == -score + 1\n", "query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", - " num_results=20,\n", + " num_results=12,\n", " return_score=True,\n", " return_fields=['title', 'genres']\n", " )\n", @@ -1106,134 +1275,134 @@ " \n", " \n", " 0\n", - " The Professional\n", - " One Flew Over the Cuckoo's Nest\n", - " The Professional\n", - " The Dark Knight\n", - " The Professional\n", - " A Clockwork Orange\n", + " The Fugitive\n", + " The Fugitive\n", + " The Philadelphia Story\n", + " The Shawshank Redemption\n", + " Shine\n", + " What's Eating Gilbert Grape\n", " \n", " \n", " 1\n", - " One Flew Over the Cuckoo's Nest\n", - " The Godfather\n", - " One Flew Over the Cuckoo's Nest\n", - " The Shawshank Redemption\n", - " Seven Samurai\n", - " Pineapple Express\n", + " Terminator 2: Judgment Day\n", + " Terminator 2: Judgment Day\n", + " Butch Cassidy and the Sundance Kid\n", + " Pulp Fiction\n", + " La Haine\n", + " The Grapes of Wrath\n", " \n", " \n", " 2\n", - " The Godfather\n", - " Leon: The Professional\n", - " The Godfather\n", - " Pulp Fiction\n", - " Akira\n", - " James and the Giant Peach\n", + " Clear and Present Danger\n", + " Clear and Present Danger\n", + " Star Wars\n", + " Blade Runner\n", + " Castle in the Sky\n", + " Pineapple Express\n", " \n", " \n", " 3\n", - " Leon: The Professional\n", - " The Godfather: Part II\n", - " The Godfather: Part 
II\n", - " Fight Club\n", - " The Postman\n", - " The Grapes of Wrath\n", + " The Silence of the Lambs\n", + " The Silence of the Lambs\n", + " The Bridge on the River Kwai\n", + " The Dark Knight\n", + " All About Eve\n", + " A Clockwork Orange\n", " \n", " \n", " 4\n", - " The Godfather: Part II\n", - " The Dark Knight\n", - " A Clockwork Orange\n", - " Big Hero 6\n", - " Cube\n", - " Bananas\n", + " Schindler's List\n", + " Schindler's List\n", + " The Treasure of the Sierra Madre\n", + " Whiplash\n", + " M\n", + " James and the Giant Peach\n", " \n", " \n", " 5\n", - " A Clockwork Orange\n", - " Fargo\n", - " The African Queen\n", - " Blade Runner\n", - " Castle in the Sky\n", - " What's Eating Gilbert Grape\n", + " Se7en\n", + " Se7en\n", + " A Christmas Story\n", + " Fight Club\n", + " The Contender\n", + " Bananas\n", " \n", " \n", " 6\n", - " Let the Right One In\n", - " The Graduate\n", - " Cool Hand Luke\n", - " Gone Girl\n", - " M\n", + " The Philadelphia Story\n", + " Speed\n", + " The Little Mermaid\n", + " Big Hero 6\n", + " The Postman\n", " The Apple Dumpling Gang\n", " \n", " \n", " 7\n", - " The African Queen\n", - " Eternal Sunshine of the Spotless Mind\n", - " The Graduate\n", - " Whiplash\n", - " Once Upon a Time in America\n", - " Soldier of Orange\n", + " A Close Shave\n", + " Fargo\n", + " Roger & Me\n", + " The Avengers\n", + " Maverick\n", + " Herbie Goes Bananas\n", " \n", " \n", " 8\n", - " Cool Hand Luke\n", - " Inception\n", - " 12 Angry Men\n", - " The Avengers\n", - " All Quiet on the Western Front\n", + " The Usual Suspects\n", + " Amélie\n", + " Dead Poets Society\n", + " Gone Girl\n", + " The Meaning of Life\n", " Orange County\n", " \n", " \n", " 9\n", - " The Dark Knight\n", - " Boyz n the Hood\n", - " The Treasure of the Sierra Madre\n", + " Speed\n", + " Jurassic Park\n", + " Stand by Me\n", " Guardians of the Galaxy\n", - " Aguirre: The Wrath of God\n", - " Herbie Goes Bananas\n", + " Frost/Nixon\n", + " The Apple Dumpling 
Gang Rides Again\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Professional One Flew Over the Cuckoo's Nest \n", - "1 One Flew Over the Cuckoo's Nest The Godfather \n", - "2 The Godfather Leon: The Professional \n", - "3 Leon: The Professional The Godfather: Part II \n", - "4 The Godfather: Part II The Dark Knight \n", - "5 A Clockwork Orange Fargo \n", - "6 Let the Right One In The Graduate \n", - "7 The African Queen Eternal Sunshine of the Spotless Mind \n", - "8 Cool Hand Luke Inception \n", - "9 The Dark Knight Boyz n the Hood \n", + " top picks block busters \\\n", + "0 The Fugitive The Fugitive \n", + "1 Terminator 2: Judgment Day Terminator 2: Judgment Day \n", + "2 Clear and Present Danger Clear and Present Danger \n", + "3 The Silence of the Lambs The Silence of the Lambs \n", + "4 Schindler's List Schindler's List \n", + "5 Se7en Se7en \n", + "6 The Philadelphia Story Speed \n", + "7 A Close Shave Fargo \n", + "8 The Usual Suspects Amélie \n", + "9 Speed Jurassic Park \n", "\n", - " classics what's popular \\\n", - "0 The Professional The Dark Knight \n", - "1 One Flew Over the Cuckoo's Nest The Shawshank Redemption \n", - "2 The Godfather Pulp Fiction \n", - "3 The Godfather: Part II Fight Club \n", - "4 A Clockwork Orange Big Hero 6 \n", - "5 The African Queen Blade Runner \n", - "6 Cool Hand Luke Gone Girl \n", - "7 The Graduate Whiplash \n", - "8 12 Angry Men The Avengers \n", - "9 The Treasure of the Sierra Madre Guardians of the Galaxy \n", + " classics what's popular \\\n", + "0 The Philadelphia Story The Shawshank Redemption \n", + "1 Butch Cassidy and the Sundance Kid Pulp Fiction \n", + "2 Star Wars Blade Runner \n", + "3 The Bridge on the River Kwai The Dark Knight \n", + "4 The Treasure of the Sierra Madre Whiplash \n", + "5 A Christmas Story Fight Club \n", + "6 The Little Mermaid Big Hero 6 \n", + "7 Roger & Me The Avengers \n", + "8 Dead Poets Society Gone Girl \n", + "9 Stand by Me Guardians of 
the Galaxy \n", "\n", - " indie hits fruity films \n", - "0 The Professional A Clockwork Orange \n", - "1 Seven Samurai Pineapple Express \n", - "2 Akira James and the Giant Peach \n", - "3 The Postman The Grapes of Wrath \n", - "4 Cube Bananas \n", - "5 Castle in the Sky What's Eating Gilbert Grape \n", - "6 M The Apple Dumpling Gang \n", - "7 Once Upon a Time in America Soldier of Orange \n", - "8 All Quiet on the Western Front Orange County \n", - "9 Aguirre: The Wrath of God Herbie Goes Bananas " + " indie hits fruity films \n", + "0 Shine What's Eating Gilbert Grape \n", + "1 La Haine The Grapes of Wrath \n", + "2 Castle in the Sky Pineapple Express \n", + "3 All About Eve A Clockwork Orange \n", + "4 M James and the Giant Peach \n", + "5 The Contender Bananas \n", + "6 The Postman The Apple Dumpling Gang \n", + "7 Maverick Herbie Goes Bananas \n", + "8 The Meaning of Life Orange County \n", + "9 Frost/Nixon The Apple Dumpling Gang Rides Again " ] }, "execution_count": 18, @@ -1242,7 +1411,7 @@ } ], "source": [ - "# put all these titles into a single pandas dataframe , where each column is one category\n", + "# put all these titles into a single pandas dataframe, where each column is one category\n", "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\", \"fruity films\"])\n", "all_recommendations[\"top picks\"] = [m[0] for m in Top_picks_for_you]\n", "all_recommendations[\"block busters\"] = [m[0] for m in block_buster_hits]\n", @@ -1268,15 +1437,7 @@ "cell_type": "code", "execution_count": 19, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BFCommands.exists() missing 1 required positional argument: 'item'\n" - ] - } - ], + "outputs": [], "source": [ "# rewrite the get_recommendations() function to use a bloom filter and apply it before we return results\n", "def get_unique_recommendations(user_id, filters=None, num_results=10):\n", @@ 
-1287,37 +1448,38 @@ "\n", " query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", - " num_results=num_results * 2, # fetch more results to filter out watched movies\n", + " num_results=num_results * 5, # fetch more results to filter out watched movies\n", " filter_expression=filters,\n", - " return_fields=['title', '$.movie_id', '$.movieId', 'imdb_id', 'imdbId','overview', 'genres'],\n", + " return_fields=['title', 'overview', 'genres', 'movieId'],\n", " ) # TODO figure out why i need to add '$.' to some fields, but not others\n", " results = movie_index.query(query)\n", "\n", " # filter out movies that the user has already watched\n", " recommendations = []\n", " for r in results:\n", - " if not client.bf().exists('user_watched_list', r['$.movieId']):\n", + " if not client.bf().exists('user_watched_list', f\"{user_id}:{r['movieId']}\"):\n", " recommendations.append((r['title'], r['overview'], r['genres'], r['vector_distance']))\n", " if len(recommendations) >= num_results:\n", " break\n", + "\n", + " # add these films to the bloom filter\n", + " client.bf().insert('user_watched_list', [f\"{user_id}:{r['movieId']}\" for r in results])\n", " return recommendations\n", "\n", "# example usage\n", "# create a bloom filter for this user\n", "try:\n", - " client.bf().exists(f\"user_watched_list\")\n", + " client.bf().create(f\"user_watched_list\", 0.01, 10000)\n", "except Exception as e:\n", " print(e)\n", - " pass\n", "\n", "user_id = 42\n", "\n", - "top_picks_for_you = get_unique_recommendations(user_id=user_id) # general SVD results, no filter\n", - "block_buster_hits = get_unique_recommendations(user_id=user_id, filters=block_buster_filter)\n", - "classics = get_unique_recommendations(user_id=user_id, filters=classics_filter)\n", - "whats_popular = get_unique_recommendations(user_id=user_id, filters=popular_filter)\n", - "indie_hits = get_unique_recommendations(user_id=user_id, filters=indie_filter)\n", - "fruity_films = 
get_unique_recommendations(user_id=user_id, filters=fruity)" + "top_picks_for_you = get_unique_recommendations(user_id=user_id, num_results=5) # general SVD results, no filter\n", + "block_buster_hits = get_unique_recommendations(user_id=user_id, filters=block_buster_filter, num_results=5)\n", + "classics = get_unique_recommendations(user_id=user_id, filters=classics_filter, num_results=5)\n", + "whats_popular = get_unique_recommendations(user_id=user_id, filters=popular_filter, num_results=5)\n", + "indie_hits = get_unique_recommendations(user_id=user_id, filters=indie_filter, num_results=5)" ] }, { @@ -1355,140 +1517,67 @@ " classics\n", " what's popular\n", " indie hits\n", - " fruity films\n", " \n", " \n", " \n", " \n", " 0\n", - " The Professional\n", - " One Flew Over the Cuckoo's Nest\n", - " The Professional\n", - " The Dark Knight\n", - " The Professional\n", - " A Clockwork Orange\n", + " The Silence of the Lambs\n", + " Star Trek: Generations\n", + " The Bridge on the River Kwai\n", + " Blade Runner\n", + " Shine\n", " \n", " \n", " 1\n", - " One Flew Over the Cuckoo's Nest\n", - " The Godfather\n", - " One Flew Over the Cuckoo's Nest\n", - " The Shawshank Redemption\n", - " Seven Samurai\n", - " Pineapple Express\n", + " Se7en\n", + " Jumanji\n", + " The Treasure of the Sierra Madre\n", + " Whiplash\n", + " La Haine\n", " \n", " \n", " 2\n", - " The Godfather\n", - " Leon: The Professional\n", - " The Godfather\n", - " Pulp Fiction\n", - " Akira\n", - " James and the Giant Peach\n", - " \n", - " \n", - " 3\n", - " Leon: The Professional\n", - " The Godfather: Part II\n", - " The Godfather: Part II\n", + " The Philadelphia Story\n", + " Outbreak\n", + " A Christmas Story\n", " Fight Club\n", - " The Postman\n", - " The Grapes of Wrath\n", + " Castle in the Sky\n", " \n", " \n", - " 4\n", - " The Godfather: Part II\n", - " The Dark Knight\n", - " A Clockwork Orange\n", + " 3\n", + " A Close Shave\n", + " The Lion King\n", + " The Little Mermaid\n", " Big 
Hero 6\n", - " Cube\n", - " Bananas\n", - " \n", - " \n", - " 5\n", - " A Clockwork Orange\n", - " Fargo\n", - " The African Queen\n", - " Blade Runner\n", - " Castle in the Sky\n", - " What's Eating Gilbert Grape\n", + " All About Eve\n", " \n", " \n", - " 6\n", - " Let the Right One In\n", - " The Graduate\n", - " Cool Hand Luke\n", + " 4\n", + " The Usual Suspects\n", + " Men in Black\n", + " Roger & Me\n", " Gone Girl\n", " M\n", - " The Apple Dumpling Gang\n", - " \n", - " \n", - " 7\n", - " The African Queen\n", - " Eternal Sunshine of the Spotless Mind\n", - " The Graduate\n", - " Whiplash\n", - " Once Upon a Time in America\n", - " Soldier of Orange\n", - " \n", - " \n", - " 8\n", - " Cool Hand Luke\n", - " Inception\n", - " 12 Angry Men\n", - " The Avengers\n", - " All Quiet on the Western Front\n", - " Orange County\n", - " \n", - " \n", - " 9\n", - " The Dark Knight\n", - " Boyz n the Hood\n", - " The Treasure of the Sierra Madre\n", - " Guardians of the Galaxy\n", - " Aguirre: The Wrath of God\n", - " Herbie Goes Bananas\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Professional One Flew Over the Cuckoo's Nest \n", - "1 One Flew Over the Cuckoo's Nest The Godfather \n", - "2 The Godfather Leon: The Professional \n", - "3 Leon: The Professional The Godfather: Part II \n", - "4 The Godfather: Part II The Dark Knight \n", - "5 A Clockwork Orange Fargo \n", - "6 Let the Right One In The Graduate \n", - "7 The African Queen Eternal Sunshine of the Spotless Mind \n", - "8 Cool Hand Luke Inception \n", - "9 The Dark Knight Boyz n the Hood \n", - "\n", - " classics what's popular \\\n", - "0 The Professional The Dark Knight \n", - "1 One Flew Over the Cuckoo's Nest The Shawshank Redemption \n", - "2 The Godfather Pulp Fiction \n", - "3 The Godfather: Part II Fight Club \n", - "4 A Clockwork Orange Big Hero 6 \n", - "5 The African Queen Blade Runner \n", - "6 Cool Hand Luke Gone Girl \n", - "7 The Graduate Whiplash 
\n", - "8 12 Angry Men The Avengers \n", - "9 The Treasure of the Sierra Madre Guardians of the Galaxy \n", + " top picks block busters \\\n", + "0 The Silence of the Lambs Star Trek: Generations \n", + "1 Se7en Jumanji \n", + "2 The Philadelphia Story Outbreak \n", + "3 A Close Shave The Lion King \n", + "4 The Usual Suspects Men in Black \n", "\n", - " indie hits fruity films \n", - "0 The Professional A Clockwork Orange \n", - "1 Seven Samurai Pineapple Express \n", - "2 Akira James and the Giant Peach \n", - "3 The Postman The Grapes of Wrath \n", - "4 Cube Bananas \n", - "5 Castle in the Sky What's Eating Gilbert Grape \n", - "6 M The Apple Dumpling Gang \n", - "7 Once Upon a Time in America Soldier of Orange \n", - "8 All Quiet on the Western Front Orange County \n", - "9 Aguirre: The Wrath of God Herbie Goes Bananas " + " classics what's popular indie hits \n", + "0 The Bridge on the River Kwai Blade Runner Shine \n", + "1 The Treasure of the Sierra Madre Whiplash La Haine \n", + "2 A Christmas Story Fight Club Castle in the Sky \n", + "3 The Little Mermaid Big Hero 6 All About Eve \n", + "4 Roger & Me Gone Girl M " ] }, "execution_count": 20, @@ -1498,13 +1587,12 @@ ], "source": [ "# put all these titles into a single pandas dataframe , where each column is one category\n", - "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\", \"fruity films\"])\n", + "all_recommendations = pd.DataFrame(columns=[\"top picks\", \"block busters\", \"classics\", \"what's popular\", \"indie hits\"])\n", "all_recommendations[\"top picks\"] = [m[0] for m in top_picks_for_you]\n", "all_recommendations[\"block busters\"] = [m[0] for m in block_buster_hits]\n", "all_recommendations[\"classics\"] = [m[0] for m in classics]\n", "all_recommendations[\"what's popular\"] = [m[0] for m in whats_popular]\n", "all_recommendations[\"indie hits\"] = [m[0] for m in indie_hits]\n", - "all_recommendations[\"fruity 
films\"] = [m[0] for m in fruity_films]\n", "\n", "all_recommendations.head(10)" ] @@ -1526,7 +1614,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4348 keys\n", + "Deleted 4334 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n", @@ -1549,18 +1637,8 @@ "while remaining := movie_index.clear():\n", " print(f\"Deleted {remaining} keys\")\n", "\n", - "while remaining := user_index.clear():\n", - " print(f\"Deleeted {remaining} keys\")\n", - "\n", "client.delete(\"user_watched_list\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml index f10d686b..0a6f61b4 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml +++ b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml @@ -4,9 +4,9 @@ index: storage_type: json fields: - - name: genres + - name: movieId type: tag - - name: movie_id + - name: genres type: tag - name: original_language type: tag diff --git a/python-recipes/recommendation-systems/user_schema.yaml b/python-recipes/recommendation-systems/user_schema.yaml deleted file mode 100644 index 6d5c9ebd..00000000 --- a/python-recipes/recommendation-systems/user_schema.yaml +++ /dev/null @@ -1,20 +0,0 @@ -index: - name: users - prefix: user - storage_type: json - -fields: - - name: user_id - type: tag - - name: ratings - type: numeric - name: watched_list - type: text - - - name: user_vector - type: vector - attrs: - dims: 100 - distance_metric: ip - algorithm: flat - dtype: float32 \ No newline at end of file From fcaffb45885f667b49e38357f90d1c621ebf5cfd Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Mon, 14 Oct 2024 15:34:40 -0700 Subject: [PATCH 07/12] replaces for loop bloom filter check with mexists() --- 
.../collaborative_filtering.ipynb | 396 +++++++++--------- 1 file changed, 202 insertions(+), 194 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index afa93fe4..09344699 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -180,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -230,7 +230,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8376 movies with feature vectors of size 100\n" + "we have 8377 movies with feature vectors of size 100\n" ] } ], @@ -263,7 +263,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.2554222750662518\n" + "the predicted rating of user 347 on movie 5515 is 1.3640325071309123\n" ] } ], @@ -289,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -529,7 +529,7 @@ "[5 rows x 23 columns]" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -541,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -565,7 +565,7 @@ "dtype: int64" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -606,7 +606,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -624,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -690,7 +690,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [-0.09139158006123944, 0.3113782797006747, -0....\n", + " [0.03713469204683083, 0.10796564373254629, 0.2...\n", " \n", " \n", " 1\n", @@ -712,7 
+712,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.5145776514053282, 0.18805717045856102, 0.0...\n", + " [-0.010117012753361906, -0.03687474969254127, ...\n", " \n", " \n", " 2\n", @@ -734,7 +734,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [-0.09342489820078766, 0.1563727417086737, -0....\n", + " [0.13139654322372601, 0.14560140137289648, 0.1...\n", " \n", " \n", " 3\n", @@ -756,7 +756,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [-0.033617228695296826, 0.20003386580703916, 0...\n", + " [0.1564855291020289, -0.01096475924961168, 0.2...\n", " \n", " \n", " 4\n", @@ -778,7 +778,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [0.03270775039139693, 0.16435040013526048, 0.0...\n", + " [0.07205704581865023, 0.25224445082871455, 0.0...\n", " \n", " \n", "\n", @@ -821,14 +821,14 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [-0.09139158006123944, 0.3113782797006747, -0.... \n", - "1 8844.0 [-0.5145776514053282, 0.18805717045856102, 0.0... \n", - "2 15602.0 [-0.09342489820078766, 0.1563727417086737, -0.... \n", - "3 31357.0 [-0.033617228695296826, 0.20003386580703916, 0... \n", - "4 11862.0 [0.03270775039139693, 0.16435040013526048, 0.0... " + "0 862.0 [0.03713469204683083, 0.10796564373254629, 0.2... \n", + "1 8844.0 [-0.010117012753361906, -0.03687474969254127, ... \n", + "2 15602.0 [0.13139654322372601, 0.14560140137289648, 0.1... \n", + "3 31357.0 [0.1564855291020289, -0.01096475924961168, 0.2... \n", + "4 11862.0 [0.07205704581865023, 0.25224445082871455, 0.0... 
" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -860,14 +860,14 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "15:08:18 redisvl.index.index INFO Index already exists, overwriting.\n" + "15:33:21 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -886,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -895,17 +895,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8334\n", - "size of movie df 8334\n", - "unique movie ids 8327\n", - "unique movie titles 8082\n", + "number of movies 8337\n", + "size of movie df 8337\n", + "unique movie ids 8331\n", + "unique movie titles 8100\n", "unique movies rated 9065\n" ] }, @@ -972,7 +972,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [-0.09139158006123944, 0.3113782797006747, -0....\n", + " [0.03713469204683083, 0.10796564373254629, 0.2...\n", " \n", " \n", " 1\n", @@ -994,7 +994,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.5145776514053282, 0.18805717045856102, 0.0...\n", + " [-0.010117012753361906, -0.03687474969254127, ...\n", " \n", " \n", " 2\n", @@ -1016,7 +1016,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [-0.09342489820078766, 0.1563727417086737, -0....\n", + " [0.13139654322372601, 0.14560140137289648, 0.1...\n", " \n", " \n", " 3\n", @@ -1038,7 +1038,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [-0.033617228695296826, 0.20003386580703916, 0...\n", + " [0.1564855291020289, -0.01096475924961168, 0.2...\n", " \n", " \n", " 4\n", @@ -1060,7 +1060,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [0.03270775039139693, 0.16435040013526048, 0.0...\n", + " [0.07205704581865023, 0.25224445082871455, 0.0...\n", " \n", " \n", 
"\n", @@ -1103,14 +1103,14 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [-0.09139158006123944, 0.3113782797006747, -0.... \n", - "1 8844.0 [-0.5145776514053282, 0.18805717045856102, 0.0... \n", - "2 15602.0 [-0.09342489820078766, 0.1563727417086737, -0.... \n", - "3 31357.0 [-0.033617228695296826, 0.20003386580703916, 0... \n", - "4 11862.0 [0.03270775039139693, 0.16435040013526048, 0.0... " + "0 862.0 [0.03713469204683083, 0.10796564373254629, 0.2... \n", + "1 8844.0 [-0.010117012753361906, -0.03687474969254127, ... \n", + "2 15602.0 [0.13139654322372601, 0.14560140137289648, 0.1... \n", + "3 31357.0 [0.1564855291020289, -0.01096475924961168, 0.2... \n", + "4 11862.0 [0.07205704581865023, 0.25224445082871455, 0.0... " ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1147,25 +1147,25 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:879bc7ef3bac4639a76b8d39eb22ae25', 'vector_distance': '-3.82712745667', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:db33b0943cf942a5b410e71669c2f47f', 'vector_distance': '-3.75959968567', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:994ee37edbfe42efaf023ba750e43b08', 'vector_distance': '-3.74698734283', 'title': 'The Lord of the Rings: The Two Towers', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:1e0c3f95110e48d4bacefcb38817448f', 'vector_distance': '-3.736120224', 'title': 'The Empire Strikes Back', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:43b966cec4454bc4a59949f037ab5d80', 'vector_distance': '-3.55685997009', 'title': 'The Shawshank Redemption', 'genres': 
'[\"Drama\",\"Crime\"]'}\n", - "{'id': 'movie:1a42cf127f7c422e9542934c560dccf3', 'vector_distance': '-3.4610490799', 'title': 'The Lord of the Rings: The Return of the King', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:720bb022230c4755ac5e039043ea63a5', 'vector_distance': '-3.3770198822', 'title': 'Return of the Jedi', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:0300dfccc16d48e69ff6b2a8f89f3307', 'vector_distance': '-3.32906913757', 'title': 'Shrek', 'genres': '[\"Adventure\",\"Animation\",\"Comedy\",\"Family\",\"Fantasy\"]'}\n", - "{'id': 'movie:1658113a6de2434ab52cdd0050ac87d9', 'vector_distance': '-3.3269367218', 'title': 'The Sixth Sense', 'genres': '[\"Mystery\",\"Thriller\",\"Drama\"]'}\n", - "{'id': 'movie:eb1d3a76e17447558ba10b331acb3f93', 'vector_distance': '-3.3075504303', 'title': 'Raiders of the Lost Ark', 'genres': '[\"Adventure\",\"Action\"]'}\n", - "{'id': 'movie:fca0a88760ba4dcbb852d13d1c862aa8', 'vector_distance': '-3.29939317703', 'title': 'The Silence of the Lambs', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:a930a67907ef449eaa5dda12af79f5af', 'vector_distance': '-3.25780773163', 'title': 'Lock, Stock and Two Smoking Barrels', 'genres': '[\"Comedy\",\"Crime\"]'}\n" + "{'id': 'movie:123a01ce087f4d09a833970c182f0eb2', 'vector_distance': '-2.13837456703', 'title': 'A Close Shave', 'genres': '[\"Family\",\"Animation\",\"Comedy\"]'}\n", + "{'id': 'movie:f6fb0a03ca0c41a4b1d63249ede39d2f', 'vector_distance': '-2.11249995232', 'title': \"Schindler's List\", 'genres': '[\"Drama\",\"History\",\"War\"]'}\n", + "{'id': 'movie:4d302b9754534983bf70b2304d04633e', 'vector_distance': '-2.09581518173', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", + "{'id': 'movie:3eb10be0511641e48c41bb2de628bf6f', 'vector_distance': '-2.08978199959', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': 
'[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:6206dd42b51048edb819adc5fbe07ba7', 'vector_distance': '-2.07609891891', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:905af238977f40f793080388d0aa1380', 'vector_distance': '-2.05023360252', 'title': 'The Wrong Trousers', 'genres': '[\"Animation\",\"Comedy\",\"Family\"]'}\n", + "{'id': 'movie:ad7f5971e4b64a44a318e3e48105a114', 'vector_distance': '-2.03544998169', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:ac19e02ffd4c4833abdd3ecc4702abe9', 'vector_distance': '-1.98030018806', 'title': 'Monty Python and the Holy Grail', 'genres': '[\"Adventure\",\"Comedy\",\"Fantasy\"]'}\n", + "{'id': 'movie:7d6b88e1d652486f96756fc2b5a7f087', 'vector_distance': '-1.98028421402', 'title': 'Mad Max 2: The Road Warrior', 'genres': '[\"Adventure\",\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", + "{'id': 'movie:595fd594f6c8406e91d7baa6bc63efdb', 'vector_distance': '-1.96802783012', 'title': 'Fargo', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", + "{'id': 'movie:04e884d71097481c8c07d8babb723f02', 'vector_distance': '-1.93948292732', 'title': 'Roger & Me', 'genres': '[\"Documentary\",\"History\"]'}\n", + "{'id': 'movie:015dcb2c5d30445787c5392ac551abbb', 'vector_distance': '-1.92847204208', 'title': 'The Imitation Game', 'genres': '[\"History\",\"Drama\",\"Thriller\",\"War\"]'}\n" ] } ], @@ -1201,7 +1201,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -1240,7 +1240,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1275,137 +1275,137 @@ " \n", " \n", " 0\n", - " The Fugitive\n", - " The Fugitive\n", - " The Philadelphia Story\n", " The Shawshank Redemption\n", - " Shine\n", - " What's Eating Gilbert Grape\n", + " Good Will Hunting\n", + " Yojimbo\n", + " The Shawshank 
Redemption\n", + " Yojimbo\n", + " A Clockwork Orange\n", " \n", " \n", " 1\n", - " Terminator 2: Judgment Day\n", - " Terminator 2: Judgment Day\n", - " Butch Cassidy and the Sundance Kid\n", - " Pulp Fiction\n", - " La Haine\n", - " The Grapes of Wrath\n", + " Yojimbo\n", + " Annie Hall\n", + " Monty Python and the Holy Grail\n", + " The Dark Knight\n", + " My Neighbor Totoro\n", + " Pineapple Express\n", " \n", " \n", " 2\n", - " Clear and Present Danger\n", - " Clear and Present Danger\n", - " Star Wars\n", - " Blade Runner\n", - " Castle in the Sky\n", - " Pineapple Express\n", + " Monty Python and the Holy Grail\n", + " Indiana Jones and the Last Crusade\n", + " Raising Arizona\n", + " Pulp Fiction\n", + " The Meaning of Life\n", + " What's Eating Gilbert Grape\n", " \n", " \n", " 3\n", - " The Silence of the Lambs\n", - " The Silence of the Lambs\n", - " The Bridge on the River Kwai\n", - " The Dark Knight\n", - " All About Eve\n", - " A Clockwork Orange\n", + " Big Night\n", + " The Graduate\n", + " To Kill a Mockingbird\n", + " Blade Runner\n", + " Rebel Without a Cause\n", + " James and the Giant Peach\n", " \n", " \n", " 4\n", - " Schindler's List\n", - " Schindler's List\n", - " The Treasure of the Sierra Madre\n", - " Whiplash\n", - " M\n", - " James and the Giant Peach\n", + " Raising Arizona\n", + " Rear Window\n", + " Annie Hall\n", + " Fight Club\n", + " The Professional\n", + " The Grapes of Wrath\n", " \n", " \n", " 5\n", - " Se7en\n", - " Se7en\n", - " A Christmas Story\n", - " Fight Club\n", - " The Contender\n", + " Ed Wood\n", + " Star Trek\n", + " Indiana Jones and the Last Crusade\n", + " The Avengers\n", + " Sanjuro\n", " Bananas\n", " \n", " \n", " 6\n", - " The Philadelphia Story\n", - " Speed\n", - " The Little Mermaid\n", - " Big Hero 6\n", - " The Postman\n", + " Good Will Hunting\n", + " American Beauty\n", + " The Graduate\n", + " Whiplash\n", + " All About Eve\n", " The Apple Dumpling Gang\n", " \n", " \n", " 7\n", - " A Close 
Shave\n", - " Fargo\n", - " Roger & Me\n", - " The Avengers\n", - " Maverick\n", - " Herbie Goes Bananas\n", + " To Kill a Mockingbird\n", + " Schindler's List\n", + " Rear Window\n", + " Big Hero 6\n", + " All Quiet on the Western Front\n", + " Orange County\n", " \n", " \n", " 8\n", - " The Usual Suspects\n", - " Amélie\n", - " Dead Poets Society\n", + " Annie Hall\n", + " Twelve Monkeys\n", + " The Bridge on the River Kwai\n", " Gone Girl\n", - " The Meaning of Life\n", - " Orange County\n", + " Cowboy Bebop: The Movie\n", + " Herbie Goes Bananas\n", " \n", " \n", " 9\n", - " Speed\n", - " Jurassic Park\n", - " Stand by Me\n", + " Indiana Jones and the Last Crusade\n", + " The Princess Bride\n", + " Roger & Me\n", " Guardians of the Galaxy\n", - " Frost/Nixon\n", - " The Apple Dumpling Gang Rides Again\n", + " City Lights\n", + " Adam's Apples\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Fugitive The Fugitive \n", - "1 Terminator 2: Judgment Day Terminator 2: Judgment Day \n", - "2 Clear and Present Danger Clear and Present Danger \n", - "3 The Silence of the Lambs The Silence of the Lambs \n", - "4 Schindler's List Schindler's List \n", - "5 Se7en Se7en \n", - "6 The Philadelphia Story Speed \n", - "7 A Close Shave Fargo \n", - "8 The Usual Suspects Amélie \n", - "9 Speed Jurassic Park \n", + " top picks block busters \\\n", + "0 The Shawshank Redemption Good Will Hunting \n", + "1 Yojimbo Annie Hall \n", + "2 Monty Python and the Holy Grail Indiana Jones and the Last Crusade \n", + "3 Big Night The Graduate \n", + "4 Raising Arizona Rear Window \n", + "5 Ed Wood Star Trek \n", + "6 Good Will Hunting American Beauty \n", + "7 To Kill a Mockingbird Schindler's List \n", + "8 Annie Hall Twelve Monkeys \n", + "9 Indiana Jones and the Last Crusade The Princess Bride \n", "\n", " classics what's popular \\\n", - "0 The Philadelphia Story The Shawshank Redemption \n", - "1 Butch Cassidy and the Sundance Kid Pulp Fiction 
\n", - "2 Star Wars Blade Runner \n", - "3 The Bridge on the River Kwai The Dark Knight \n", - "4 The Treasure of the Sierra Madre Whiplash \n", - "5 A Christmas Story Fight Club \n", - "6 The Little Mermaid Big Hero 6 \n", - "7 Roger & Me The Avengers \n", - "8 Dead Poets Society Gone Girl \n", - "9 Stand by Me Guardians of the Galaxy \n", + "0 Yojimbo The Shawshank Redemption \n", + "1 Monty Python and the Holy Grail The Dark Knight \n", + "2 Raising Arizona Pulp Fiction \n", + "3 To Kill a Mockingbird Blade Runner \n", + "4 Annie Hall Fight Club \n", + "5 Indiana Jones and the Last Crusade The Avengers \n", + "6 The Graduate Whiplash \n", + "7 Rear Window Big Hero 6 \n", + "8 The Bridge on the River Kwai Gone Girl \n", + "9 Roger & Me Guardians of the Galaxy \n", "\n", - " indie hits fruity films \n", - "0 Shine What's Eating Gilbert Grape \n", - "1 La Haine The Grapes of Wrath \n", - "2 Castle in the Sky Pineapple Express \n", - "3 All About Eve A Clockwork Orange \n", - "4 M James and the Giant Peach \n", - "5 The Contender Bananas \n", - "6 The Postman The Apple Dumpling Gang \n", - "7 Maverick Herbie Goes Bananas \n", - "8 The Meaning of Life Orange County \n", - "9 Frost/Nixon The Apple Dumpling Gang Rides Again " + " indie hits fruity films \n", + "0 Yojimbo A Clockwork Orange \n", + "1 My Neighbor Totoro Pineapple Express \n", + "2 The Meaning of Life What's Eating Gilbert Grape \n", + "3 Rebel Without a Cause James and the Giant Peach \n", + "4 The Professional The Grapes of Wrath \n", + "5 Sanjuro Bananas \n", + "6 All About Eve The Apple Dumpling Gang \n", + "7 All Quiet on the Western Front Orange County \n", + "8 Cowboy Bebop: The Movie Herbie Goes Bananas \n", + "9 City Lights Adam's Apples " ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1435,7 +1435,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -1444,6 
+1444,7 @@ " user_vector = user_vectors[user_id].tolist()\n", " watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", "\n", + " # filter out movies that the user has already watched\n", " client.bf().insert('user_watched_list', [f\"{user_id}:{movie_id}\" for movie_id in watched_movies])\n", "\n", " query = RangeQuery(vector=user_vector,\n", @@ -1451,19 +1452,18 @@ " num_results=num_results * 5, # fetch more results to filter out watched movies\n", " filter_expression=filters,\n", " return_fields=['title', 'overview', 'genres', 'movieId'],\n", - " ) # TODO figure out why i need to add '$.' to some fields, but not others\n", + " )\n", " results = movie_index.query(query)\n", "\n", - " # filter out movies that the user has already watched\n", - " recommendations = []\n", - " for r in results:\n", - " if not client.bf().exists('user_watched_list', f\"{user_id}:{r['movieId']}\"):\n", - " recommendations.append((r['title'], r['overview'], r['genres'], r['vector_distance']))\n", - " if len(recommendations) >= num_results:\n", - " break\n", - "\n", - " # add these films to the bloom filter\n", - " client.bf().insert('user_watched_list', [f\"{user_id}:{r['movieId']}\" for r in results])\n", + " matches = client.bf().mexists(\"user_watched_list\", *[f\"{user_id}:{r['movieId']}\" for r in results])\n", + "\n", + " recommendations = [\n", + " (r['title'], r['overview'], r['genres'], r['vector_distance'], r['movieId'])\n", + " for i, r in enumerate(results) if matches[i] == 0\n", + " ][:num_results]\n", + "\n", + " # add these recommendations to the bloom filter so they don't appear again\n", + " client.bf().insert('user_watched_list', [f\"{user_id}:{r[4]}\" for r in recommendations])\n", " return recommendations\n", "\n", "# example usage\n", @@ -1471,7 +1471,8 @@ "try:\n", " client.bf().create(f\"user_watched_list\", 0.01, 10000)\n", "except Exception as e:\n", - " print(e)\n", + " client.delete(\"user_watched_list\")\n", + " 
client.bf().create(f\"user_watched_list\", 0.01, 10000)\n", "\n", "user_id = 42\n", "\n", @@ -1484,7 +1485,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": { "vscode": { "languageId": "ruby" @@ -1522,65 +1523,72 @@ " \n", " \n", " 0\n", - " The Silence of the Lambs\n", - " Star Trek: Generations\n", - " The Bridge on the River Kwai\n", + " Yojimbo\n", + " Annie Hall\n", + " To Kill a Mockingbird\n", " Blade Runner\n", - " Shine\n", + " My Neighbor Totoro\n", " \n", " \n", " 1\n", - " Se7en\n", - " Jumanji\n", - " The Treasure of the Sierra Madre\n", - " Whiplash\n", - " La Haine\n", + " Monty Python and the Holy Grail\n", + " The Graduate\n", + " The Bridge on the River Kwai\n", + " Fight Club\n", + " The Meaning of Life\n", " \n", " \n", " 2\n", - " The Philadelphia Story\n", - " Outbreak\n", - " A Christmas Story\n", - " Fight Club\n", - " Castle in the Sky\n", + " Big Night\n", + " Rear Window\n", + " Roger & Me\n", + " Whiplash\n", + " Rebel Without a Cause\n", " \n", " \n", " 3\n", - " A Close Shave\n", - " The Lion King\n", - " The Little Mermaid\n", + " Raising Arizona\n", + " American Beauty\n", + " Cinema Paradiso\n", " Big Hero 6\n", - " All About Eve\n", + " The Professional\n", " \n", " \n", " 4\n", - " The Usual Suspects\n", - " Men in Black\n", - " Roger & Me\n", + " Ed Wood\n", + " Twelve Monkeys\n", + " Dr. 
Strangelove or: How I Learned to Stop Worr...\n", " Gone Girl\n", - " M\n", + " Sanjuro\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Silence of the Lambs Star Trek: Generations \n", - "1 Se7en Jumanji \n", - "2 The Philadelphia Story Outbreak \n", - "3 A Close Shave The Lion King \n", - "4 The Usual Suspects Men in Black \n", + " top picks block busters \\\n", + "0 Yojimbo Annie Hall \n", + "1 Monty Python and the Holy Grail The Graduate \n", + "2 Big Night Rear Window \n", + "3 Raising Arizona American Beauty \n", + "4 Ed Wood Twelve Monkeys \n", + "\n", + " classics what's popular \\\n", + "0 To Kill a Mockingbird Blade Runner \n", + "1 The Bridge on the River Kwai Fight Club \n", + "2 Roger & Me Whiplash \n", + "3 Cinema Paradiso Big Hero 6 \n", + "4 Dr. Strangelove or: How I Learned to Stop Worr... Gone Girl \n", "\n", - " classics what's popular indie hits \n", - "0 The Bridge on the River Kwai Blade Runner Shine \n", - "1 The Treasure of the Sierra Madre Whiplash La Haine \n", - "2 A Christmas Story Fight Club Castle in the Sky \n", - "3 The Little Mermaid Big Hero 6 All About Eve \n", - "4 Roger & Me Gone Girl M " + " indie hits \n", + "0 My Neighbor Totoro \n", + "1 The Meaning of Life \n", + "2 Rebel Without a Cause \n", + "3 The Professional \n", + "4 Sanjuro " ] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1607,14 +1615,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4334 keys\n", + "Deleted 4337 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n", @@ -1627,7 +1635,7 @@ "1" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } From 2edd5b248a5946f653d2622ef1038a4a0ae985dc Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Mon, 14 Oct 2024 
16:59:28 -0700 Subject: [PATCH 08/12] adds surprise to requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 216712e5..08d8a236 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,4 +20,4 @@ redisvl>=0.3.0 pytest ragas datasets - +scikit-surprise From e497231493a283f513a7f7c1b75a30b288fcfd52 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Tue, 15 Oct 2024 11:35:08 -0700 Subject: [PATCH 09/12] fixes typo in schema --- .../recommendation-systems/collaborative_filtering_schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml index 0a6f61b4..af58d793 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml +++ b/python-recipes/recommendation-systems/collaborative_filtering_schema.yaml @@ -37,4 +37,4 @@ fields: dims: 100 distance_metric: ip algorithm: flat - dtype: float32 \ No newline at end of file + datatype: float32 \ No newline at end of file From 267947093f651d8c096788e937da358d6d6d4dfa Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 16 Oct 2024 11:12:23 -0700 Subject: [PATCH 10/12] adds reference to README, fixes typos --- README.md | 1 + .../collaborative_filtering.ipynb | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index deac85df..0b576ce7 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ For further insights on enhancing RAG applications with dense content representa | Recipe | Description | | --- | --- | | [/recommendation-systems/content_filtering.ipynb](python-recipes/recommendation-systems/content_filtering.ipynb) | Intro content filtering example with redisvl | +| 
[/recommendation-systems/collaborative_filtering.ipynb](python-recipes/recommendation-systems/collaborative_filtering.ipynb) | Intro collaborative filtering example with redisvl | ### See also An exciting example of how Redis can power production-ready systems is highlighted in our collaboration with [NVIDIA](https://developer.nvidia.com/blog/offline-to-online-feature-storage-for-real-time-recommendation-systems-with-nvidia-merlin/) to construct a state-of-the-art recommendation system. diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 09344699..45fae026 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -17,7 +17,7 @@ "source": [ "Recommendation systems are a common application of machine learning and serve many industries from e-commerce to music streaming platforms.\n", "\n", - "There are many different architechtures that can be followed to build a recommendation system.\n", + "There are many different architectures that can be followed to build a recommendation system. In a previous example notebook we demonstrated how to do [content filtering with RedisVL](content_filtering.ipynb). 
We encourage you to start there before diving into this notebook.\n", "\n", "In this notebook we'll demonstrate how to build a [collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering)\n", "recommendation system and use the large IMDB movies dataset as our example data.\n", @@ -268,7 +268,7 @@ } ], "source": [ - "# surprise casts userId and movieId to inner ids, so we have to use their mapping to now which rows to use\n", + "# surprise casts userId and movieId to inner ids, so we have to use their mapping to know which rows to use\n", "inner_uid = train_set.to_inner_uid(347) # userId\n", "inner_iid = train_set.to_inner_iid(5515) # movieId\n", "\n", @@ -582,8 +582,8 @@ "movies_df['overview'] = movies_df['overview'].fillna('')\n", "movies_df['popularity'] = movies_df['popularity'].fillna(0)\n", "movies_df['release_date'] = movies_df['release_date'].fillna('1900-01-01').apply(lambda x: datetime.datetime.strptime(x, \"%Y-%m-%d\").timestamp())\n", - "movies_df['revenue'] = movies_df['revenue'].fillna(0) # fill with average?\n", - "movies_df['runtime'] = movies_df['runtime'].fillna(0) # fill with average?\n", + "movies_df['revenue'] = movies_df['revenue'].fillna(0)\n", + "movies_df['runtime'] = movies_df['runtime'].fillna(0)\n", "movies_df['status'] = movies_df['status'].fillna('unknown')\n", "movies_df['tagline'] = movies_df['tagline'].fillna('')\n", "movies_df['title'] = movies_df['title'].fillna('')\n", @@ -1196,7 +1196,7 @@ "## Adding All the Bells & Whistles\n", "Vector search handles the bulk of our collaborative filtering recommendation system and is a great approach to generating personalized recommendations that are unique to each user.\n", "\n", - "To up our RecSys game even further we can leverage RedisVl filter logic to give more control to what users are shown. Why have only one feed of recommended movies when you can have several, each with its own theme and personalized to each user." 
+ "To up our RecSys game even further we can leverage RedisVL Filter logic to give more control to what users are shown. Why have only one feed of recommended movies when you can have several, each with its own theme and personalized to each user." ] }, { @@ -1428,7 +1428,7 @@ "metadata": {}, "source": [ "## Keeping Things Fresh\n", - "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. And it's more that likely that at least some of the recommendations we're expecting to be highly rated by a given user are ones they've already watched and rated highly.\n", + "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. And it's more than likely that at least some of the recommendations we're expecting to be highly rated by a given user are ones they've already watched and rated highly.\n", "\n", "Luckily Redis offers an easy answer to keeping recommendations new and interesting, and that answer is Bloom Filters." 
] From 39704ae09d73439192848579f3924c086fbc6a42 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Wed, 16 Oct 2024 15:15:37 -0700 Subject: [PATCH 11/12] stores user vector and watched list in Redis json --- .../collaborative_filtering.ipynb | 403 ++++++++++-------- 1 file changed, 216 insertions(+), 187 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 45fae026..0a00bbd1 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -180,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -230,7 +230,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8377 movies with feature vectors of size 100\n" + "we have 8415 movies with feature vectors of size 100\n" ] } ], @@ -263,7 +263,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.3640325071309123\n" + "the predicted rating of user 347 on movie 5515 is 1.5939846458534452\n" ] } ], @@ -690,7 +690,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [0.03713469204683083, 0.10796564373254629, 0.2...\n", + " [-0.12329348744399116, -0.03395287506133206, 0...\n", " \n", " \n", " 1\n", @@ -712,7 +712,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.010117012753361906, -0.03687474969254127, ...\n", + " [-0.20839075686685218, 0.2842778495633789, 0.2...\n", " \n", " \n", " 2\n", @@ -734,7 +734,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [0.13139654322372601, 0.14560140137289648, 0.1...\n", + " [-0.3250115780939791, 0.11093873287053337, 0.4...\n", " \n", " \n", " 3\n", @@ -756,7 +756,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [0.1564855291020289, -0.01096475924961168, 0.2...\n", + " [-0.08088437767077983, 0.1911468768682881, 0.2...\n", " \n", " \n", " 4\n", @@ -778,7 +778,7 @@ " 
5\n", " 113041\n", " 11862.0\n", - " [0.07205704581865023, 0.25224445082871455, 0.0...\n", + " [-0.007213409719480573, 0.20232376643634847, 0...\n", " \n", " \n", "\n", @@ -821,11 +821,11 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [0.03713469204683083, 0.10796564373254629, 0.2... \n", - "1 8844.0 [-0.010117012753361906, -0.03687474969254127, ... \n", - "2 15602.0 [0.13139654322372601, 0.14560140137289648, 0.1... \n", - "3 31357.0 [0.1564855291020289, -0.01096475924961168, 0.2... \n", - "4 11862.0 [0.07205704581865023, 0.25224445082871455, 0.0... " + "0 862.0 [-0.12329348744399116, -0.03395287506133206, 0... \n", + "1 8844.0 [-0.20839075686685218, 0.2842778495633789, 0.2... \n", + "2 15602.0 [-0.3250115780939791, 0.11093873287053337, 0.4... \n", + "3 31357.0 [-0.08088437767077983, 0.1911468768682881, 0.2... \n", + "4 11862.0 [-0.007213409719480573, 0.20232376643634847, 0... " ] }, "execution_count": 11, @@ -867,7 +867,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "15:33:21 redisvl.index.index INFO Index already exists, overwriting.\n" + "15:07:36 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -881,31 +881,24 @@ "movie_schema = IndexSchema.from_yaml(\"collaborative_filtering_schema.yaml\")\n", "\n", "movie_index = SearchIndex(movie_schema, redis_client=client)\n", - "movie_index.create(overwrite=True, drop=True)" + "movie_index.create(overwrite=True, drop=True)\n", + "\n", + "movie_keys = movie_index.load(movies_df.to_dict(orient='records'))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [ - "keys = movie_index.load(movies_df.to_dict(orient='records'))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8337\n", - "size of movie df 8337\n", - "unique movie ids 8331\n", - "unique movie titles 8100\n", + 
"number of movies 8370\n", + "size of movie df 8370\n", + "unique movie ids 8364\n", + "unique movie titles 8125\n", "unique movies rated 9065\n" ] }, @@ -972,7 +965,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [0.03713469204683083, 0.10796564373254629, 0.2...\n", + " [-0.12329348744399116, -0.03395287506133206, 0...\n", " \n", " \n", " 1\n", @@ -994,7 +987,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.010117012753361906, -0.03687474969254127, ...\n", + " [-0.20839075686685218, 0.2842778495633789, 0.2...\n", " \n", " \n", " 2\n", @@ -1016,7 +1009,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [0.13139654322372601, 0.14560140137289648, 0.1...\n", + " [-0.3250115780939791, 0.11093873287053337, 0.4...\n", " \n", " \n", " 3\n", @@ -1038,7 +1031,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [0.1564855291020289, -0.01096475924961168, 0.2...\n", + " [-0.08088437767077983, 0.1911468768682881, 0.2...\n", " \n", " \n", " 4\n", @@ -1060,7 +1053,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [0.07205704581865023, 0.25224445082871455, 0.0...\n", + " [-0.007213409719480573, 0.20232376643634847, 0...\n", " \n", " \n", "\n", @@ -1103,14 +1096,14 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [0.03713469204683083, 0.10796564373254629, 0.2... \n", - "1 8844.0 [-0.010117012753361906, -0.03687474969254127, ... \n", - "2 15602.0 [0.13139654322372601, 0.14560140137289648, 0.1... \n", - "3 31357.0 [0.1564855291020289, -0.01096475924961168, 0.2... \n", - "4 11862.0 [0.07205704581865023, 0.25224445082871455, 0.0... " + "0 862.0 [-0.12329348744399116, -0.03395287506133206, 0... \n", + "1 8844.0 [-0.20839075686685218, 0.2842778495633789, 0.2... \n", + "2 15602.0 [-0.3250115780939791, 0.11093873287053337, 0.4... \n", + "3 31357.0 [-0.08088437767077983, 0.1911468768682881, 0.2... \n", + "4 11862.0 [-0.007213409719480573, 0.20232376643634847, 0... 
" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1134,6 +1127,35 @@ "movies_df.head()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For a complete solution we'll store the user vectors and their watched list in Redis also. We won't be searching over these user vectors so no need to define an index for them. A direct JSON look up will suffice." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from redis.commands.json.path import Path\n", + "\n", + "# use a Redis pipeline to store user data and verify it in a single transaction\n", + "with client.pipeline() as pipe:\n", + " for user_id, user_vector in user_vectors_and_ids.items():\n", + " user_key = f\"user:{user_id}\"\n", + " watched_list_ids = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", + "\n", + " user_data = {\n", + " \"user_vector\": user_vector,\n", + " \"watched_list_ids\": watched_list_ids\n", + " }\n", + " pipe.json().set(user_key, Path.root_path(), user_data)\n", + " pipe.execute()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1154,25 +1176,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:123a01ce087f4d09a833970c182f0eb2', 'vector_distance': '-2.13837456703', 'title': 'A Close Shave', 'genres': '[\"Family\",\"Animation\",\"Comedy\"]'}\n", - "{'id': 'movie:f6fb0a03ca0c41a4b1d63249ede39d2f', 'vector_distance': '-2.11249995232', 'title': \"Schindler's List\", 'genres': '[\"Drama\",\"History\",\"War\"]'}\n", - "{'id': 'movie:4d302b9754534983bf70b2304d04633e', 'vector_distance': '-2.09581518173', 'title': 'The African Queen', 'genres': '[\"Adventure\",\"War\",\"Romance\"]'}\n", - "{'id': 'movie:3eb10be0511641e48c41bb2de628bf6f', 'vector_distance': '-2.08978199959', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 
'movie:6206dd42b51048edb819adc5fbe07ba7', 'vector_distance': '-2.07609891891', 'title': 'Forrest Gump', 'genres': '[\"Comedy\",\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:905af238977f40f793080388d0aa1380', 'vector_distance': '-2.05023360252', 'title': 'The Wrong Trousers', 'genres': '[\"Animation\",\"Comedy\",\"Family\"]'}\n", - "{'id': 'movie:ad7f5971e4b64a44a318e3e48105a114', 'vector_distance': '-2.03544998169', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:ac19e02ffd4c4833abdd3ecc4702abe9', 'vector_distance': '-1.98030018806', 'title': 'Monty Python and the Holy Grail', 'genres': '[\"Adventure\",\"Comedy\",\"Fantasy\"]'}\n", - "{'id': 'movie:7d6b88e1d652486f96756fc2b5a7f087', 'vector_distance': '-1.98028421402', 'title': 'Mad Max 2: The Road Warrior', 'genres': '[\"Adventure\",\"Action\",\"Thriller\",\"Science Fiction\"]'}\n", - "{'id': 'movie:595fd594f6c8406e91d7baa6bc63efdb', 'vector_distance': '-1.96802783012', 'title': 'Fargo', 'genres': '[\"Crime\",\"Drama\",\"Thriller\"]'}\n", - "{'id': 'movie:04e884d71097481c8c07d8babb723f02', 'vector_distance': '-1.93948292732', 'title': 'Roger & Me', 'genres': '[\"Documentary\",\"History\"]'}\n", - "{'id': 'movie:015dcb2c5d30445787c5392ac551abbb', 'vector_distance': '-1.92847204208', 'title': 'The Imitation Game', 'genres': '[\"History\",\"Drama\",\"Thriller\",\"War\"]'}\n" + "{'id': 'movie:255865ce253c4b7bbefaff7884035b0c', 'vector_distance': '-3.8687338829', 'title': 'Spirited Away', 'genres': '[\"Fantasy\",\"Adventure\",\"Animation\",\"Family\"]'}\n", + "{'id': 'movie:c833029c842143fdaf7bb5acedb051ce', 'vector_distance': '-3.73652648926', 'title': 'The Princess Bride', 'genres': '[\"Adventure\",\"Family\",\"Fantasy\",\"Comedy\",\"Romance\"]'}\n", + "{'id': 'movie:cf48a5443467433ca57c1741104cf123', 'vector_distance': '-3.66395378113', 'title': 'The Usual Suspects', 'genres': '[\"Drama\",\"Crime\",\"Thriller\"]'}\n", + "{'id': 
'movie:a8707fc2440043a78e1b7ee92c5038cf', 'vector_distance': '-3.62124490738', 'title': 'The Shawshank Redemption', 'genres': '[\"Drama\",\"Crime\"]'}\n", + "{'id': 'movie:772b299da4e8427082e13fc542c80a9e', 'vector_distance': '-3.59598970413', 'title': 'A Beautiful Mind', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:9818b2cc529f4ef8af6b1e42618c7e19', 'vector_distance': '-3.57971763611', 'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:cd43f8fee7024fc0a3edd2cb155491cf', 'vector_distance': '-3.54007005692', 'title': 'The Empire Strikes Back', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:ca1c554ca7ef4da29e1a543147da54d9', 'vector_distance': '-3.53854608536', 'title': 'Like Water for Chocolate', 'genres': '[\"Drama\",\"Romance\"]'}\n", + "{'id': 'movie:f03df438a38349a4992222b6d37e81eb', 'vector_distance': '-3.4644536972', 'title': 'Roger & Me', 'genres': '[\"Documentary\",\"History\"]'}\n", + "{'id': 'movie:e1b6f8ad41d2425a8b470f0d206038bf', 'vector_distance': '-3.45273590088', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", + "{'id': 'movie:1d2091b80efd4052b2c54390f8f25172', 'vector_distance': '-3.44259595871', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", + "{'id': 'movie:b41bb158cd0b4362955e3800fd2cfb9d', 'vector_distance': '-3.40954303741', 'title': 'The Lord of the Rings: The Two Towers', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n" ] } ], "source": [ "from redisvl.query import RangeQuery\n", "\n", - "user_vector = user_vectors[352].tolist()\n", + "user_vector = client.json().get(f\"user:{352}\")[\"user_vector\"]\n", "\n", "# the distance metric 'ip' inner product is computing \"score = 1 - u * v\" and returning the minimum, which corresponds to the max of \"u * v\"\n", "# this is what we want. 
The predicted rating on a scale of 0 to 5 is then -(score - 1) == -score + 1\n", @@ -1209,7 +1231,7 @@ "from redisvl.query.filter import Tag, Num, Text\n", "\n", "def get_recommendations(user_id, filters=None, num_results=10):\n", - " user_vector = user_vectors[user_id].tolist()\n", + " user_vector = client.json().get(f\"user:{user_id}\")[\"user_vector\"]\n", " query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", " num_results=num_results,\n", @@ -1276,133 +1298,145 @@ " \n", " 0\n", " The Shawshank Redemption\n", - " Good Will Hunting\n", - " Yojimbo\n", + " The Godfather\n", + " Cinema Paradiso\n", " The Shawshank Redemption\n", " Yojimbo\n", - " A Clockwork Orange\n", + " What's Eating Gilbert Grape\n", " \n", " \n", " 1\n", - " Yojimbo\n", - " Annie Hall\n", - " Monty Python and the Holy Grail\n", + " Cinema Paradiso\n", + " The Lord of the Rings: The Fellowship of the Ring\n", + " The Godfather\n", " The Dark Knight\n", - " My Neighbor Totoro\n", - " Pineapple Express\n", + " La Haine\n", + " The Grapes of Wrath\n", " \n", " \n", " 2\n", - " Monty Python and the Holy Grail\n", - " Indiana Jones and the Last Crusade\n", - " Raising Arizona\n", + " Band of Brothers\n", + " Schindler's List\n", + " The Empire Strikes Back\n", " Pulp Fiction\n", - " The Meaning of Life\n", - " What's Eating Gilbert Grape\n", + " The Postman\n", + " A Clockwork Orange\n", " \n", " \n", " 3\n", - " Big Night\n", - " The Graduate\n", - " To Kill a Mockingbird\n", - " Blade Runner\n", - " Rebel Without a Cause\n", - " James and the Giant Peach\n", + " A Grand Day Out\n", + " The Empire Strikes Back\n", + " Star Wars\n", + " Whiplash\n", + " Seven Samurai\n", + " Bananas\n", " \n", " \n", " 4\n", - " Raising Arizona\n", - " Rear Window\n", - " Annie Hall\n", - " Fight Club\n", - " The Professional\n", - " The Grapes of Wrath\n", + " The Godfather\n", + " The Lord of the Rings: The Two Towers\n", + " The Philadelphia Story\n", + " Blade Runner\n", + " 
Shine\n", + " Pineapple Express\n", " \n", " \n", " 5\n", - " Ed Wood\n", - " Star Trek\n", - " Indiana Jones and the Last Crusade\n", - " The Avengers\n", - " Sanjuro\n", - " Bananas\n", + " The Lord of the Rings: The Fellowship of the Ring\n", + " Star Wars\n", + " Mr. Smith Goes to Washington\n", + " Big Hero 6\n", + " My Neighbor Totoro\n", + " James and the Giant Peach\n", " \n", " \n", " 6\n", - " Good Will Hunting\n", - " American Beauty\n", - " The Graduate\n", - " Whiplash\n", - " All About Eve\n", + " Schindler's List\n", + " The Fugitive\n", + " Empire of the Sun\n", + " Fight Club\n", + " The Professional\n", " The Apple Dumpling Gang\n", " \n", " \n", " 7\n", - " To Kill a Mockingbird\n", - " Schindler's List\n", - " Rear Window\n", - " Big Hero 6\n", - " All Quiet on the Western Front\n", + " The Empire Strikes Back\n", + " The Matrix\n", + " Stand by Me\n", + " The Avengers\n", + " All About Eve\n", " Orange County\n", " \n", " \n", " 8\n", - " Annie Hall\n", - " Twelve Monkeys\n", - " The Bridge on the River Kwai\n", - " Gone Girl\n", - " Cowboy Bebop: The Movie\n", - " Herbie Goes Bananas\n", + " The Lord of the Rings: The Two Towers\n", + " The Dark Knight\n", + " The Princess Bride\n", + " Guardians of the Galaxy\n", + " Rebel Without a Cause\n", + " The Apple Dumpling Gang Rides Again\n", " \n", " \n", " 9\n", - " Indiana Jones and the Last Crusade\n", - " The Princess Bride\n", - " Roger & Me\n", - " Guardians of the Galaxy\n", - " City Lights\n", - " Adam's Apples\n", + " The Usual Suspects\n", + " Pulp Fiction\n", + " Raiders of the Lost Ark\n", + " Gone Girl\n", + " Bicycle Thieves\n", + " Herbie Goes Bananas\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 The Shawshank Redemption Good Will Hunting \n", - "1 Yojimbo Annie Hall \n", - "2 Monty Python and the Holy Grail Indiana Jones and the Last Crusade \n", - "3 Big Night The Graduate \n", - "4 Raising Arizona Rear Window \n", - "5 Ed Wood Star Trek \n", 
- "6 Good Will Hunting American Beauty \n", - "7 To Kill a Mockingbird Schindler's List \n", - "8 Annie Hall Twelve Monkeys \n", - "9 Indiana Jones and the Last Crusade The Princess Bride \n", + " top picks \\\n", + "0 The Shawshank Redemption \n", + "1 Cinema Paradiso \n", + "2 Band of Brothers \n", + "3 A Grand Day Out \n", + "4 The Godfather \n", + "5 The Lord of the Rings: The Fellowship of the Ring \n", + "6 Schindler's List \n", + "7 The Empire Strikes Back \n", + "8 The Lord of the Rings: The Two Towers \n", + "9 The Usual Suspects \n", "\n", - " classics what's popular \\\n", - "0 Yojimbo The Shawshank Redemption \n", - "1 Monty Python and the Holy Grail The Dark Knight \n", - "2 Raising Arizona Pulp Fiction \n", - "3 To Kill a Mockingbird Blade Runner \n", - "4 Annie Hall Fight Club \n", - "5 Indiana Jones and the Last Crusade The Avengers \n", - "6 The Graduate Whiplash \n", - "7 Rear Window Big Hero 6 \n", - "8 The Bridge on the River Kwai Gone Girl \n", - "9 Roger & Me Guardians of the Galaxy \n", + " block busters \\\n", + "0 The Godfather \n", + "1 The Lord of the Rings: The Fellowship of the Ring \n", + "2 Schindler's List \n", + "3 The Empire Strikes Back \n", + "4 The Lord of the Rings: The Two Towers \n", + "5 Star Wars \n", + "6 The Fugitive \n", + "7 The Matrix \n", + "8 The Dark Knight \n", + "9 Pulp Fiction \n", "\n", - " indie hits fruity films \n", - "0 Yojimbo A Clockwork Orange \n", - "1 My Neighbor Totoro Pineapple Express \n", - "2 The Meaning of Life What's Eating Gilbert Grape \n", - "3 Rebel Without a Cause James and the Giant Peach \n", - "4 The Professional The Grapes of Wrath \n", - "5 Sanjuro Bananas \n", - "6 All About Eve The Apple Dumpling Gang \n", - "7 All Quiet on the Western Front Orange County \n", - "8 Cowboy Bebop: The Movie Herbie Goes Bananas \n", - "9 City Lights Adam's Apples " + " classics what's popular \\\n", + "0 Cinema Paradiso The Shawshank Redemption \n", + "1 The Godfather The Dark Knight \n", + "2 The Empire 
Strikes Back Pulp Fiction \n", + "3 Star Wars Whiplash \n", + "4 The Philadelphia Story Blade Runner \n", + "5 Mr. Smith Goes to Washington Big Hero 6 \n", + "6 Empire of the Sun Fight Club \n", + "7 Stand by Me The Avengers \n", + "8 The Princess Bride Guardians of the Galaxy \n", + "9 Raiders of the Lost Ark Gone Girl \n", + "\n", + " indie hits fruity films \n", + "0 Yojimbo What's Eating Gilbert Grape \n", + "1 La Haine The Grapes of Wrath \n", + "2 The Postman A Clockwork Orange \n", + "3 Seven Samurai Bananas \n", + "4 Shine Pineapple Express \n", + "5 My Neighbor Totoro James and the Giant Peach \n", + "6 The Professional The Apple Dumpling Gang \n", + "7 All About Eve Orange County \n", + "8 Rebel Without a Cause The Apple Dumpling Gang Rides Again \n", + "9 Bicycle Thieves Herbie Goes Bananas " ] }, "execution_count": 17, @@ -1441,8 +1475,9 @@ "source": [ "# rewrite the get_recommendations() function to use a bloom filter and apply it before we return results\n", "def get_unique_recommendations(user_id, filters=None, num_results=10):\n", - " user_vector = user_vectors[user_id].tolist()\n", - " watched_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()\n", + " user_data = client.json().get(f\"user:{user_id}\")\n", + " user_vector = user_data[\"user_vector\"]\n", + " watched_movies = user_data[\"watched_list_ids\"]\n", "\n", " # filter out movies that the user has already watched\n", " client.bf().insert('user_watched_list', [f\"{user_id}:{movie_id}\" for movie_id in watched_movies])\n", @@ -1523,69 +1558,62 @@ " \n", " \n", " 0\n", + " Cinema Paradiso\n", + " Se7en\n", + " Mr. 
Smith Goes to Washington\n", + " Whiplash\n", " Yojimbo\n", - " Annie Hall\n", - " To Kill a Mockingbird\n", - " Blade Runner\n", - " My Neighbor Totoro\n", " \n", " \n", " 1\n", - " Monty Python and the Holy Grail\n", - " The Graduate\n", - " The Bridge on the River Kwai\n", - " Fight Club\n", - " The Meaning of Life\n", + " A Grand Day Out\n", + " Stand by Me\n", + " Empire of the Sun\n", + " Blade Runner\n", + " La Haine\n", " \n", " \n", " 2\n", - " Big Night\n", - " Rear Window\n", - " Roger & Me\n", - " Whiplash\n", - " Rebel Without a Cause\n", + " The Godfather\n", + " The Prestige\n", + " The Godfather: Part II\n", + " Big Hero 6\n", + " The Postman\n", " \n", " \n", " 3\n", - " Raising Arizona\n", - " American Beauty\n", - " Cinema Paradiso\n", - " Big Hero 6\n", - " The Professional\n", + " The Usual Suspects\n", + " The Princess Bride\n", + " Roger & Me\n", + " Fight Club\n", + " Seven Samurai\n", " \n", " \n", " 4\n", - " Ed Wood\n", - " Twelve Monkeys\n", - " Dr. Strangelove or: How I Learned to Stop Worr...\n", + " The Philadelphia Story\n", + " Rain Man\n", + " It Happened One Night\n", " Gone Girl\n", - " Sanjuro\n", + " Shine\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks block busters \\\n", - "0 Yojimbo Annie Hall \n", - "1 Monty Python and the Holy Grail The Graduate \n", - "2 Big Night Rear Window \n", - "3 Raising Arizona American Beauty \n", - "4 Ed Wood Twelve Monkeys \n", + " top picks block busters classics \\\n", + "0 Cinema Paradiso Se7en Mr. Smith Goes to Washington \n", + "1 A Grand Day Out Stand by Me Empire of the Sun \n", + "2 The Godfather The Prestige The Godfather: Part II \n", + "3 The Usual Suspects The Princess Bride Roger & Me \n", + "4 The Philadelphia Story Rain Man It Happened One Night \n", "\n", - " classics what's popular \\\n", - "0 To Kill a Mockingbird Blade Runner \n", - "1 The Bridge on the River Kwai Fight Club \n", - "2 Roger & Me Whiplash \n", - "3 Cinema Paradiso Big Hero 6 \n", - "4 Dr. 
Strangelove or: How I Learned to Stop Worr... Gone Girl \n", - "\n", - " indie hits \n", - "0 My Neighbor Totoro \n", - "1 The Meaning of Life \n", - "2 Rebel Without a Cause \n", - "3 The Professional \n", - "4 Sanjuro " + " what's popular indie hits \n", + "0 Whiplash Yojimbo \n", + "1 Blade Runner La Haine \n", + "2 Big Hero 6 The Postman \n", + "3 Fight Club Seven Samurai \n", + "4 Gone Girl Shine " ] }, "execution_count": 19, @@ -1602,7 +1630,7 @@ "all_recommendations[\"what's popular\"] = [m[0] for m in whats_popular]\n", "all_recommendations[\"indie hits\"] = [m[0] for m in indie_hits]\n", "\n", - "all_recommendations.head(10)" + "all_recommendations.head()" ] }, { @@ -1622,7 +1650,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4337 keys\n", + "Deleted 4370 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n", @@ -1632,7 +1660,7 @@ { "data": { "text/plain": [ - "1" + "671" ] }, "execution_count": 20, @@ -1645,7 +1673,8 @@ "while remaining := movie_index.clear():\n", " print(f\"Deleted {remaining} keys\")\n", "\n", - "client.delete(\"user_watched_list\")" + "client.delete(\"user_watched_list\")\n", + "client.delete(*[f\"user:{user_id}\" for user_id in user_vectors_and_ids.keys()])" ] } ], From 9eb40678785ff0fc4c3f1bb5f5b222026ded1e35 Mon Sep 17 00:00:00 2001 From: Justin Cechmanek Date: Fri, 18 Oct 2024 12:09:19 -0700 Subject: [PATCH 12/12] computes predicted rating from vector distance. 
Updates comments about tags vs bloom filters --- .../collaborative_filtering.ipynb | 354 +++++++++--------- 1 file changed, 179 insertions(+), 175 deletions(-) diff --git a/python-recipes/recommendation-systems/collaborative_filtering.ipynb b/python-recipes/recommendation-systems/collaborative_filtering.ipynb index 0a00bbd1..e96054d3 100644 --- a/python-recipes/recommendation-systems/collaborative_filtering.ipynb +++ b/python-recipes/recommendation-systems/collaborative_filtering.ipynb @@ -180,7 +180,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 5, @@ -210,7 +210,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that the the SVD algorithm has computed our `[U]` and `[M]` matrices - which are both really just lists of vectors - we can load them into our Redis instance.\n", + "Now that the SVD algorithm has computed our `[U]` and `[M]` matrices - which are both really just lists of vectors - we can load them into our Redis instance.\n", "\n", "The Surprise SVD model stores user and movie vectors in two attributes:\n", "\n", @@ -230,7 +230,7 @@ "output_type": "stream", "text": [ "we have 671 users with feature vectors of size 100\n", - "we have 8415 movies with feature vectors of size 100\n" + "we have 8397 movies with feature vectors of size 100\n" ] } ], @@ -263,7 +263,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "the predicted rating of user 347 on movie 5515 is 1.5939846458534452\n" + "the predicted rating of user 347 on movie 5515 is 1.1069607933289707\n" ] } ], @@ -690,7 +690,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [-0.12329348744399116, -0.03395287506133206, 0...\n", + " [0.12184447241197785, -0.16994406060791697, 0....\n", " \n", " \n", " 1\n", @@ -712,7 +712,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.20839075686685218, 0.2842778495633789, 0.2...\n", + " [0.14683581574270926, -0.06365576587872183, 0....\n", " \n", " \n", " 2\n", @@ -734,7 +734,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " 
[-0.3250115780939791, 0.11093873287053337, 0.4...\n", + " [0.16698051985699827, -0.02406109383254372, 0....\n", " \n", " \n", " 3\n", @@ -756,7 +756,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [-0.08088437767077983, 0.1911468768682881, 0.2...\n", + " [-0.10740791019437969, 0.09007945525146789, 0....\n", " \n", " \n", " 4\n", @@ -778,7 +778,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [-0.007213409719480573, 0.20232376643634847, 0...\n", + " [0.11311012532803581, 0.025998675845395405, 0....\n", " \n", " \n", "\n", @@ -821,11 +821,11 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [-0.12329348744399116, -0.03395287506133206, 0... \n", - "1 8844.0 [-0.20839075686685218, 0.2842778495633789, 0.2... \n", - "2 15602.0 [-0.3250115780939791, 0.11093873287053337, 0.4... \n", - "3 31357.0 [-0.08088437767077983, 0.1911468768682881, 0.2... \n", - "4 11862.0 [-0.007213409719480573, 0.20232376643634847, 0... " + "0 862.0 [0.12184447241197785, -0.16994406060791697, 0.... \n", + "1 8844.0 [0.14683581574270926, -0.06365576587872183, 0.... \n", + "2 15602.0 [0.16698051985699827, -0.02406109383254372, 0.... \n", + "3 31357.0 [-0.10740791019437969, 0.09007945525146789, 0.... \n", + "4 11862.0 [0.11311012532803581, 0.025998675845395405, 0.... 
" ] }, "execution_count": 11, @@ -867,7 +867,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "15:07:36 redisvl.index.index INFO Index already exists, overwriting.\n" + "12:05:35 redisvl.index.index INFO Index already exists, overwriting.\n" ] } ], @@ -895,10 +895,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "number of movies 8370\n", - "size of movie df 8370\n", - "unique movie ids 8364\n", - "unique movie titles 8125\n", + "number of movies 8358\n", + "size of movie df 8358\n", + "unique movie ids 8352\n", + "unique movie titles 8115\n", "unique movies rated 9065\n" ] }, @@ -965,7 +965,7 @@ " 1\n", " 114709\n", " 862.0\n", - " [-0.12329348744399116, -0.03395287506133206, 0...\n", + " [0.12184447241197785, -0.16994406060791697, 0....\n", " \n", " \n", " 1\n", @@ -987,7 +987,7 @@ " 2\n", " 113497\n", " 8844.0\n", - " [-0.20839075686685218, 0.2842778495633789, 0.2...\n", + " [0.14683581574270926, -0.06365576587872183, 0....\n", " \n", " \n", " 2\n", @@ -1009,7 +1009,7 @@ " 3\n", " 113228\n", " 15602.0\n", - " [-0.3250115780939791, 0.11093873287053337, 0.4...\n", + " [0.16698051985699827, -0.02406109383254372, 0....\n", " \n", " \n", " 3\n", @@ -1031,7 +1031,7 @@ " 4\n", " 114885\n", " 31357.0\n", - " [-0.08088437767077983, 0.1911468768682881, 0.2...\n", + " [-0.10740791019437969, 0.09007945525146789, 0....\n", " \n", " \n", " 4\n", @@ -1053,7 +1053,7 @@ " 5\n", " 113041\n", " 11862.0\n", - " [-0.007213409719480573, 0.20232376643634847, 0...\n", + " [0.11311012532803581, 0.025998675845395405, 0....\n", " \n", " \n", "\n", @@ -1096,11 +1096,11 @@ "4 Father of the Bride Part II 5.7 173 5 113041 \n", "\n", " tmdbId movie_vector \n", - "0 862.0 [-0.12329348744399116, -0.03395287506133206, 0... \n", - "1 8844.0 [-0.20839075686685218, 0.2842778495633789, 0.2... \n", - "2 15602.0 [-0.3250115780939791, 0.11093873287053337, 0.4... \n", - "3 31357.0 [-0.08088437767077983, 0.1911468768682881, 0.2... 
\n", - "4 11862.0 [-0.007213409719480573, 0.20232376643634847, 0... " + "0 862.0 [0.12184447241197785, -0.16994406060791697, 0.... \n", + "1 8844.0 [0.14683581574270926, -0.06365576587872183, 0.... \n", + "2 15602.0 [0.16698051985699827, -0.02406109383254372, 0.... \n", + "3 31357.0 [-0.10740791019437969, 0.09007945525146789, 0.... \n", + "4 11862.0 [0.11311012532803581, 0.025998675845395405, 0.... " ] }, "execution_count": 13, @@ -1176,18 +1176,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'id': 'movie:255865ce253c4b7bbefaff7884035b0c', 'vector_distance': '-3.8687338829', 'title': 'Spirited Away', 'genres': '[\"Fantasy\",\"Adventure\",\"Animation\",\"Family\"]'}\n", - "{'id': 'movie:c833029c842143fdaf7bb5acedb051ce', 'vector_distance': '-3.73652648926', 'title': 'The Princess Bride', 'genres': '[\"Adventure\",\"Family\",\"Fantasy\",\"Comedy\",\"Romance\"]'}\n", - "{'id': 'movie:cf48a5443467433ca57c1741104cf123', 'vector_distance': '-3.66395378113', 'title': 'The Usual Suspects', 'genres': '[\"Drama\",\"Crime\",\"Thriller\"]'}\n", - "{'id': 'movie:a8707fc2440043a78e1b7ee92c5038cf', 'vector_distance': '-3.62124490738', 'title': 'The Shawshank Redemption', 'genres': '[\"Drama\",\"Crime\"]'}\n", - "{'id': 'movie:772b299da4e8427082e13fc542c80a9e', 'vector_distance': '-3.59598970413', 'title': 'A Beautiful Mind', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:9818b2cc529f4ef8af6b1e42618c7e19', 'vector_distance': '-3.57971763611', 'title': 'Cinema Paradiso', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:cd43f8fee7024fc0a3edd2cb155491cf', 'vector_distance': '-3.54007005692', 'title': 'The Empire Strikes Back', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:ca1c554ca7ef4da29e1a543147da54d9', 'vector_distance': '-3.53854608536', 'title': 'Like Water for Chocolate', 'genres': '[\"Drama\",\"Romance\"]'}\n", - "{'id': 'movie:f03df438a38349a4992222b6d37e81eb', 'vector_distance': '-3.4644536972', 'title': 
'Roger & Me', 'genres': '[\"Documentary\",\"History\"]'}\n", - "{'id': 'movie:e1b6f8ad41d2425a8b470f0d206038bf', 'vector_distance': '-3.45273590088', 'title': 'The Lord of the Rings: The Fellowship of the Ring', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n", - "{'id': 'movie:1d2091b80efd4052b2c54390f8f25172', 'vector_distance': '-3.44259595871', 'title': 'Star Wars', 'genres': '[\"Adventure\",\"Action\",\"Science Fiction\"]'}\n", - "{'id': 'movie:b41bb158cd0b4362955e3800fd2cfb9d', 'vector_distance': '-3.40954303741', 'title': 'The Lord of the Rings: The Two Towers', 'genres': '[\"Adventure\",\"Fantasy\",\"Action\"]'}\n" + "vector distance: -3.63527393,\t predicted rating: 4.63527393,\t title: Fight Club, \n", + "vector distance: -3.60445881,\t predicted rating: 4.60445881,\t title: All About Eve, \n", + "vector distance: -3.60197020,\t predicted rating: 4.60197020,\t title: Lock, Stock and Two Smoking Barrels, \n", + "vector distance: -3.59518766,\t predicted rating: 4.59518766,\t title: Midnight in Paris, \n", + "vector distance: -3.58543396,\t predicted rating: 4.58543396,\t title: It Happened One Night, \n", + "vector distance: -3.54092789,\t predicted rating: 4.54092789,\t title: Anne Frank Remembered, \n", + "vector distance: -3.51044893,\t predicted rating: 4.51044893,\t title: Pulp Fiction, \n", + "vector distance: -3.50941706,\t predicted rating: 4.50941706,\t title: Raging Bull, \n", + "vector distance: -3.49180365,\t predicted rating: 4.49180365,\t title: Cool Hand Luke, \n", + "vector distance: -3.47437143,\t predicted rating: 4.47437143,\t title: Rear Window, \n", + "vector distance: -3.41378117,\t predicted rating: 4.41378117,\t title: The Usual Suspects, \n", + "vector distance: -3.40533876,\t predicted rating: 4.40533876,\t title: Princess Mononoke, \n" ] } ], @@ -1208,7 +1208,9 @@ "results = movie_index.query(query)\n", "\n", "for r in results:\n", - " print(r)" + " # compute our predicted rating on a scale of 0 to 5 from our vector 
distance\n", + " r['predicted_rating'] = - float(r['vector_distance']) + 1.\n", + " print(f\"vector distance: {float(r['vector_distance']):.08f},\\t predicted rating: {r['predicted_rating']:.08f},\\t title: {r['title']}, \")" ] }, { @@ -1298,145 +1300,145 @@ " \n", " 0\n", " The Shawshank Redemption\n", - " The Godfather\n", + " Forrest Gump\n", " Cinema Paradiso\n", " The Shawshank Redemption\n", - " Yojimbo\n", + " Castle in the Sky\n", " What's Eating Gilbert Grape\n", " \n", " \n", " 1\n", - " Cinema Paradiso\n", - " The Lord of the Rings: The Fellowship of the Ring\n", - " The Godfather\n", - " The Dark Knight\n", - " La Haine\n", - " The Grapes of Wrath\n", + " Forrest Gump\n", + " The Silence of the Lambs\n", + " The African Queen\n", + " Pulp Fiction\n", + " My Neighbor Totoro\n", + " A Clockwork Orange\n", " \n", " \n", " 2\n", - " Band of Brothers\n", - " Schindler's List\n", - " The Empire Strikes Back\n", + " Cinema Paradiso\n", " Pulp Fiction\n", - " The Postman\n", - " A Clockwork Orange\n", + " Raiders of the Lost Ark\n", + " The Dark Knight\n", + " All Quiet on the Western Front\n", + " The Grapes of Wrath\n", " \n", " \n", " 3\n", - " A Grand Day Out\n", + " Lock, Stock and Two Smoking Barrels\n", + " Raiders of the Lost Ark\n", " The Empire Strikes Back\n", - " Star Wars\n", - " Whiplash\n", - " Seven Samurai\n", - " Bananas\n", + " Fight Club\n", + " Army of Darkness\n", + " Pineapple Express\n", " \n", " \n", " 4\n", - " The Godfather\n", - " The Lord of the Rings: The Two Towers\n", - " The Philadelphia Story\n", - " Blade Runner\n", - " Shine\n", - " Pineapple Express\n", + " The African Queen\n", + " The Empire Strikes Back\n", + " Indiana Jones and the Last Crusade\n", + " Whiplash\n", + " All About Eve\n", + " James and the Giant Peach\n", " \n", " \n", " 5\n", - " The Lord of the Rings: The Fellowship of the Ring\n", + " The Silence of the Lambs\n", + " Indiana Jones and the Last Crusade\n", " Star Wars\n", - " Mr. 
Smith Goes to Washington\n", - " Big Hero 6\n", - " My Neighbor Totoro\n", - " James and the Giant Peach\n", + " Blade Runner\n", + " The Professional\n", + " Bananas\n", " \n", " \n", " 6\n", + " Pulp Fiction\n", " Schindler's List\n", - " The Fugitive\n", - " Empire of the Sun\n", - " Fight Club\n", - " The Professional\n", - " The Apple Dumpling Gang\n", + " The Manchurian Candidate\n", + " The Avengers\n", + " Shine\n", + " Orange County\n", " \n", " \n", " 7\n", - " The Empire Strikes Back\n", - " The Matrix\n", - " Stand by Me\n", - " The Avengers\n", - " All About Eve\n", - " Orange County\n", + " Raiders of the Lost Ark\n", + " The Lord of the Rings: The Return of the King\n", + " The Godfather: Part II\n", + " Guardians of the Galaxy\n", + " Yojimbo\n", + " Herbie Goes Bananas\n", " \n", " \n", " 8\n", + " The Empire Strikes Back\n", " The Lord of the Rings: The Two Towers\n", - " The Dark Knight\n", - " The Princess Bride\n", - " Guardians of the Galaxy\n", - " Rebel Without a Cause\n", - " The Apple Dumpling Gang Rides Again\n", + " Castle in the Sky\n", + " Gone Girl\n", + " Belle de Jour\n", + " The Apple Dumpling Gang\n", " \n", " \n", " 9\n", - " The Usual Suspects\n", - " Pulp Fiction\n", - " Raiders of the Lost Ark\n", - " Gone Girl\n", - " Bicycle Thieves\n", - " Herbie Goes Bananas\n", + " Indiana Jones and the Last Crusade\n", + " Terminator 2: Judgment Day\n", + " Back to the Future\n", + " Big Hero 6\n", + " Local Hero\n", + " Adam's Apples\n", " \n", " \n", "\n", "" ], "text/plain": [ - " top picks \\\n", - "0 The Shawshank Redemption \n", - "1 Cinema Paradiso \n", - "2 Band of Brothers \n", - "3 A Grand Day Out \n", - "4 The Godfather \n", - "5 The Lord of the Rings: The Fellowship of the Ring \n", - "6 Schindler's List \n", - "7 The Empire Strikes Back \n", - "8 The Lord of the Rings: The Two Towers \n", - "9 The Usual Suspects \n", + " top picks \\\n", + "0 The Shawshank Redemption \n", + "1 Forrest Gump \n", + "2 Cinema Paradiso \n", + "3 
Lock, Stock and Two Smoking Barrels \n", + "4 The African Queen \n", + "5 The Silence of the Lambs \n", + "6 Pulp Fiction \n", + "7 Raiders of the Lost Ark \n", + "8 The Empire Strikes Back \n", + "9 Indiana Jones and the Last Crusade \n", "\n", - " block busters \\\n", - "0 The Godfather \n", - "1 The Lord of the Rings: The Fellowship of the Ring \n", - "2 Schindler's List \n", - "3 The Empire Strikes Back \n", - "4 The Lord of the Rings: The Two Towers \n", - "5 Star Wars \n", - "6 The Fugitive \n", - "7 The Matrix \n", - "8 The Dark Knight \n", - "9 Pulp Fiction \n", + " block busters \\\n", + "0 Forrest Gump \n", + "1 The Silence of the Lambs \n", + "2 Pulp Fiction \n", + "3 Raiders of the Lost Ark \n", + "4 The Empire Strikes Back \n", + "5 Indiana Jones and the Last Crusade \n", + "6 Schindler's List \n", + "7 The Lord of the Rings: The Return of the King \n", + "8 The Lord of the Rings: The Two Towers \n", + "9 Terminator 2: Judgment Day \n", "\n", - " classics what's popular \\\n", - "0 Cinema Paradiso The Shawshank Redemption \n", - "1 The Godfather The Dark Knight \n", - "2 The Empire Strikes Back Pulp Fiction \n", - "3 Star Wars Whiplash \n", - "4 The Philadelphia Story Blade Runner \n", - "5 Mr. 
Smith Goes to Washington Big Hero 6 \n", - "6 Empire of the Sun Fight Club \n", - "7 Stand by Me The Avengers \n", - "8 The Princess Bride Guardians of the Galaxy \n", - "9 Raiders of the Lost Ark Gone Girl \n", + " classics what's popular \\\n", + "0 Cinema Paradiso The Shawshank Redemption \n", + "1 The African Queen Pulp Fiction \n", + "2 Raiders of the Lost Ark The Dark Knight \n", + "3 The Empire Strikes Back Fight Club \n", + "4 Indiana Jones and the Last Crusade Whiplash \n", + "5 Star Wars Blade Runner \n", + "6 The Manchurian Candidate The Avengers \n", + "7 The Godfather: Part II Guardians of the Galaxy \n", + "8 Castle in the Sky Gone Girl \n", + "9 Back to the Future Big Hero 6 \n", "\n", - " indie hits fruity films \n", - "0 Yojimbo What's Eating Gilbert Grape \n", - "1 La Haine The Grapes of Wrath \n", - "2 The Postman A Clockwork Orange \n", - "3 Seven Samurai Bananas \n", - "4 Shine Pineapple Express \n", - "5 My Neighbor Totoro James and the Giant Peach \n", - "6 The Professional The Apple Dumpling Gang \n", - "7 All About Eve Orange County \n", - "8 Rebel Without a Cause The Apple Dumpling Gang Rides Again \n", - "9 Bicycle Thieves Herbie Goes Bananas " + " indie hits fruity films \n", + "0 Castle in the Sky What's Eating Gilbert Grape \n", + "1 My Neighbor Totoro A Clockwork Orange \n", + "2 All Quiet on the Western Front The Grapes of Wrath \n", + "3 Army of Darkness Pineapple Express \n", + "4 All About Eve James and the Giant Peach \n", + "5 The Professional Bananas \n", + "6 Shine Orange County \n", + "7 Yojimbo Herbie Goes Bananas \n", + "8 Belle de Jour The Apple Dumpling Gang \n", + "9 Local Hero Adam's Apples " ] }, "execution_count": 17, @@ -1464,6 +1466,8 @@ "## Keeping Things Fresh\n", "You've probably noticed that a few movies get repeated in these lists. That's not surprising as all our results are personalized and things like `popularity` and `user_rating` and `revenue` are likely highly correlated. 
And it's more than likely that at least some of the recommendations we're expecting to be highly rated by a given user are ones they've already watched and rated highly.\n", "\n", + "We need a way to filter out movies that a user has already seen, and movies that we've already recommended to them before.\n", + "We could use a Tag filter on our queries to filter out movies by their id, but this gets cumbersome quickly.\n", "Luckily Redis offers an easy answer to keeping recommendations new and interesting, and that answer is Bloom Filters." ] }, @@ -1479,12 +1483,12 @@ " user_vector = user_data[\"user_vector\"]\n", " watched_movies = user_data[\"watched_list_ids\"]\n", "\n", - " # filter out movies that the user has already watched\n", + " # use a Bloom Filter to filter out movies that the user has already watched\n", " client.bf().insert('user_watched_list', [f\"{user_id}:{movie_id}\" for movie_id in watched_movies])\n", "\n", " query = RangeQuery(vector=user_vector,\n", " vector_field_name='movie_vector',\n", - " num_results=num_results * 5, # fetch more results to filter out watched movies\n", + " num_results=num_results * 5, # fetch more results to account for watched movies\n", " filter_expression=filters,\n", " return_fields=['title', 'overview', 'genres', 'movieId'],\n", " )\n", @@ -1502,7 +1506,7 @@ " return recommendations\n", "\n", "# example usage\n", - "# create a bloom filter for this user\n", + "# create a bloom filter for all our users\n", "try:\n", " client.bf().create(f\"user_watched_list\", 0.01, 10000)\n", "except Exception as e:\n", @@ -1559,41 +1563,41 @@ " \n", " 0\n", " Cinema Paradiso\n", - " Se7en\n", - " Mr. 
Smith Goes to Washington\n", - " Whiplash\n", - " Yojimbo\n", + " The Manchurian Candidate\n", + " Castle in the Sky\n", + " Fight Club\n", + " All Quiet on the Western Front\n", " \n", " \n", " 1\n", - " A Grand Day Out\n", - " Stand by Me\n", - " Empire of the Sun\n", - " Blade Runner\n", - " La Haine\n", + " Lock, Stock and Two Smoking Barrels\n", + " Toy Story\n", + " 12 Angry Men\n", + " Whiplash\n", + " Army of Darkness\n", " \n", " \n", " 2\n", - " The Godfather\n", - " The Prestige\n", + " The African Queen\n", " The Godfather: Part II\n", - " Big Hero 6\n", - " The Postman\n", + " My Neighbor Totoro\n", + " Blade Runner\n", + " All About Eve\n", " \n", " \n", " 3\n", - " The Usual Suspects\n", - " The Princess Bride\n", - " Roger & Me\n", - " Fight Club\n", - " Seven Samurai\n", + " The Silence of the Lambs\n", + " Back to the Future\n", + " It Happened One Night\n", + " Gone Girl\n", + " The Professional\n", " \n", " \n", " 4\n", - " The Philadelphia Story\n", - " Rain Man\n", - " It Happened One Night\n", - " Gone Girl\n", + " Eat Drink Man Woman\n", + " The Godfather\n", + " Stand by Me\n", + " Big Hero 6\n", " Shine\n", " \n", " \n", @@ -1601,19 +1605,19 @@ "" ], "text/plain": [ - " top picks block busters classics \\\n", - "0 Cinema Paradiso Se7en Mr. 
Smith Goes to Washington \n", - "1 A Grand Day Out Stand by Me Empire of the Sun \n", - "2 The Godfather The Prestige The Godfather: Part II \n", - "3 The Usual Suspects The Princess Bride Roger & Me \n", - "4 The Philadelphia Story Rain Man It Happened One Night \n", + " top picks block busters \\\n", + "0 Cinema Paradiso The Manchurian Candidate \n", + "1 Lock, Stock and Two Smoking Barrels Toy Story \n", + "2 The African Queen The Godfather: Part II \n", + "3 The Silence of the Lambs Back to the Future \n", + "4 Eat Drink Man Woman The Godfather \n", "\n", - " what's popular indie hits \n", - "0 Whiplash Yojimbo \n", - "1 Blade Runner La Haine \n", - "2 Big Hero 6 The Postman \n", - "3 Fight Club Seven Samurai \n", - "4 Gone Girl Shine " + " classics what's popular indie hits \n", + "0 Castle in the Sky Fight Club All Quiet on the Western Front \n", + "1 12 Angry Men Whiplash Army of Darkness \n", + "2 My Neighbor Totoro Blade Runner All About Eve \n", + "3 It Happened One Night Gone Girl The Professional \n", + "4 Stand by Me Big Hero 6 Shine " ] }, "execution_count": 19, @@ -1650,7 +1654,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Deleted 4370 keys\n", + "Deleted 4358 keys\n", "Deleted 2000 keys\n", "Deleted 1000 keys\n", "Deleted 500 keys\n",