Film: Recommender System

import matplotlib
# Compatibility shim: when matplotlib's RcParams class has no `_get` attribute,
# alias it to the plain `dict.get` so that calls to `RcParams._get` resolve.
# NOTE(review): presumably needed because something in this environment calls
# `RcParams._get` on a matplotlib version that does not provide it — confirm.
if not hasattr(matplotlib.RcParams, "_get"):
    matplotlib.RcParams._get = dict.get

Film: Recommender System

Vi importerer først de nødvendige kodepakker.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

Her defineres listerne med film og brugere samt datamatricen med anmeldelser.

# Movie titles. The position of a title in this list is the movie's index in
# the movie plots and in `movie_index` further down.
movies = [
    "Barbie", "Star Wars", "Avengers", "Løvernes Konge", "Druk",
    "Alene Hjemme", "Inderst Inde", "Love Actually", "Oppenheimer", "Ternet Ninja",
]

# Rating matrix: one row per user, one column per movie (columns follow the
# order of the `movies` list). Entries are 1, -1 or 0; 0 marks a movie the user
# has not seen and is replaced by NaN during preprocessing.
# Presumably 1 = liked and -1 = disliked — confirm against the data source.
# dtype=float so that NaN can be stored in copies of this matrix.
R = np.array([
    [1, 1, 0, -1, 0, 1, 0, -1, 1, 0],
    [0, 1, 1, 1, 1, -1, 0, -1, 0, 1],
    [-1, 0, 0, 1, 1, 1, 1, 1, 0, 1],
    [1, -1, 0, 1, 0, 1, 1, 1, 0, 1],
    [0, 0, 1, 1, 0, 1, 1, 0, 1, 1],
    [0, 1, 1, 0, 0, 1, 0, 0, 0, 1],
    [1, 0, 0, 1, 1, 1, 1, 0, 0, 1],
    [0, 1, 1, 1, 0, 1, -1, 0, 0, 1],
    [1, -1, 0, 1, 0, 1, 1, 1, 0, 1],
    [1, 1, 0, 1, 1, 1, 1, 0, -1, 1],
    [1, 1, 1, 1, 1, 1, 0, 0, 1, 1],
    [0, 1, 1, 0, 1, -1, 0, -1, 0, 1],
    [1, 0, 1, 1, 1, 1, -1, 0, -1, 1],
    [0, 1, 1, 0, 1, 1, 0, 1, 0, 1],
    [-1, 1, 1, 1, 0, 1, -1, -1, 0, 1],
    [1, 1, 0, 1, 0, 1, 1, 0, 1, 0],
    [0, 1, 0, 1, 0, 1, 1, -1, 1, 1],
    [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [0, 1, 1, 1, 1, 1, 1, 0, 1, 1],
    [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 0, 0, 1, 1, 1, 1, 1, 0, 1],
    [0, 0, 1, 1, 1, 0, 0, 1, 0, 1],
    [-1, 1, 1, 0, 1, 1, 1, 0, 1, -1],
    [1, 1, 1, 1, 0, 0, 1, 0, 0, 0],
    [-1, 1, 1, 1, 1, 1, 0, 0, 1, 1],
    [0, 1, 0, 1, 1, 1, 0, 1, 1, 0],
    [0, 1, 1, 0, 1, 1, -1, -1, 1, 1],
    [1, 0, 1, 0, 0, -1, 1, 0, 0, 0],
    [1, 1, 1, 0, 0, 1, 0, 0, 0, 1],
    [1, 1, 1, 1, 1, 1, 1, -1, 1, 1],
    [-1, 1, 1, 1, -1, 0, -1, -1, 1, 1],
    [-1, 1, 1, 1, -1, 1, 0, -1, 1, 0],
    [0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
    [-1, 1, 1, 1, 1, 1, 0, 0, 1, -1],
    [-1, 1, 1, 1, 1, 1, -1, -1, 1, 1],
    [-1, 1, 1, 0, 1, 1, 1, 0, 1, 1],
    [1, -1, 1, 1, -1, 1, 1, 0, 1, 1],
    [-1, 1, 1, 1, 0, -1, 1, 0, 1, 1],
    [1, 1, 0, 1, 0, 1, -1, 0, 0, 1],
    [1, 0, 1, 0, -1, 1, 1, 0, 1, 0],
    [1, 1, 1, 1, 0, 1, 1, -1, 0, 0],
    [1, 0, 1, 1, 0, 1, 1, 0, 0, 1],
    [0, 0, 1, 1, 1, 1, 0, 0, 1, 1],
    [0, 0, 0, 1, 1, 1, 1, 0, -1, 1],
    [1, 0, 1, 1, 1, 1, 1, 0, 0, -1],
    [-1, 0, 1, 1, 0, 1, 1, 0, 0, 1],
    [-1, -1, 0, 1, 0, 1, 0, 0, 0, 1],
    [1, 1, 0, 1, 1, 1, 1, 1, 0, 1],
    [0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
    [0, 1, 0, 1, 0, -1, 0, -1, 0, 0],
    [1, 0, 0, 1, 0, 1, 1, 0, -1, 1],
    [1, 1, 1, 0, 0, -1, -1, -1, 1, 1],
    [0, 1, 0, 1, 1, 1, 0, 1, 0, 1],
    [0, -1, -1, 1, 1, 1, 1, 0, 1, 1],
    [0, 0, 0, 0, 1, 1, 0, 1, 0, 1],
    [-1, 1, 1, 1, 0, -1, 1, 0, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 0, 1, 1],
    [-1, 1, 1, 1, 1, -1, -1, -1, 1, 1]
], dtype=float)

# Matrix dimensions: N rows (users) and M columns (movies).
N = R.shape[0]
M = R.shape[1]

# Display labels "U1" .. "U<N>"; the number in a label is the row index plus one.
users = ["U" + str(number) for number in range(1, N + 1)]

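Datamatricen centreres ved at trække gennemsnittet af alle afgivne anmeldelser fra. Ikke-sete film (værdien 0) indgår ikke i gennemsnittet og sættes til 0 i den centrerede matrix.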

# -----------------------------
# Preprocessing
# -----------------------------
# Mark unseen movies (rating 0) as missing so they are left out of the mean.
R_nan = np.where(R == 0, np.nan, R)

# Centre the observed ratings on the mean taken over all observed ratings.
global_mean = np.nanmean(R_nan)
R_centered = R_nan - global_mean

# Missing entries become 0 in the centred matrix, so the result contains no NaN.
R_filled = np.nan_to_num(R_centered, nan=0.0)

Vi beregner SVD på den centrerede data-matrix.

# -----------------------------
# SVD
# -----------------------------
# Reduced SVD of the centred rating matrix (full_matrices=False); U has one
# row per row of R_filled and Vt has one column per column of R_filled.
U, S, Vt = np.linalg.svd(R_filled, full_matrices=False)

# Transpose so that each row of V corresponds to one column (movie) of R_filled.
V = np.transpose(Vt)

Vi kan vælge en bruger og undersøge lighed (similarity) til andre brugere. Der kan eksperimenteres med forskellige valg af brugere.

# -----------------------------
# User similarity
# -----------------------------
# EXPERIMENT WITH DIFFERENT USERS HERE:
# 0-based row index into R, so 11 selects the user labelled "U12".
user_index = 11

# Number of leading columns of U (and, further down, of V) used for similarity.
k = 3

# Pairwise cosine similarity between the users' first k coordinates in U.
SimUsers = cosine_similarity(U[:, :k])
# Zero the self-similarity on the diagonal so it does not dominate the plots.
np.fill_diagonal(SimUsers, 0)

plt.figure()
plt.scatter(range(1, N + 1), SimUsers[user_index, :], s=100, edgecolor='k')
step = 3
# Points sit at x = 1..N, so the tick at x = i + 1 carries the label users[i].
plt.xticks(range(1, N + 1, step), [users[i] for i in range(0, N, step)], rotation=45)
plt.xlabel('Users')
plt.ylabel('Similarity')
plt.title(f'Similar users to {users[user_index]}')
plt.grid(True, which='both')
plt.axis([0, N + 1, -1, 1])
plt.tight_layout()
plt.show()

# Exclude the selected user from the search. The zeroed diagonal alone is not
# enough: if every other similarity is negative, argmax would pick the 0 on the
# diagonal and report the user as their own closest user.
candidate_sims = SimUsers[user_index, :].copy()
candidate_sims[user_index] = -np.inf
closest_user_ind = np.argmax(candidate_sims)
print('-' * 80)
print(f"Selected user: {users[user_index]}, Movies: {R[user_index, :]}")
print(f"Closest user: {users[closest_user_ind]}, Movies: {R[closest_user_ind, :]}")
print('-' * 80)

Ligeledes kan vi se på ligheden mellem filmene.

# -----------------------------
# Movie similarity
# -----------------------------
# EXPERIMENT WITH DIFFERENT MOVIES HERE:
movie_index = 2  # 0-based index into `movies`

# Pairwise cosine similarity between the movies' first k coordinates in V;
# the diagonal (self-similarity) is set to 0.
SimMovies = cosine_similarity(V[:, :k])
np.fill_diagonal(SimMovies, 0)

# Similarities of the selected movie to every movie.
movie_sims = SimMovies[movie_index, :]

plt.figure()
plt.scatter(range(1, M + 1), movie_sims, s=100, edgecolor='k')
plt.xticks(range(1, M + 1), movies, rotation=45)
plt.xlabel('Movies')
plt.ylabel('Similarity')
plt.title(f'Similar movies to {movies[movie_index]}')
plt.grid(True, which='both')
plt.axis([0, M + 1, -1, 1])
plt.tight_layout()
plt.show()

# Indices ordered from most to least similar; the selected movie is dropped
# from the printed list.
sorted_inds = np.argsort(movie_sims)[::-1]
similar_movies = [movies[i] for i in sorted_inds if i != movie_index]
separator = '-' * 80
print(separator)
print(f"Selected movie: {movies[movie_index]}")
print(f"Similar movies in order: {', '.join(similar_movies)}")
print(separator)

Vi kan bruge heatmaps til at se på ligheden mellem alle brugere og mellem alle film på én gang.

# -----------------------------
# Heatmaps
# -----------------------------
# Movie-by-movie similarity, with the colour scale fixed to [-1, 1].
plt.figure()
plt.imshow(SimMovies, cmap='bwr', vmin=-1, vmax=1)
plt.colorbar()
movie_ticks = range(M)
plt.xticks(movie_ticks, movies, rotation=45)
plt.yticks(movie_ticks, movies)
plt.tight_layout()
plt.show()

# User-by-user similarity; only every `step`-th user gets a tick label.
plt.figure()
plt.imshow(SimUsers, cmap='bwr', vmin=-1, vmax=1)
plt.colorbar()
step = 3
user_ticks = range(0, N, step)
user_tick_labels = [users[i] for i in user_ticks]
plt.xticks(user_ticks, user_tick_labels, rotation=90)
plt.yticks(user_ticks, user_tick_labels)
plt.tight_layout()
plt.show()

Vi kan se lighederne mere tydeligt, når vi projicerer data ned på et lavere-dimensionelt rum – her \(2\) dimensioner – som er udspændt af de to første principale komponenter (Principal Components).

# -----------------------------
# Low-dimensional embeddings
# -----------------------------
# Number of leading columns of V and U to keep; the plot uses the first two.
dim = 2
# When True, movie coordinates are scaled to unit length before plotting.
normalize_emb = False

# Movie coordinates: first `dim` columns of V, optionally normalised per row.
VS = V[:, :dim].copy()
if normalize_emb:
    VS /= np.linalg.norm(VS, axis=1, keepdims=True)

# Random label displacement of length 0.1 per movie. The generator is not
# seeded, so label positions differ from run to run.
offset = np.random.randn(M, dim)
offset = 0.1 * offset / np.linalg.norm(offset, axis=1, keepdims=True)

plt.figure()
plt.scatter(VS[:, 0], VS[:, 1], s=100, edgecolor='k')
for (x, y), title in zip(VS[:, :2] + offset[:, :2], movies):
    plt.text(x, y, title, fontsize=12)

# User coordinates: first `dim` columns of U, scaled to unit length per row.
# Unlike the movies, users are always normalised, independent of
# `normalize_emb`; a second normalisation under that flag would be a no-op.
# NOTE(review): confirm that this asymmetry between movies and users is intended.
US = U[:, :dim].copy()
US /= np.linalg.norm(US, axis=1, keepdims=True)

# Random label displacement of length 0.15 per user.
offset = np.random.randn(N, dim)
offset = 0.15 * offset / np.linalg.norm(offset, axis=1, keepdims=True)

plt.scatter(US[:, 0], US[:, 1], s=100, edgecolor='k')
for (x, y), label in zip(US[:, :2] + offset[:, :2], users):
    plt.text(x, y, label, fontsize=12)

plt.axis([-1.25, 1.25, -1.25, 1.25])
plt.grid(True, which='both')
plt.tight_layout()
plt.show()