Show/Hide the code
1
2
3
4
| from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)
|
Show/Hide the code
1
2
3
4
5
6
7
# Load the raw ratings. The file is tab-separated and has no header row,
# so the column names are supplied here.
rating_columns = ["user", "movie", "rating", "timestamp"]
ratings = pd.read_csv(path / "u.data", sep="\t", header=None, names=rating_columns)
ratings.head()
|
| user | movie | rating | timestamp |
|---|
| 0 | 196 | 242 | 3 | 881250949 |
| 1 | 186 | 302 | 3 | 891717742 |
| 2 | 22 | 377 | 1 | 878887116 |
| 3 | 244 | 51 | 2 | 880606923 |
| 4 | 166 | 346 | 1 | 886397596 |
Show/Hide the code
1
2
# Reshape the long ratings table into a user x movie matrix: one row per user,
# one column per movie, the rating as the cell value. Pairs with no rating
# show up as NaN in the result.
pivot = ratings.pivot_table(values="rating", index="user", columns="movie")
pivot
|
| movie | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | 1682 |
|---|
| user | | | | | | | | | | | | | | | | | | | | | |
| 1 | 5.0 | 3.0 | 4.0 | 3.0 | 3.0 | 5.0 | 4.0 | 1.0 | 5.0 | 3.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 4.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 4.0 | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 939 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 940 | NaN | NaN | NaN | 2.0 | NaN | NaN | 4.0 | 5.0 | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 941 | 5.0 | NaN | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 942 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 943 | NaN | 5.0 | NaN | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
943 rows × 1682 columns
Show/Hide the code
1
2
3
4
# Count the non-missing ratings per user (rows) and per movie (columns).
has_rating = pivot.notna()
dense_rows = has_rating.sum(axis=1)
dense_cols = has_rating.sum(axis=0)

# Keep the 20 users and the 20 movies with the most ratings, then print that
# sub-matrix with missing ratings shown as blanks.
busiest_users = dense_rows.nlargest(20).index
most_rated_movies = dense_cols.nlargest(20).index
selected = pivot.loc[busiest_users, most_rated_movies]
print(selected.fillna(" ").to_string())
|
movie 50 258 100 181 294 286 288 1 300 121 174 127 56 7 98 237 117 172 222 204
user
405 5.0 5.0 5.0 5.0 5.0 4.0 4.0 5.0 5.0
655 4.0 2.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 3.0 5.0 3.0 3.0 4.0 3.0 2.0 4.0 2.0 3.0
13 5.0 4.0 5.0 5.0 2.0 3.0 1.0 3.0 1.0 5.0 4.0 5.0 5.0 2.0 4.0 5.0 3.0 5.0 3.0 5.0
450 5.0 4.0 4.0 4.0 4.0 4.0 3.0 4.0 4.0 3.0 5.0 5.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0 4.0
276 5.0 5.0 5.0 5.0 4.0 4.0 5.0 4.0 4.0 5.0 5.0 5.0 5.0 5.0 5.0 4.0 5.0 4.0 5.0
416 5.0 5.0 5.0 5.0 4.0 5.0 5.0 5.0 4.0 5.0 5.0 5.0 5.0 4.0 5.0 3.0 5.0 5.0 5.0
537 4.0 4.0 4.0 2.0 1.0 3.0 2.0 2.0 1.0 1.0 3.0 5.0 5.0 4.0 3.0 3.0 2.0 3.0 2.0 3.0
303 5.0 4.0 5.0 5.0 4.0 5.0 4.0 5.0 1.0 3.0 5.0 5.0 5.0 4.0 5.0 5.0 3.0 5.0 3.0 4.0
234 4.0 2.0 4.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 4.0 3.0 2.0 4.0 3.0 2.0 3.0 3.0 2.0
393 5.0 4.0 1.0 4.0 4.0 3.0 3.0 4.0 2.0 4.0 4.0 4.0 5.0 4.0 4.0
181 3.0 3.0 2.0 1.0 4.0 3.0 3.0 4.0 4.0 5.0 2.0 4.0
279 3.0 4.0 3.0 2.0 3.0 3.0 4.0 4.0 4.0 5.0 5.0 2.0 1.0 3.0
429 5.0 4.0 5.0 5.0 3.0 3.0 3.0 3.0 4.0 4.0 4.0 2.0 4.0 3.0 4.0 5.0 4.0 4.0
846 5.0 3.0 5.0 3.0 4.0 5.0 5.0 5.0 4.0 4.0 3.0
7 5.0 4.0 5.0 3.0 1.0 4.0 4.0 4.0 5.0 5.0 5.0 5.0 5.0 4.0 5.0 4.0 5.0
94 5.0 5.0 5.0 4.0 4.0 3.0 4.0 2.0 4.0 5.0 5.0 4.0 4.0 4.0 3.0 4.0
682 5.0 3.0 3.0 5.0 3.0 4.0 4.0 2.0 4.0 4.0 5.0 4.0 4.0 4.0 3.0 4.0 5.0 4.0 3.0
308 5.0 5.0 4.0 3.0 4.0 4.0 3.0 4.0 4.0 5.0 4.0 3.0 3.0 3.0 4.0 4.0
92 5.0 4.0 5.0 4.0 3.0 3.0 4.0 5.0 5.0 5.0 4.0 5.0 4.0 4.0 4.0 4.0 4.0
293 5.0 3.0 4.0 3.0 2.0 3.0 3.0 2.0 2.0 3.0 5.0 5.0 4.0 3.0 4.0 3.0 3.0 5.0 3.0 3.0
Show/Hide the code
1
2
3
4
5
6
7
8
9
# Load the movie catalogue. The file is pipe-separated, latin-1 encoded and
# has no header row; only its first two columns (movie id and title) are kept.
movies = pd.read_csv(
    path / "u.item",
    sep="|",
    encoding="latin-1",
    header=None,
    usecols=[0, 1],
    names=["movie", "title"],
)
movies.head()
|
| movie | title |
|---|
| 0 | 1 | Toy Story (1995) |
| 1 | 2 | GoldenEye (1995) |
| 2 | 3 | Four Rooms (1995) |
| 3 | 4 | Get Shorty (1995) |
| 4 | 5 | Copycat (1995) |
Show/Hide the code
1
2
# Attach each movie's title to its ratings. The join key is named explicitly:
# without `on=`, merge joins on every column name the two frames share, which
# would silently change if either frame gained another common column.
ratings = ratings.merge(movies, on="movie")
ratings.head()
|
| user | movie | rating | timestamp | title |
|---|
| 0 | 196 | 242 | 3 | 881250949 | Kolya (1996) |
| 1 | 186 | 302 | 3 | 891717742 | L.A. Confidential (1997) |
| 2 | 22 | 377 | 1 | 878887116 | Heavyweights (1994) |
| 3 | 244 | 51 | 2 | 880606923 | Legends of the Fall (1994) |
| 4 | 166 | 346 | 1 | 886397596 | Jackie Brown (1997) |
Show/Hide the code
1
2
3
4
5
6
| # How the columns are chosen:
# by default, `CollabDataLoaders.from_df` uses the first column as the user,
# the second column as the item and the third column as the rating.
# Here we pass `item_name="title"` so that the movie title, rather than the numeric id in the second column, is used as the item.
dls = CollabDataLoaders.from_df(ratings, item_name="title", bs=64)
dls.show_batch()
|
| user | title | rating |
|---|
| 0 | 597 | Godfather, The (1972) | 4 |
| 1 | 814 | Evil Dead II (1987) | 2 |
| 2 | 234 | Mother (1996) | 2 |
| 3 | 176 | Cop Land (1997) | 3 |
| 4 | 778 | Cool Runnings (1993) | 1 |
| 5 | 521 | Die Hard 2 (1990) | 4 |
| 6 | 904 | Bed of Roses (1996) | 5 |
| 7 | 151 | Independence Day (ID4) (1996) | 5 |
| 8 | 880 | Devil's Own, The (1997) | 2 |
| 9 | 15 | Peacemaker, The (1997) | 3 |
Show/Hide the code
1
2
3
4
5
6
| n_users = len(dls.classes["user"])
n_movies = len(dls.classes["title"])
n_factors = 5
user_factors = torch.randn(n_users, n_factors)
movie_factors = torch.randn(n_movies, n_factors)
|
Show/Hide the code
1
2
| one_hot_3 = one_hot(3, n_users).float()
user_factors.T @ one_hot_3
|
tensor([-1.1552, -1.3241, -0.1439, 1.1268, -0.5780])
Show/Hide the code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
class DotProduct(Module):
    """Dot-product collaborative-filtering model with user and movie biases.

    Every user and every movie gets an ``n_factors``-wide embedding plus a
    one-element bias embedding. A prediction is the dot product of the two
    factor vectors plus both biases, passed through ``sigmoid_range`` with
    the bounds stored in ``y_range``.

    NOTE(review): ``Module``, ``Embedding`` and ``sigmoid_range`` are not
    defined in this cell; presumably they come from the fastai star imports
    (fastai's ``Module`` would explain the absent ``super().__init__()``
    call) -- confirm.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0, 5.5)):
        """Create the factor and bias embeddings and remember ``y_range``."""
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Return one prediction per row of ``x``.

        Column 0 of ``x`` indexes the user embeddings and column 1 indexes
        the movie embeddings.
        """
        user_ids = x[:, 0]
        movie_ids = x[:, 1]
        # Element-wise product summed over the factor axis is the row-wise
        # dot product of the user and movie vectors.
        dot = (self.user_factors(user_ids) * self.movie_factors(movie_ids)).sum(dim=1)
        # The bias embeddings have a trailing size-1 axis; squeeze it away so
        # the biases line up with the per-row dot products.
        bias = self.user_bias(user_ids).squeeze() + self.movie_bias(movie_ids).squeeze()
        return sigmoid_range(dot + bias, *self.y_range)
|
Show/Hide the code
1
2
| x, y = dls.one_batch()
x.shape, y.shape
|
(torch.Size([64, 2]), torch.Size([64, 1]))
Show/Hide the code
1
2
3
| model = DotProduct(n_users, n_movies, 50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)
|
| epoch | train_loss | valid_loss | time |
|---|
| 0 | 0.907877 | 0.969352 | 00:07 |
| 1 | 0.673456 | 0.907031 | 00:05 |
| 2 | 0.523983 | 0.877795 | 00:07 |
| 3 | 0.460534 | 0.863388 | 00:05 |
| 4 | 0.421944 | 0.858831 | 00:05 |
Show/Hide the code
1
2
def create_params(size):
    """Return a trainable parameter of shape ``size`` with small random values.

    ``size`` is an iterable of dimensions. The values are drawn from a normal
    distribution with mean 0 and standard deviation 0.01.
    """
    weights = torch.zeros(*size)
    # In-place draw: overwrites the zeros with N(0, 0.01) samples.
    weights.normal_(mean=0, std=0.01)
    return nn.Parameter(weights)
|
Show/Hide the code
1
2
3
# Rank the movies by their learned bias, highest first, and list the titles
# of the top five.
titles = dls.classes["title"]
movie_bias = learn.model.movie_bias.weight.squeeze()
idx = movie_bias.argsort(descending=True)[:5]
[titles[i] for i in idx]
|
['Shawshank Redemption, The (1994)',
"Schindler's List (1993)",
'Star Wars (1977)',
'L.A. Confidential (1997)',
'Titanic (1997)']
Show/Hide the code
1
2
| learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5), metrics=rmse)
learn.fit_one_cycle(10, 5e-3, wd=0.1)
|
| epoch | train_loss | valid_loss | _rmse | time |
|---|
| 0 | 1.013464 | 1.019671 | 1.009788 | 00:07 |
| 1 | 0.769047 | 0.908077 | 0.952931 | 00:03 |
| 2 | 0.602542 | 0.898511 | 0.947898 | 00:06 |
| 3 | 0.497094 | 0.893938 | 0.945483 | 00:06 |
| 4 | 0.433481 | 0.894063 | 0.945549 | 00:03 |
| 5 | 0.374946 | 0.889614 | 0.943194 | 00:05 |
| 6 | 0.326030 | 0.885589 | 0.941058 | 00:05 |
| 7 | 0.301046 | 0.882571 | 0.939453 | 00:03 |
| 8 | 0.278223 | 0.880205 | 0.938192 | 00:04 |
| 9 | 0.279707 | 0.879805 | 0.937979 | 00:04 |
Show/Hide the code
1
2
3
4
5
# Find the 20 movies whose learned factor vectors point in the direction
# closest to that of "Silence of the Lambs".
titles = dls.classes["title"]
movie_factors = learn.model.i_weight.weight
idx = titles.o2i["Silence of the Lambs, The (1991)"]
query = movie_factors[idx][None]  # add a leading axis so it broadcasts over all movies
cosine = nn.CosineSimilarity(dim=1)
# NOTE: despite the name, `distances` holds cosine *similarities* (larger means
# more alike), which is why the sort below is descending. The query movie
# itself comes first.
distances = cosine(movie_factors, query)
idxs = distances.argsort(descending=True)[:20]
[titles[i] for i in idxs]
|
['Silence of the Lambs, The (1991)',
'Manchurian Candidate, The (1962)',
'Farewell to Arms, A (1932)',
'Meet John Doe (1941)',
'Wedding Gift, The (1994)',
'Fugitive, The (1993)',
'To Catch a Thief (1955)',
'Ben-Hur (1959)',
'Lost Horizon (1937)',
"It's a Wonderful Life (1946)",
'Shawshank Redemption, The (1994)',
'Arsenic and Old Lace (1944)',
'Dial M for Murder (1954)',
'Pather Panchali (1955)',
'Third Man, The (1949)',
'Gaslight (1944)',
'Mr. Smith Goes to Washington (1939)',
'Guantanamera (1994)',
'Great Escape, The (1963)',
'Once Were Warriors (1994)']