-
Notifications
You must be signed in to change notification settings - Fork 0
/
MY ML PROJECT Music Recommendation System (2).py
432 lines (270 loc) · 15.4 KB
/
MY ML PROJECT Music Recommendation System (2).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
#!/usr/bin/env python
# coding: utf-8
# # Import Libraries
# In[4]:
import os
import numpy as np
import pandas as pd
# NOTE(review): `spotify` looks like a typo for `spotipy` (which is imported
# and used further down); this line raises ImportError unless an unrelated
# `spotify` package is installed — confirm and remove. The module-level name
# `spotify` is later rebound to the spotipy client anyway.
import spotify
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
# Suppress all library warnings globally for cleaner notebook output.
warnings.filterwarnings("ignore")
# # Read Data
#
# Here, I build a music recommendation system, I used the Spotify Dataset, which is publicly available on Kaggle and contains metadata and audio features for over 170,000 different songs.
#
# I used five data files from this dataset. The first one contains data for individual songs while the next two files contain the data grouped the genres and years in which the songs were released.
#
# I used the Spotify dataset available on kaggle.com.
# In[5]:
# Load the per-song dataset plus the aggregates grouped by genre, year, and artist.
data = pd.read_csv(r"G:\projectdata\datasetone.csv")
# In[7]:
# BUG FIX: the original loaded data_w_genres.csv into genre_data and then
# immediately overwrote it with data_by_genres.csv. Only the per-genre
# aggregate file is used downstream, so load just that one.
genre_data = pd.read_csv(r"G:\projectdata\data_by_genres.csv")
# In[8]:
year_data = pd.read_csv(r"G:\projectdata\data_by_year.csv")
# In[9]:
artist_data = pd.read_csv(r"G:\projectdata\data_by_artist.csv")
# # info of all files
# #I have included the column metadata below that was generated by calling the Pandas info function for each data frame.
# In[10]:
# Inspect the schema (columns, dtypes, non-null counts) of each data frame.
print(data.info())
# In[11]:
print(genre_data.info())
# In[12]:
# BUG FIX: the original printed genre_data.info() twice and never inspected
# year_data; show year_data's schema here instead.
print(year_data.info())
# In[13]:
print(artist_data.info())
# Based on the column descriptions above,
# we can see that here look each dataframe has information about the audio features such as the danceability
# and loudness of different songs,
# that have also been aggregated across genres and specific years.
# This dataset is extremely useful and can be used for a wide range of tasks.
#
# Before building a recommendation system,I decided to create some visualizations to better understand the data and the trends in music over the last 100 years.
# # Yellowbrick: Machine Learning Visualization
#
# #We are going to check for all the analysis with the target as 'popularity'. Before going to do that let's check for the Feature Correlation by considering a few features and for that, I'm going to use the yellowbrick package.
# In[14]:
from yellowbrick.target import FeatureCorrelation

# Rank candidate audio/metadata features by their correlation with the
# prediction target, track popularity.
feature_names = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                 'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
                 'duration_ms', 'explicit', 'key', 'mode', 'year']
X, y = data[feature_names], data['popularity']
features = np.array(feature_names)

# Enlarge the figure before drawing, then fit and render the visualizer.
plt.rcParams['figure.figsize'] = (20, 20)
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()
# Based on the plot above, we can see that music has transitioned from the more acoustic and instrumental sound of the early 1900s to the more danceable and energetic sound of the 2000s.
#
# The majority of the tracks from the 1920s were likely instrumental pieces from classical and jazz genres. The music of the 2000s sounds very different due to the advent of computers and advanced audio engineering technology that allows us to create electronic music with a wide range of effects and beats.
#
# We can also take a look at how the average tempo or speed of music has changed over the years. The drastic shift in sound towards electronic music is supported by the graph produced by the code below as well.
# # Music Over Time
#
# #Using the data grouped by year, we can understand how the overall sound of music has changed from 1921 to 2020.
#
# In the code below, I used Plotly to visualize the values of different audio features for songs over the past 100 years.
# In[15]:
def get_decade(year):
    """Return the decade label for *year*, e.g. 1991 -> '1990s'."""
    # Truncate to the first year of the decade, then append the 's' suffix.
    decade_start = (int(year) // 10) * 10
    return f"{decade_start}s"
# Tag every song with its decade, then count how many tracks fall in each one.
data['decade'] = data['year'].apply(get_decade)
sns.set(rc={'figure.figsize':(11 ,6)})
# NOTE(review): the column is passed positionally (interpreted as x=); newer
# seaborn versions require the explicit keyword form countplot(x=...).
sns.countplot(data['decade'])
# In[16]:
# Trend lines: average value of each audio feature per release year.
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence']
fig = px.line(year_data, x='year', y=sound_features)
fig.show()
# # Characteristics of Different Genres
#
# #This dataset contains the audio features for different songs along with the audio features for different genres.
#
# We can use this information to compare different genres and understand their unique differences in sound. In the code below, I selected the ten most popular genres from the dataset and visualized audio features for each of them.
# In[17]:
# Ten most popular genres, comparing four key audio features side by side.
top10_genres = genre_data.nlargest(10, 'popularity')
fig = px.bar(top10_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()
# Many of the genres above, such as Chinese electropop are extremely specific and likely belong to one or more broad genres such as pop or electronic music.
#
# We can take these highly specific genres and understand how similar they are to other genres by clustering them based on their audio features.
# # Clustering Genres with K-Means
#
# #Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genres.In the code below, I used the famous and simple K-means clustering algorithm to divide the over 2,900 genres in this dataset into ten clusters based on the numerical audio features of each genre.
# In[18]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize the numeric genre features, then partition the genres into ten
# clusters with K-means (fixed seed so the assignment is reproducible).
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=0)),
])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
# Now that the genres have been assigned to clusters, we can take this analysis a step further by visualizing the clusters in a two-dimensional space.
# # Visualizing the Genre Clusters with t-SNE
# Visualizing the Genre Clusters with t-SNE .There are many audio features for each genre and it is difficult to visualize clusters in a high-dimensional space. However, we can use a dimensionality reduction technique known as t-Distributed Stochastic Neighbor Embedding to compress the data into a two-dimensional space as demonstrated in the code below.
# In[19]:
# Visualizing the Clusters with t-SNE
from sklearn.manifold import TSNE

# Compress the high-dimensional genre features to 2-D so the clusters can be
# drawn on a plane.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)

# Label each projected point with its genre name and cluster id.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y'])
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'genres'])
fig.show()
# Now, we can easily visualize the genre clusters in a two-dimensional coordinate plane by using Plotly’s scatter function.
# # Clustering Songs with K-Means
# We can also cluster the songs using K-means as demonstrated below in order to understand how to build a better recommendation system.
# In[20]:
# Standardize song features and fit a 20-cluster K-means model; the fitted
# scaler inside this pipeline is reused later by the recommender.
song_cluster_pipeline = Pipeline(
    [('scaler', StandardScaler()),
     ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=4))],
    verbose=False,
)
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
# # Visualizing the Song Clusters with PCA
#
# The song data frame is much larger than the genre data frame, so I decided to use PCA for dimensionality reduction rather than t-SNE because it runs significantly faster.
# In[21]:
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA

# Project every song onto the first two principal components of the
# standardized feature space.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

# Attach title and cluster id so the scatter plot is hover-annotated.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'title'])
fig.show()
# The plot above is interactive,
# so you can see the title of each song when you hover over the points. If you spend some time exploring the plot above you’ll find that similar songs tend to be located close to each other and songs within clusters tend to be at least somewhat similar.
# This observation is the key idea behind the content-based recommendation system that I created in the next section.
# # Build Recommender System
#
# Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
#
# This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
#
# Spotipy is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. You have to install using pip install spotipy.
#
# After installing Spotipy, you will need to create an app on the Spotify Developer’s page and save your Client ID and secret key.
# In[ ]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
import os

# SECURITY: API credentials were hard-coded here and have been committed to
# source control — rotate them. Prefer environment variables; the original
# values remain as fallbacks only so existing setups keep working.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID', '1fbc9b55c2a84f1386ce4ad7d14b9d71'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', '40af79915bde46829e278ddff0caf005'),
))
def find_song(name, year):
    """Query the Spotify API for a track and return its features.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year, used to narrow the search.

    Returns
    -------
    pandas.DataFrame with one row of metadata plus Spotify audio features,
    or None when no matching track is found.
    """
    # Plain dict is sufficient; the original used defaultdict() with no
    # factory, which behaves identically to dict here.
    song_data = {}
    # BUG FIX: the original called an undefined name `sp`; the client
    # constructed above is bound to `spotify`.
    results = spotify.search(q='track: {} year: {}'.format(name, year), limit=1)
    if not results['tracks']['items']:
        return None
    track = results['tracks']['items'][0]
    track_id = track['id']
    audio_features = spotify.audio_features(track_id)[0]
    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(track['explicit'])]
    song_data['duration_ms'] = [track['duration_ms']]
    song_data['popularity'] = [track['popularity']]
    for key, value in audio_features.items():
        song_data[key] = value
    return pd.DataFrame(song_data)
# Now we can finally build the music recommendation system! The recommendation algorithm I used is pretty simple and follows three steps:
# Compute the average vector of the audio and metadata features for each song the user has listened to.
#
# Find the n-closest data points in the dataset (excluding the points from the songs in the user’s listening history) to this average vector.
#
# Take these n points and recommend the songs corresponding to them.
#
# This algorithm follows a common approach that is used in content-based recommender systems and is generalizable because we can mathematically define the term closest with a wide range of distance metrics ranging from the classic Euclidean distance to the cosine distance.
#
# For the purpose of this project, I used the cosine distance, which is defined below for two vectors u and v.
#
# Cosine distance formula: distance(u, v) = 1 - (u . v) / (||u|| ||v||) = 1 - cos(theta)
#
# In other words, the cosine distance is one minus the cosine similarity — the cosine of the angle between the two vectors. The cosine distance is commonly used in recommender systems and can work well even when the vectors being used have different magnitudes.
#
# If the vectors for two songs are parallel, the angle between them is zero; since cos(0) = 1, the cosine distance is 1 - 1 = 0.
#
# The functions that I have defined below implement this simple algorithm with the help of Scipy’s cdist function for finding the distances between two pairs of collections of points.
# In[ ]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
# Feature columns describing each song; must be present both in the offline
# dataset and in Spotify's audio-features response (see find_song).
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']
def get_song_data(song, spotify_data):
    """Look up *song* (a dict with 'name' and 'year') in the local dataset,
    falling back to the Spotify API when it is not present."""
    match = spotify_data[(spotify_data['name'] == song['name'])
                         & (spotify_data['year'] == song['year'])]
    if match.empty:
        # Not in the offline dataset — query Spotify instead.
        return find_song(song['name'], song['year'])
    return match.iloc[0]
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the songs in *song_list*.

    Songs found neither locally nor on Spotify are skipped with a warning.
    Returns a 1-D numpy array of length len(number_cols).
    NOTE(review): if no song is found at all, np.mean over an empty matrix
    yields NaN (warnings are globally suppressed) — confirm desired behavior.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vectors.append(song_data[number_cols].values)
    # song_vectors is already a list; the original wrapped it in a redundant
    # list() call before converting to an array.
    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)
def flatten_dict_list(dict_list):
    """Merge a list of dicts into one dict mapping each key to the list of
    values it takes across the input dicts.

    Example: [{'a': 1}, {'a': 2}] -> {'a': [1, 2]}
    """
    # BUG FIX: the original used defaultdict() with no factory and seeded keys
    # only from dict_list[0], so a key first appearing in a later dict raised
    # KeyError on append. defaultdict(list) handles any key safely.
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict
def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend *n_songs* tracks from *spotify_data* closest (by cosine
    distance) to the mean feature vector of the user's listening history.

    Parameters
    ----------
    song_list : list of dicts with 'name' and 'year' keys (listening history).
    spotify_data : pandas.DataFrame of the full song dataset.
    n_songs : number of recommendations to return.

    Returns
    -------
    list of {'name', 'year', 'artists'} dicts, excluding songs already in
    the listening history.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted by the song-clustering pipeline so the query
    # vector lives in the same standardized space as the dataset.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    # BUG FIX: the original truncated to n_songs *before* removing songs the
    # user already listened to, so it could return fewer than n_songs.
    # Rank the full dataset, drop history, then take the top n.
    ranked_index = list(np.argsort(distances)[0])
    rec_songs = spotify_data.iloc[ranked_index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs.head(n_songs)[metadata_cols].to_dict(orient='records')
# The logic behind the algorithm sounds convincing but does this recommender system really work? The only way to find out is by testing it with practical examples.
#
# Let’s say that we want to recommend music for someone who listens to 1990s grunge, specifically songs by Nirvana.
#
# We can use the recommend_songs function to specify their listening history and generate recommendations as shown below.
# In[ ]:
# Example usage: generate recommendations for a listener whose history is
# four Nirvana tracks from 1991-1993.
recommend_songs([{'name': 'Come As You Are', 'year':1991},
{'name': 'Smells Like Teen Spirit', 'year': 1991},
{'name': 'Lithium', 'year': 1992},
{'name': 'All Apologies', 'year': 1993}], data )
# Spotify keeps track of metadata and audio features for songs that we can use to build music recommendation systems.
#
# In this Project,
#
# I demonstrated how you can use this data to build a simple content-based music recommender system with the cosine distance metric.
# In[ ]:
# In[ ]: