-
Notifications
You must be signed in to change notification settings - Fork 0
/
MY ML PROJECT Music Recommendation System (2).py
432 lines (270 loc) · 15.4 KB
/
MY ML PROJECT Music Recommendation System (2).py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
#!/usr/bin/env python
# coding: utf-8
# # Import Libraries
# In[4]:
import os
import numpy as np
import pandas as pd
# NOTE(review): `spotify` looks like a typo for `spotipy` (which is imported
# and used further down); this line raises ImportError unless an unrelated
# `spotify` package is installed — confirm and remove. The module-level name
# `spotify` is later rebound to the spotipy client anyway.
import spotify
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
# Suppress all library warnings globally for cleaner notebook output.
warnings.filterwarnings("ignore")
# # Read Data
#
# Here, I build a music recommendation system, I used the Spotify Dataset, which is publicly available on Kaggle and contains metadata and audio features for over 170,000 different songs.
#
# I used five data files from this dataset. The first one contains data for individual songs while the next two files contain the data grouped the genres and years in which the songs were released.
#
# I used the Spotify dataset available on kaggle.com.
# In[5]:
# Load the per-song dataset plus the aggregates grouped by genre, year, and artist.
data = pd.read_csv(r"G:\projectdata\datasetone.csv")
# In[7]:
# BUG FIX: the original loaded data_w_genres.csv into genre_data and then
# immediately overwrote it with data_by_genres.csv. Only the per-genre
# aggregate file is used downstream, so load just that one.
genre_data = pd.read_csv(r"G:\projectdata\data_by_genres.csv")
# In[8]:
year_data = pd.read_csv(r"G:\projectdata\data_by_year.csv")
# In[9]:
artist_data = pd.read_csv(r"G:\projectdata\data_by_artist.csv")
# # info of all files
# #I have included the column metadata below that was generated by calling the Pandas info function for each data frame.
# In[10]:
# Inspect the schema (columns, dtypes, non-null counts) of each data frame.
print(data.info())
# In[11]:
print(genre_data.info())
# In[12]:
# BUG FIX: the original printed genre_data.info() twice and never inspected
# year_data; show year_data's schema here instead.
print(year_data.info())
# In[13]:
print(artist_data.info())
# Based on the column descriptions above,
# we can see that here look each dataframe has information about the audio features such as the danceability
# and loudness of different songs,
# that have also been aggregated across genres and specific years.
# This dataset is extremely useful and can be used for a wide range of tasks.
#
# Before building a recommendation system,I decided to create some visualizations to better understand the data and the trends in music over the last 100 years.
# # Yellowbrick: Machine Learning Visualization
#
# #We are going to check for all the analysis with the target as 'popularity'. Before going to do that let's check for the Feature Correlation by considering a few features and for that, I'm going to use the yellowbrick package.
# In[14]:
from yellowbrick.target import FeatureCorrelation

# Rank candidate audio/metadata features by their correlation with the
# prediction target, track popularity.
feature_names = ['acousticness', 'danceability', 'energy', 'instrumentalness',
                 'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
                 'duration_ms', 'explicit', 'key', 'mode', 'year']
X, y = data[feature_names], data['popularity']
features = np.array(feature_names)

# Enlarge the figure before drawing, then fit and render the visualizer.
plt.rcParams['figure.figsize'] = (20, 20)
visualizer = FeatureCorrelation(labels=features)
visualizer.fit(X, y)
visualizer.show()
# Based on the plot above, we can see that music has transitioned from the more acoustic and instrumental sound of the early 1900s to the more danceable and energetic sound of the 2000s.
#
# The majority of the tracks from the 1920s were likely instrumental pieces from classical and jazz genres. The music of the 2000s sounds very different due to the advent of computers and advanced audio engineering technology that allows us to create electronic music with a wide range of effects and beats.
#
# We can also take a look at how the average tempo or speed of music has changed over the years. The drastic shift in sound towards electronic music is supported by the graph produced by the code below as well.
# # Music Over Time
#
# #Using the data grouped by year, we can understand how the overall sound of music has changed from 1921 to 2020.
#
# In the code below, I used Plotly to visualize the values of different audio features for songs over the past 100 years.
# In[15]:
def get_decade(year):
    """Return the decade label for *year*, e.g. 1991 -> '1990s'."""
    # Truncate to the first year of the decade, then append the 's' suffix.
    decade_start = (int(year) // 10) * 10
    return f"{decade_start}s"
# Tag every song with its decade, then count how many tracks fall in each one.
data['decade'] = data['year'].apply(get_decade)
sns.set(rc={'figure.figsize':(11 ,6)})
# NOTE(review): the column is passed positionally (interpreted as x=); newer
# seaborn versions require the explicit keyword form countplot(x=...).
sns.countplot(data['decade'])
# In[16]:
# Trend lines: average value of each audio feature per release year.
sound_features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'valence']
fig = px.line(year_data, x='year', y=sound_features)
fig.show()
# # Characteristics of Different Genres
#
# #This dataset contains the audio features for different songs along with the audio features for different genres.
#
# We can use this information to compare different genres and understand their unique differences in sound. In the code below, I selected the ten most popular genres from the dataset and visualized audio features for each of them.
# In[17]:
# Ten most popular genres, comparing four key audio features side by side.
top10_genres = genre_data.nlargest(10, 'popularity')
fig = px.bar(top10_genres, x='genres', y=['valence', 'energy', 'danceability', 'acousticness'], barmode='group')
fig.show()
# Many of the genres above, such as Chinese electropop are extremely specific and likely belong to one or more broad genres such as pop or electronic music.
#
# We can take these highly specific genres and understand how similar they are to other genres by clustering them based on their audio features.
# # Clustering Genres with K-Means
#
# #Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genres.In the code below, I used the famous and simple K-means clustering algorithm to divide the over 2,900 genres in this dataset into ten clusters based on the numerical audio features of each genre.
# In[18]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardize the numeric genre features, then partition the genres into ten
# clusters with K-means (fixed seed so the assignment is reproducible).
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=0)),
])
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
# Now that the genres have been assigned to clusters, we can take this analysis a step further by visualizing the clusters in a two-dimensional space.
# # Visualizing the Genre Clusters with t-SNE
# Visualizing the Genre Clusters with t-SNE .There are many audio features for each genre and it is difficult to visualize clusters in a high-dimensional space. However, we can use a dimensionality reduction technique known as t-Distributed Stochastic Neighbor Embedding to compress the data into a two-dimensional space as demonstrated in the code below.
# In[19]:
# Visualizing the Clusters with t-SNE
from sklearn.manifold import TSNE

# Compress the high-dimensional genre features to 2-D so the clusters can be
# drawn on a plane.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])
genre_embedding = tsne_pipeline.fit_transform(X)

# Label each projected point with its genre name and cluster id.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y'])
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'genres'])
fig.show()
# Now, we can easily visualize the genre clusters in a two-dimensional coordinate plane by using Plotly’s scatter function.
# # Clustering Songs with K-Means
# We can also cluster the songs using K-means as demonstrated below in order to understand how to build a better recommendation system.
# In[20]:
# Standardize song features and fit a 20-cluster K-means model; the fitted
# scaler inside this pipeline is reused later by the recommender.
song_cluster_pipeline = Pipeline(
    [('scaler', StandardScaler()),
     ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=4))],
    verbose=False,
)
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
# # Visualizing the Song Clusters with PCA
#
# The song data frame is much larger than the genre data frame, so I decided to use PCA for dimensionality reduction rather than t-SNE because it runs significantly faster.
# In[21]:
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA

# Project every song onto the first two principal components of the
# standardized feature space.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)

# Attach title and cluster id so the scatter plot is hover-annotated.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(projection, x='x', y='y', color='cluster',
                 hover_data=['x', 'y', 'title'])
fig.show()
# The plot above is interactive,
# so you can see the title of each song when you hover over the points. If you spend some time exploring the plot above you’ll find that similar songs tend to be located close to each other and songs within clusters tend to be at least somewhat similar.
# This observation is the key idea behind the content-based recommendation system that I created in the next section.
# # Build Recommender System
#
# Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
#
# This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
#
# Spotipy is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. You have to install using pip install spotipy.
#
# After installing Spotipy, you will need to create an app on the Spotify Developer’s page and save your Client ID and secret key.
# In[ ]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict
import os

# SECURITY: API credentials were hard-coded here and have been committed to
# source control — rotate them. Prefer environment variables; the original
# values remain as fallbacks only so existing setups keep working.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(
    client_id=os.environ.get('SPOTIPY_CLIENT_ID', '1fbc9b55c2a84f1386ce4ad7d14b9d71'),
    client_secret=os.environ.get('SPOTIPY_CLIENT_SECRET', '40af79915bde46829e278ddff0caf005'),
))
def find_song(name, year):
    """Query the Spotify API for a track and return its features.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year, used to narrow the search.

    Returns
    -------
    pandas.DataFrame with one row of metadata plus Spotify audio features,
    or None when no matching track is found.
    """
    # Plain dict is sufficient; the original used defaultdict() with no
    # factory, which behaves identically to dict here.
    song_data = {}
    # BUG FIX: the original called an undefined name `sp`; the client
    # constructed above is bound to `spotify`.
    results = spotify.search(q='track: {} year: {}'.format(name, year), limit=1)
    if not results['tracks']['items']:
        return None
    track = results['tracks']['items'][0]
    track_id = track['id']
    audio_features = spotify.audio_features(track_id)[0]
    song_data['name'] = [name]
    song_data['year'] = [year]
    song_data['explicit'] = [int(track['explicit'])]
    song_data['duration_ms'] = [track['duration_ms']]
    song_data['popularity'] = [track['popularity']]
    for key, value in audio_features.items():
        song_data[key] = value
    return pd.DataFrame(song_data)
# Now we can finally build the music recommendation system! The recommendation algorithm I used is pretty simple and follows three steps:
# Compute the average vector of the audio and metadata features for each song the user has listened to.
#
# Find the n-closest data points in the dataset (excluding the points from the songs in the user’s listening history) to this average vector.
#
# Take these n points and recommend the songs corresponding to them.
#
# This algorithm follows a common approach that is used in content-based recommender systems and is generalizable because we can mathematically define the term closest with a wide range of distance metrics ranging from the classic Euclidean distance to the cosine distance.
#
# For the purpose of this project, I used the cosine distance, which is defined below for two vectors u and v.
#
# Cosine distance formula: distance(u, v) = 1 - (u . v) / (||u|| ||v||) = 1 - cos(theta)
#
# In other words, the cosine distance is one minus the cosine similarity — the cosine of the angle between the two vectors. The cosine distance is commonly used in recommender systems and can work well even when the vectors being used have different magnitudes.
#
# If the vectors for two songs are parallel, the angle between them is zero; since cos(0) = 1, the cosine distance is 1 - 1 = 0.
#
# The functions that I have defined below implement this simple algorithm with the help of Scipy’s cdist function for finding the distances between two pairs of collections of points.
# In[ ]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib
# Feature columns describing each song; must be present both in the offline
# dataset and in Spotify's audio-features response (see find_song).
number_cols = ['valence', 'year', 'acousticness', 'danceability', 'duration_ms', 'energy', 'explicit',
'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']
def get_song_data(song, spotify_data):
    """Look up *song* (a dict with 'name' and 'year') in the local dataset,
    falling back to the Spotify API when it is not present."""
    match = spotify_data[(spotify_data['name'] == song['name'])
                         & (spotify_data['year'] == song['year'])]
    if match.empty:
        # Not in the offline dataset — query Spotify instead.
        return find_song(song['name'], song['year'])
    return match.iloc[0]
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the songs in *song_list*.

    Songs found neither locally nor on Spotify are skipped with a warning.
    Returns a 1-D numpy array of length len(number_cols).
    NOTE(review): if no song is found at all, np.mean over an empty matrix
    yields NaN (warnings are globally suppressed) — confirm desired behavior.
    """
    song_vectors = []
    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        song_vectors.append(song_data[number_cols].values)
    # song_vectors is already a list; the original wrapped it in a redundant
    # list() call before converting to an array.
    song_matrix = np.array(song_vectors)
    return np.mean(song_matrix, axis=0)
def flatten_dict_list(dict_list):
    """Merge a list of dicts into one dict mapping each key to the list of
    values it takes across the input dicts.

    Example: [{'a': 1}, {'a': 2}] -> {'a': [1, 2]}
    """
    # BUG FIX: the original used defaultdict() with no factory and seeded keys
    # only from dict_list[0], so a key first appearing in a later dict raised
    # KeyError on append. defaultdict(list) handles any key safely.
    flattened_dict = defaultdict(list)
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
    return flattened_dict
def recommend_songs( song_list, spotify_data, n_songs=10):
    """Recommend *n_songs* tracks from *spotify_data* closest (by cosine
    distance) to the mean feature vector of the user's listening history.

    Parameters
    ----------
    song_list : list of dicts with 'name' and 'year' keys (listening history).
    spotify_data : pandas.DataFrame of the full song dataset.
    n_songs : number of recommendations to return.

    Returns
    -------
    list of {'name', 'year', 'artists'} dicts, excluding songs already in
    the listening history.
    """
    metadata_cols = ['name', 'year', 'artists']
    song_dict = flatten_dict_list(song_list)
    song_center = get_mean_vector(song_list, spotify_data)
    # Reuse the scaler fitted by the song-clustering pipeline so the query
    # vector lives in the same standardized space as the dataset.
    scaler = song_cluster_pipeline.steps[0][1]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    # BUG FIX: the original truncated to n_songs *before* removing songs the
    # user already listened to, so it could return fewer than n_songs.
    # Rank the full dataset, drop history, then take the top n.
    ranked_index = list(np.argsort(distances)[0])
    rec_songs = spotify_data.iloc[ranked_index]
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs.head(n_songs)[metadata_cols].to_dict(orient='records')
# The logic behind the algorithm sounds convincing but does this recommender system really work? The only way to find out is by testing it with practical examples.
#
# Let’s say that we want to recommend music for someone who listens to 1990s grunge, specifically songs by Nirvana.
#
# We can use the recommend_songs function to specify their listening history and generate recommendations as shown below.
# In[ ]:
# Example usage: generate recommendations for a listener whose history is
# four Nirvana tracks from 1991-1993.
recommend_songs([{'name': 'Come As You Are', 'year':1991},
{'name': 'Smells Like Teen Spirit', 'year': 1991},
{'name': 'Lithium', 'year': 1992},
{'name': 'All Apologies', 'year': 1993}], data )
# Spotify keeps track of metadata and audio features for songs that we can use to build music recommendation systems.
#
# In this Project,
#
# I demonstrated how you can use this data to build a simple content-based music recommender system with the cosine distance metric.
# In[ ]:
# In[ ]: