0. Distribution of work and initialisation

| Work | David Ari Ostenfeldt, s194237 | Kristian Rhindal Møllman, s194246 | Kristoffer Marboe, s194249 |
|---|---|---|---|
| Data | 40% | 30% | 30% |
| Networks | 30% | 40% | 30% |
| Text | 30% | 30% | 40% |
| Website | 33% | 33% | 33% |
| Explainer notebook | 33% | 33% | 33% |

Everyone contributed equally to this project.

1. Motivation

What is your dataset?

The dataset we will be analysing is a collection of songs, each with four attributes: the artists who worked on them, the lyrics, the release date, and the song title.

The network will be created with each artist as a node and a link between two artists if they have collaborated on a song. In our case, collaboration simply means that they are credited in any way on the same song, be it with 'featuring' or 'with' separating their names.

The lyrics will be used to analyse the language used in the songs we listen to. It will be split up into genres, decades and individual artists.

Why did you choose this dataset?

Musicians often collaborate with other musicians, creating new songs for our enjoyment. We thought the links between artists would be interesting to dig into and would make for an interesting network. Furthermore, investigating the different artists' language through their song lyrics to find patterns and attributes would maybe provide some insight into the various genres, artists or evolution of music.

What was your goal for the end user's experience?

We wanted to provide some insight into how artists collaborate, which genres and artists collaborate more and how the language between genres and artists differs. We wanted to create an experience for the user via the webpage, in which they could explore the parts that interest them specifically. Maybe they have a favourite genre or artist that they want to understand better. Furthermore, by providing the data set for the user, we also let them play around with it on their own, to investigate other genres or, e.g. look at how a specific artist has developed through the years.

Scraping the data

The first part of any project is collecting the data. We needed a list of songs to collect from Genius, and for this purpose, we chose Billboard's 'The Hot 100' list. The list goes back to 1960 and is updated every week. In theory, this has the potential to grant us 5,200 songs a year (100 songs a week × 52 weeks) over 62 years, which means 322,400 possible songs - though, in practice, many songs reappear on the chart.

To collect the list of songs, we used the billboard.py module, which provides a Python interface to Billboard's charts.

Note: The code in this section is not meant to be run; it is simply to show how we collected the data

First, we create some helper functions that we will make use of when searching for songs.

The find_artist function takes a name and returns an artist.
find_song takes an artist and a song title and returns a song.
artist_to_list returns a list of artists.
process_artist_names uses regex to find all the separate artists in the given name segment.
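The splitting logic behind process_artist_names can be sketched with a single regex. This is a hypothetical re-implementation for illustration, not the project's exact code:

```python
import re

def process_artist_names(name):
    """Split a credited artist string into individual artist names.

    Illustrative sketch: splits at common collaboration markers
    (',', '&', 'and', 'featuring', 'feat.', 'ft.', 'with') and
    lowercases the results. The project's actual regex may differ.
    """
    parts = re.split(
        r"\s*(?:,|&|\band\b|\bfeaturing\b|\bfeat\.?|\bft\.?|\bwith\b)\s*",
        name, flags=re.IGNORECASE)
    return [p.strip().lower() for p in parts if p.strip()]
```

For example, this turns "Lil Durk Featuring Gunna" into ['lil durk', 'gunna'] and "Earth, Wind & Fire" into ['earth', 'wind', 'fire'].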

When searching for songs using the Genius API, we used the LyricsGenius library. To collect Genius's genre tags and release date, we had to modify its source code. The change was made in the last return statement of the Genius.lyrics() function, with the following code:

# Remove the PYONG and Embed counters from the end of the scraped lyrics
lyrics = lyrics[:-5]
while lyrics[-1].isdigit():
    lyrics = lyrics[:-1]

# Grab the song's genre tags from the page
all_tags = html.find("div", class_=re.compile("^SongTags__Container"))
tags = all_tags.get_text('_')

# Grab the release date from the song-info column, if present
all_creds = html.find("div", class_=re.compile("^SongInfo__Columns-nekw6x-2 lgBflw"))
creds = all_creds.get_text('_')
if 'Release Date_' in creds:
    release_date = creds.split('Release Date_')[1].split('_')[0]
else:
    release_date = 'Unknown'

# Pack lyrics, tags and release date into one string with sentinel markers
return lyrics.strip("\n") + '<<<<<<<<<<' + tags.lower() + '>>>>>>>>>>' + release_date

We used a sequential searching strategy: we first search for the song title and full artist name, and if that does not yield any results, we split the artist name at 'featuring', 'feat.', 'ft.' or 'with' and search for the song title together with the first partition of the artist name. If this still doesn't result in a valid song, we remove parentheses from the artist names and replace 'and' with '&', after which we again search for the song title and artist name. If this fails as well, we try splitting the modified artist names at '&' and ',' and search again. If none of these steps results in a valid song, we simply search for the song title and hope for the best.
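The fallback order above can be sketched as a pure function that generates the successive (artist, title) queries. The function name and structure are illustrative assumptions; the actual Genius search calls are omitted:

```python
import re

def search_queries(artist, title):
    """Return (artist_query, title) pairs in the fallback order
    described above. Illustrative sketch only."""
    queries = [artist]                                           # 1. full artist name
    first = re.split(r"\s+(?:featuring|feat\.?|ft\.?|with)\s+",  # 2. first partition
                     artist, flags=re.IGNORECASE)[0]
    queries.append(first)
    simplified = re.sub(r"\([^)]*\)", "", first)                 # 3. drop parentheses,
    simplified = re.sub(r"\band\b", "&", simplified,             #    'and' -> '&'
                        flags=re.IGNORECASE).strip()
    queries.append(simplified)
    queries.extend(p.strip() for p in re.split(r"[&,]", simplified) if p.strip())  # 4.
    queries.append("")                                           # 5. title only
    return [(q, title) for q in queries]
```

Each pair would be tried against the Genius search in turn until a valid song is returned.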

Immediately after loading a song, we make sure it is actually a song. As Genius also hosts texts that are not song lyrics, we filter out entries with specific genres/tags. We used the following list of bad genre patterns to avoid those: ['track\\s?list', 'album art(work)?', 'liner notes', 'booklet', 'credits', 'interview', 'skit', 'instrumental', 'setlist', 'non-music', 'literature'].
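A minimal sketch of this filter, assuming each song carries a list of lowercased tags (the helper name is our own):

```python
import re

# Patterns of tags indicating a Genius page that isn't real song lyrics.
BAD_GENRES = ['track\\s?list', 'album art(work)?', 'liner notes', 'booklet',
              'credits', 'interview', 'skit', 'instrumental', 'setlist',
              'non-music', 'literature']
BAD_GENRE_RE = re.compile('|'.join(BAD_GENRES))

def is_real_song(tags):
    """Return True only if none of the (lowercased) tags matches a bad genre."""
    return not any(BAD_GENRE_RE.search(tag) for tag in tags)
```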

Before all the raw data was gathered, the last step was to separate all artists for each song. This was done using regex to find and split artists at ',', 'and', 'featuring' and so on. As a result, the artists Megan Thee Stallion & Dua Lipa for the song Sweetest Pie were changed to [Megan Thee Stallion, Dua Lipa], and the artists Lil Durk Featuring Gunna for the song What Happened To Virgil were changed to [Lil Durk, Gunna]. However, a negative side effect of this processing is that group names like Earth, Wind & Fire were changed to [Earth, Wind, Fire]. This was a necessary part of the preprocessing, and these kinds of artists were regrouped later in the data cleaning.

Manual lookup of songs

When collecting data for each song through the modified LyricsGenius API, we would retrieve five attributes: date of release, artists who collaborated on the song, lyrics, genres and the song title. The data looks as follows:

| released | artists | lyrics | genres | title |
|---|---|---|---|---|
| 1957 | [marty robbins] | El Paso Lyrics\nOut in the West Texas town of ... | [country] | El Paso |
| 1960-01-04 | [frankie avalon] | Why Lyrics I'll never let you go\nWhy? Because ... | [pop] | Why |
| 1959 | [johnny preston] | Running Bear LyricsOn the bank of the river\nS... | [pop] | Running Bear |
| 1960-01-04 | [freddy cannon] | Way Down Yonder in New Orleans LyricsWell, way ... | [pop] | Way Down Yonder in New Orleans |
| 1960-01-04 | [guy mitchell] | Heartaches by the Number Lyrics\nHeartaches by... | [country, cover] | Heartaches by the Number |

2. Basic stats

Data Cleaning

At this point, we had all the raw data, but it was apparent that a lot of cleaning still had to be done despite our efforts during the data gathering.

Unwanted characters and non-English songs

First, unwanted Unicode characters like \u200b, \u200c and \u200e, which had slipped in when the data was loaded, were removed from the artists, genres and lyrics. Next, duplicates were removed, and songs not in English were filtered out using language detection with the Python module langdetect.

As can be seen in the table above, each song's lyrics begin with the song title followed by 'Lyrics'. This was also removed, as it isn't part of the actual lyrics but rather an artefact of gathering the song info through the Genius API.
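The two cleaning steps just described (stripping zero-width characters and the '&lt;Title&gt; Lyrics' prefix) can be sketched as follows; the helper name is our own, and the sketch assumes the artefact always starts with the song title:

```python
import re

# Translation table that deletes the zero-width characters.
ZERO_WIDTH = dict.fromkeys(map(ord, '\u200b\u200c\u200e'))

def clean_lyrics(title, lyrics):
    """Strip zero-width characters and the leading '<Title> Lyrics'
    artefact from a raw lyric string."""
    lyrics = lyrics.translate(ZERO_WIDTH)
    return re.sub(r'^' + re.escape(title) + r'\s*Lyrics\s*', '', lyrics).strip()
```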

Create a list of all unique genres

Check if a song is non-English or has no lyrics

Counting the number of songs:

Removing long songs

Afterwards, we decided to remove all songs whose lyrics were longer than 10,000 characters. Despite all the aforementioned cleaning, entire book chapters by the French novelist Marcel Proust, for example, were still present in the dataset because they were labelled with the genre rap. The cut-off at 10,000 was chosen because all longer songs we investigated had clearly been loaded incorrectly. For comparison, the 6-minute-long song Rap God by Eminem, where he flexes his ability to rap fast, contains 7,984 characters.

While doing a finer combing of the data, we also produced a blocklist of artists we deemed unwanted in the data set. This list includes Glee Cast, as they were present in over 200 songs even though their songs are covers of other popular songs. The full list is: ['highest to lowest', 'marcel proust', 'watsky', 'glee cast', 'harttsick', 'eric the red', 'fabvl', 'c-mob', 'hampered'].

Regrouping artists

As mentioned earlier, after gathering the data, we had to separate all artists to work with them properly, though in some cases this splits one artist into multiple, as was the case with Earth, Wind & Fire. To mitigate this problem, we first calculated how many times each artist appeared in the data set and, afterwards, for each artist, how many times they appeared with each collaborating artist. Knowing these values, we could then, for each artist, check which other artists appear on all of their songs. Artists found using this method were then joined with an underscore, such that ['earth', 'wind', 'fire'] became ['earth_fire_wind'].
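The regrouping idea can be sketched as follows, under the simplifying assumption that the input is just a list of per-song artist lists; this is a sketch of the idea, not the project's actual code:

```python
from collections import Counter
from itertools import combinations

def regroup_artists(songs):
    """Merge artists that only ever appear together, e.g.
    ['earth', 'wind', 'fire'] -> ['earth_fire_wind']."""
    appearances = Counter(a for artists in songs for a in artists)
    together = Counter()
    for artists in songs:
        for a, b in combinations(sorted(set(artists)), 2):
            together[(a, b)] += 1

    def always_together(a, b):
        # True if a and b appear on exactly the same set of songs.
        key = tuple(sorted((a, b)))
        return together[key] == appearances[a] == appearances[b]

    regrouped = []
    for artists in songs:
        groups = []
        for a in artists:
            for g in groups:
                if all(always_together(a, b) for b in g):
                    g.append(a)
                    break
            else:
                groups.append([a])
        regrouped.append(['_'.join(sorted(g)) for g in groups])
    return regrouped
```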

Preliminary look at the data

After all data processing and cleaning, the final data set comprises 25,419 songs and 7,855 unique artists. In the table below, the three data sets used throughout the project can be seen and downloaded.

| Data Set | Songs | Size (MB) |
|---|---|---|
| Billboard List | 29,128 | 1.6 |
| Pre-cleaned | 29,128 | 92.5 |
| Cleaned | 25,419 | 44.2 |

From this figure, we can see that Drake has the most songs on the Billboard 'Hot-100' list. There's good diversity in the type of artists with the most songs on the list, but they mainly fall into the rap, R&B or pop genres.

Creating a list of all unique genres and plotting the number of songs in each genre

Most songs fall into the pop genre, with rock, r&b and rap taking 2nd to 4th place. This is not surprising, as all these genres have been hugely popular since 1960. Rap, however, only rose to chart prominence in the 1990s but has been a staple in the music industry since then.

And doing the same for decades:

A quick look at the distribution of songs through the decades shows us that many old songs made it to the list, with the 1960s having more songs than any other decade on the 'Hot-100' list. The 2010s saw a steep increase in the number of songs on the list compared to previous decades. Perhaps there was a shift in what kind of music we listened to.

Characteristics of the data

The data has now been gathered and thoroughly cleaned, but before we apply our network science and text analysis techniques to it, we will first look at the ten characteristics of Big Data:

Big

As mentioned previously, the data set comprises 25,419 songs and 7,855 unique artists; in addition, the lyric corpus has a total size of 8,476,446 tokens, of which 74,915 are unique. A data set of this size, with this type of information, would be tough to come by other than by scraping the internet.

Always-on

Billboard updates their 'The Hot 100' chart each week, which means the list has been updated since we first collected the data. Because the chart updates weekly, the data set can be refreshed 52 times a year, making the data longitudinal; but since it only updates weekly, and not continuously like, e.g., Twitter, it is not entirely always-on.

Non-reactive

Reactivity describes whether subjects know researchers are observing them because that might change the subjects' behaviour. All musical artists are most likely aware that they are present on the chart and might follow their ranking closely, but the question is how much they change their behaviour and musical style to get a higher ranking on the chart. One could speculate that some artists might change their use of words and language to appeal to a broader audience to perform better on the chart, while others follow their musical heart. Though, with this being said, we do not believe that the fact that researchers might also be looking at the chart with the intent to do network science and text analysis will change the behaviour of the artists.

Incomplete

Completeness expresses whether the data set manages to capture the entire unfolding of a specific event or, e.g., the entire network of a specific group. In this project, we are attempting to analyse the network and text of the most popular artists and songs through modern times. With this in mind, we believe that using Billboard's 'The Hot 100' chart gives a good indication of the most popular artists and songs, though one could argue that the chart might be skewed towards music popular in the United States.

Inaccessible

The data used in this project is very much accessible. As was accounted for earlier on this page, everything has been downloaded freely off the internet via different APIs.

Nonrepresentative

Representativity denotes whether the data can generalise to, e.g., social phenomena more generally - out-of-sample generalisation. To this end, being a musician is quite a unique occupation when it comes to a social network of collaboration, in comparison to, e.g., a profession like acting. One could presume the typical actor is more connected than the typical musician, since many actors are associated with a movie or tv-show, while often not many musicians are working on a song. At least, not many musicians are shown as the artists on a given song, even though many people might have worked on it during songwriting and musical production. Additionally, since our data set only contains songs in English from a popular music chart in the West, the data might not be suited for generalisation of the network, or text, for musicians from other parts of the planet. With this being said, the data set is probably still perfectly applicable for within-sample comparisons.

Drifting

There is some systemic drifting in the data set, as the way songs were picked for the 'Hot-100' list has changed since its inception back in 1958. Originally, songs were selected purely based on how well they sold. Still, as the music industry evolved and radio, tv, and streaming started becoming more prevalent, all these factors are now considered when songs are picked for the list.

Algorithmically confounded

As the songs are only picked from the Billboard 'Hot-100' list, there is some algorithmic confounding going on. What is meant by this is that we don't know precisely how Billboard's algorithm for selecting the songs for their charts works.

Dirty

The data set could be dirty as some songs could still be loaded wrongly, or we might have missed something via the cleaning. Furthermore, the data is not a complete overview of the connections between artists or their language, as we only chose songs that appeared on the 'Hot-100' list.

Sensitive

The data is not sensitive: it contains no information that isn't already public, and it consists only of elementary facts such as release year, song title and song artists.

3. Tools, theory and analysis

Network

This section of the notebook will go through the network analysis of the data. We have used networkx to build the networks and netwulf to visualise them. In the following sections, we will be investigating the full network of all musicians and a subset of them based on selected genres. The networks will be studied by calculating different statistics, such as the number of nodes, number of links, density, clusterings, etc. In addition, we will look at community detection to see how well the different genres manage to partition the networks into communities compared to the Louvain algorithm for community detection.

Network visualisation config.

Creating the full network

Calculate all genres associated with each artist, as well as how many songs they have made in each genre.

Creating a list of 20 genres from which each artist can get their main genre label. In addition, a colour list to colour each node based on their main genre.

Calculate number of songs each artist has in the data set as well as how many times they have collaborated with other artists.

Add nodes

Add each artist as a node with three attributes

genre: most common genre for that artist within the fixed list 'genre_list'

size: number of times the artist has appeared on Billboard's the hot 100 (used to give each node the correct size)

all_genres: all genres associated with that artist

group: the colour of the genre associated with the artist

If an artist has multiple most common genres, meaning that they have, e.g., made five pop songs and five rock songs, that artist's genre attribute will be picked at random amongst the most common genres. An exception is rap and trap; because trap is a subgenre of rap (but still a significant and well-defined genre), we deem it more appropriate to label artists as trap if they have an equal number of rap and trap songs.
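The tie-breaking rule can be sketched as a small helper; `main_genre` is a hypothetical name, and the sketch assumes a Counter of genre → song count per artist:

```python
import random
from collections import Counter

def main_genre(genre_counts, rng=random):
    """Pick an artist's main genre from a Counter of genre -> song count.
    Ties are broken at random, except that 'trap' wins a tie with 'rap'."""
    top = max(genre_counts.values())
    candidates = [g for g, c in genre_counts.items() if c == top]
    if set(candidates) == {'rap', 'trap'}:
        return 'trap'
    return rng.choice(candidates)
```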

Add edges

Add edges between two artists if they have collaborated on a song and weigh the edge by the number of times they have collaborated.
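A sketch of this edge-building step with networkx, assuming the songs are given as lists of artist names (the helper name is our own):

```python
from itertools import combinations

import networkx as nx

def add_collaboration_edges(G, songs):
    """Add an edge per collaborating artist pair, weighted by the
    number of songs they share. `songs` is a list of artist lists."""
    for artists in songs:
        for a, b in combinations(sorted(set(artists)), 2):
            if G.has_edge(a, b):
                G[a][b]['weight'] += 1
            else:
                G.add_edge(a, b, weight=1)
    return G

G = add_collaboration_edges(
    nx.Graph(),
    [['drake', 'future'], ['drake', 'future'], ['future', 'lil durk']])
```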

Helper functions

Deciding which genre networks to analyse

It was previously decided that each artist could get their main label based on genre_list. Though, analysing and visualising 20 different networks can get a bit cumbersome, so we will be picking out a subset of these. To do this, we will first find out how many artists have each genre as their primary genre and how many times each genre has occurred in total.

The genres we have decided to pick out are based on the number of times these genres occur and genres we deem interesting. Based on the results seen above, the following 11 genres' networks will be analysed:

pop, rap, rock, R&B, country, soul, ballad, hip-hop, trap, singer-songwriter and funk.

Analysis

The full network has now been created, and we are ready to do visualisations and analyses. In the following sections, we will be working with the full network and the sub-networks described above. We will be investigating each network both in full and in a version where singleton nodes with fewer than five songs are removed.

The reasoning for only removing singleton nodes with fewer than five songs is that we want to make the networks as clear as possible while still retaining the singleton artists who are influential for the genre at hand.
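A sketch of this pruning step, assuming each node stores its song count in a 'size' attribute as described in the node-attribute section (the helper name is our own):

```python
import networkx as nx

def remove_small_singletons(G, min_songs=5):
    """Drop singleton nodes (degree 0) whose 'size' attribute,
    i.e. number of songs, is below min_songs."""
    H = G.copy()
    H.remove_nodes_from([n for n, deg in G.degree()
                         if deg == 0 and G.nodes[n].get('size', 0) < min_songs])
    return H

G = nx.Graph()
G.add_node('one-hit wonder', size=1)    # singleton, few songs: removed
G.add_node('prolific loner', size=12)   # singleton, many songs: kept
G.add_edge('collab a', 'collab b')      # connected nodes: kept
H = remove_small_singletons(G)
```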

NB: Networks are not meant to be looked at here in the notebook but rather in the network section on the website.

With singletons

From these basic statistics we see that the number of nodes in the networks is 7854 and the number of links is 6799.

The density of an undirected graph is given by:

\begin{align} d=\frac{2m}{n(n-1)}, \end{align}

where $m$ is the number of edges, and $n$ is the number of nodes. The interpretation of the measure is that the density is 0 for a graph without edges and 1 for a completely connected graph and is, therefore, a measure of how dense a graph is with respect to edge connectivity. In this case, the network has a density of 0.00022. This can be a little hard to interpret, which is why we've also calculated the average clustering coefficient, which is given by:

\begin{align} \overline{C}=\frac{1}{N} \sum_{i=1}^N \frac{2L_i}{k_i(k_i-1)}, \end{align}

where $L_i$ is the number of links between the $k_i$ neighbours of node $i$. The interpretation of this measure is the probability that two neighbours of a randomly selected node link to each other. For this network, we have an average clustering coefficient of 0.16.

Lastly, we see that the average degree of the nodes in the graph is 1.73, which means that a node on average is connected to 1.73 other nodes. We also see that the minimum, median and mode of degrees are 0, whereas the maximum degree is 108.
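The statistics above can be reproduced with networkx; a minimal sketch, shown here on networkx's built-in karate club graph rather than our artist network:

```python
import networkx as nx

def basic_stats(G):
    """Nodes, links, density (2m / n(n-1)), average clustering and
    average degree (2m / n), matching the statistics reported above."""
    n, m = G.number_of_nodes(), G.number_of_edges()
    return {
        'nodes': n,
        'links': m,
        'density': 2 * m / (n * (n - 1)),
        'avg_clustering': nx.average_clustering(G),
        'avg_degree': 2 * m / n,
    }

stats = basic_stats(nx.karate_club_graph())
```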

Analysis of degrees

We will now analyse the degrees of the network a bit more thoroughly by looking at the distribution of degrees on a log-log scale. The reasoning for this is that a common feature of real-world networks is hubs - meaning that a few nodes in a network are highly connected to other nodes. Scale-free networks are networks with large hubs, and a power-law degree distribution characterises such networks.
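A sketch of how the degree distribution can be prepared for such a log-log plot (the pairs can then be passed to, e.g., matplotlib's plt.loglog); the helper name is our own:

```python
from collections import Counter

import networkx as nx

def degree_distribution(G):
    """Return sorted (degree, count) pairs, ready for a log-log plot."""
    counts = Counter(d for _, d in G.degree())
    return sorted(counts.items())

# Toy example: a star graph has one hub of degree 5 and five leaves of degree 1.
dist = degree_distribution(nx.star_graph(5))
```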

Looking at the figure above, we see that the network's degree distribution does follow a power law, which gives a good indication that we are dealing with a real-world network rather than a random one.

Community detection

In this section, we will explore the communities of the network. To do this, we are looking at the partition obtained when grouping the artists by their genre. This will be compared to the partition obtained using the Louvain algorithm. To indicate whether the two partitions are good at dividing the network into modules, both of them will then be compared to random networks based on the real network. When making this comparison, we can see if the modularity of the two partitions is significantly different from 0.

First off, we will be getting the partitions based on the genres.

We will now be calculating the modularity of the network based on the partitioning obtained using the Louvain algorithm.
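A sketch of how both modularities can be computed with networkx (which provides `louvain_communities` from version 2.8), assuming nodes carry a 'genre' attribute; the toy graph here is our own example, not the artist network:

```python
import networkx as nx
from networkx.algorithms.community import louvain_communities, modularity

def genre_partition(G, attr='genre'):
    """Group nodes into communities by a node attribute."""
    groups = {}
    for n, data in G.nodes(data=True):
        groups.setdefault(data[attr], set()).add(n)
    return list(groups.values())

# Toy example: two triangles joined by a bridge, genres matching the triangles.
G = nx.Graph([('a', 'b'), ('b', 'c'), ('a', 'c'),
              ('d', 'e'), ('e', 'f'), ('d', 'f'), ('c', 'd')])
nx.set_node_attributes(
    G, {**{n: 'rap' for n in 'abc'}, **{n: 'pop' for n in 'def'}}, 'genre')

genre_mod = modularity(G, genre_partition(G))
louvain_mod = modularity(G, louvain_communities(G, seed=0))
```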

We initially see that the Louvain algorithm's modularity is more than twice as large as when using the genres.

Building random networks for comparison

Next up, we will generate 1000 random networks using the double edge swap algorithm. This makes each node in the new random network have the same degree as it had in the original network, but with different connections. For each of these random networks, we will partition them using the genres and calculate their modularities. We perform 1.2 × (number of edges) swaps to ensure we get an entirely randomised version of the network.
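A sketch of this degree-preserving rewiring with networkx's `double_edge_swap` (the helper name and the karate club example graph are our own):

```python
import networkx as nx

def random_rewiring(G, factor=1.2, seed=0):
    """Degree-preserving randomisation of G via double edge swaps,
    using factor x (number of edges) swaps as described above."""
    R = G.copy()
    m = R.number_of_edges()
    nx.double_edge_swap(R, nswap=int(factor * m), max_tries=100 * m, seed=seed)
    return R

G = nx.karate_club_graph()
R = random_rewiring(G)
```

Because only endpoints are swapped, every node keeps its original degree while the wiring is randomised.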

We see that the mean and standard deviation of the modularity are approximately 0, which is to be expected: as the networks are random, we shouldn't find any good partition using the genres.

To understand the genre partition and the Louvain algorithm partition, we will plot the distribution of the configuration model's modularity alongside the genre partition's modularity and the Louvain algorithm partition's modularity.

The figure above shows that both partitioning methods lead to a modularity significantly different from 0 and thereby larger than any of those from the random networks. We can thus deem that the network is not random through the modularity measure. Though, as touched upon previously, the modularity of the networks partitioned using the Louvain algorithm is more than twice the size of the genre partition. To understand how this partition looks, we will be visualising the graph with the Louvain partitioning.

Noticeable here is that the Louvain algorithm also groups many of the rap, pop, rock and country artists together into four separate groups, though in general, also a lot more groups are seen. Let's see just how many groups:

We see that the Louvain algorithm partitions the network into 4994 groups, which is an enormous number compared to the 7854 nodes in the graph. An explanation for this is that many singleton nodes are probably given their own group, which yields a high modularity but doesn't make much sense compared to partitioning by genre.

Betweenness centrality

As mentioned previously, we have decided to weigh the nodes in the network by the number of songs that the artist has in the data set. The advantage of this is that the most popular artists will be the easiest ones to see; this is especially the case for older artists who haven't collaborated as much - such as Elvis Presley or The Beatles. Artists like these would be virtually invisible if we weighted the nodes by the strength of their connections. Weighing nodes by the strength of their connections, however, tells a great deal about which nodes are the biggest collaborators and, thereby, some of the most central nodes in the graph.

We will therefore, in this section, deal with betweenness centrality, which measures how central each node in a graph is. The measure is based on shortest paths: the betweenness centrality of a node is the sum, over all pairs of other nodes, of the fraction of shortest paths between the pair that pass through the node. The formula for betweenness centrality is given by:

\begin{align} BC(n)=\sum_{s\neq v \neq t} \frac{\sigma_{s,t}(n)}{\sigma_{s,t}}, \end{align}

where $\sigma_{s,t}$ is the total number of shortest paths from node $s$ to node $t$ and $\sigma_{s,t}(n)$ is the number of those paths that pass through $n$.

Combining this with weighing the artists by the number of songs they have in the data set will give us a great overview of the most popular artists and the most central, collaboratory, and connective artists.
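As a small self-contained illustration of betweenness centrality (on a toy graph of our own, not the artist network), consider a path graph, where the middle node lies on the most shortest paths:

```python
import networkx as nx

# In the path graph 0-1-2-3-4, node 2 lies on every shortest path
# between the two halves, so it has the highest betweenness centrality.
G = nx.path_graph(5)
bc = nx.betweenness_centrality(G)  # normalised by default
most_central = max(bc, key=bc.get)
```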

Having calculated the betweenness centrality for each node, we see many rappers present in the top-20. This is not too surprising given the number of rap artists, their tendency to collaborate and the graph we were looking at earlier. Though we also see names like Quincy Jones, James Ingram and Stevie Wonder - it is interesting to see those artists playing a central part in the network.

Without singletons

The next part of the analysis for the full network is the version where we will be removing singleton nodes with less than five songs. The following section will go through the same steps as the complete network, so not everything will be described with the same level of detail.

Properties

Calculate basic statistics for the network

We have now gone down from 7854 to 4154 nodes compared to the full network while keeping the same number of edges. As expected, all the other network properties have gone up, meaning that with a larger density, avg. clustering and average degrees, we should now see a more densely connected network.

Analysis of degrees

Looking at the figure above, we again see that the degree distribution follows a power law.

Community detection

We will again build network communities using both the genres and the Louvain algorithm, both of which will be compared to random networks.

First off, we will be getting the partitions based on the genres.

We here see a modularity that is exactly the same as before. The formula for the modularity is given by (cf. eq. 9.12 of the Network Science book):

\begin{align} M= \sum_{c=1}^{n_c}\left[ \frac{L_c}{L}-\left(\frac{k_c}{2L} \right)^2 \right], \end{align}

where $n_c$ is the number of communities, $L_c$ is the number of links in community $c$, $L$ is the total number of links in the network, and $k_c$ is the total degree of community $c$. Therefore, this means that the modularity doesn't depend at all on the number of nodes, and since these are the only things removed from the full network, the modularity doesn't change.

We will now be calculating the network's modularity based on the partitioning obtained using the Louvain algorithm.

We initially see that the modularity obtained using the Louvain algorithm is almost the same as for the full network (0.7440). The small difference is due to the Louvain algorithm being non-deterministic and not guaranteed to find the optimal partition. So, as for the full graph, the modularity of the Louvain partition is more than twice that of the genre partition.

Building random networks for comparison

Next up, we will generate 1000 random networks using the double edge swap algorithm. For each of these random networks, we will partition them using the genres and calculate their modularities.

We see that the mean and standard deviation of the modularity are approximately 0, which is to be expected: as the networks are random, we shouldn't find any good partition using the genres.

To understand the genre partition and the Louvain algorithm partition, we will plot the distribution of the configuration model's modularity alongside the genre partition's modularity and the Louvain algorithm partition's modularity.

The figure above shows that both partitioning methods lead to a modularity significantly different from 0 and thereby also larger than any of those from the random networks. Though, as touched upon previously, the modularity of the networks partitioned using the Louvain algorithm is more than twice the size of the genre partition. To understand how this partition looks, we will be visualising the graph with the Louvain partitioning.

As with the previous Louvain graph, the algorithm groups the main clumps of nodes together quite well. However, noticeable is that the rappers are divided into two groups (light green and black).

Let's see how many groups we have in this partitioning:

We here see that the Louvain algorithm partitions the network into 1293 groups, which is far fewer than the 4994 of the last Louvain network. This means that the number of communities is reduced by 4994 - 1293 = 3701. Having lost 7854 - 4154 = 3700 nodes when removing singletons, this confirms that the Louvain algorithm gives all singleton nodes their own community.

Having now examined the full network for all genres for the musical artists, we will be moving on to analysing some of the most popular genres that we think are interesting.

Pop network

We're here looking at the network of artists who have at least one song with the tag pop in the data set. The size of the nodes is determined by the number of songs they have with the tag pop.

With singletons

Properties

Calculate basic statistics for the network

In comparison to the full network, the pop network has approximately 3000 fewer nodes and 2900 fewer links, but the density, average clustering and average degree haven't changed all that much.

Community detection

In this section, we will explore the communities of the pop network. We will go through the same steps as previously. First off, we will be getting the partitions based on the genres.

We here see a modularity which is lower than what it was for the full network.

We will now be calculating the modularity of the network based on the partitioning obtained using the Louvain algorithm.

The Louvain partition's modularity is seen to be quite a lot larger than the genre modularity.

Building random network for comparison

Next up, we will be generating 1000 random networks using the double edge swap algorithm. For each of these random networks, we will partition them using the genres and calculate their modularities.

We see that the mean and standard deviation of the modularity are approximately 0, which is to be expected: as the networks are random, we shouldn't find any good partition using the genres.

To get an overview of the genre partition and the Louvain algorithm partition, we will now plot the distribution of the configuration model's modularity alongside the genre partition's modularity and the Louvain algorithm partition's modularity.

Looking at the figure above, we see that both partitioning methods lead to a modularity significantly different from 0 and thereby also larger than any of those from the random networks. Though, as touched upon previously, the modularity of the network partitioned using the Louvain algorithm is much larger than the genre partition. To understand how this partition looks, we will be visualising the graph with the Louvain partitioning.

Noticeable here is that the Louvain algorithm manages to divide the pop artists into communities that make decent sense. E.g. some of the rappers and R&B artists are grouped as red nodes, whereas female artists like Taylor Swift are seen in very light green and other artists like Beyoncé and Rihanna in light green. Very interesting.

Let's see the communities we have in total:

We here see that the Louvain algorithm partitions the network into 3328 groups, which is quite a lot compared to the 4802 nodes in the graph. Again, the large number of singleton nodes is likely the explanation.

Without singletons

This brings us to the next analysis for the pop network: the version where we remove singleton nodes with fewer than five songs. The following section will go through the same steps as previously.

Properties

Calculate basic statistics for the network

Compared to the pop network with singletons, we have gone down from 4802 to 2218 nodes while keeping the same number of edges. As expected, all the other network properties have gone up, meaning that with a larger density, average clustering and average degree, we should now see a more densely connected network.

Community detection

We will again detect communities of the network using both the genres and the Louvain algorithm, both of which will be compared to random networks.

First off, we will be getting the partitions based on the genres.

We will now be calculating the modularity of the network based on the partitioning obtained using the Louvain algorithm.

We initially see that the modularity obtained using the Louvain algorithm is almost the same as for the full network (0.7053); the small difference is due to the Louvain algorithm being a non-deterministic heuristic rather than an exact optimiser. So, as for the full graph, the modularity of the Louvain partition is more than twice that of the genre partition.

Building random networks for comparison

Next up, we will generate 1000 random networks using the double edge swap algorithm. We will partition each of these random networks using the genres and calculate their modularities.
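The core of the double edge swap is a degree-preserving rewiring step: pick two edges (u, v) and (x, y) and replace them with (u, x) and (v, y). Below is a minimal pure-Python sketch of one such step on a toy edge list; the notebook relies on networkx's `double_edge_swap` for the real networks.

```python
import random

random.seed(0)
edges = [("A", "B"), ("C", "D"), ("E", "F"), ("G", "H")]  # toy edge list

def one_swap(edge_list):
    """Pick two edges (u, v) and (x, y) and rewire them to (u, x) and
    (v, y), provided this creates no self-loop or duplicate edge."""
    es = set(edge_list)
    for _ in range(100):  # retry until a valid swap is found
        (u, v), (x, y) = random.sample(sorted(es), 2)
        if len({u, v, x, y}) < 4:
            continue  # would create a self-loop
        if {(u, x), (x, u), (v, y), (y, v)} & es:
            continue  # would create a duplicate edge
        es -= {(u, v), (x, y)}
        es |= {(u, x), (v, y)}
        break
    return sorted(es)

swapped = one_swap(edges)
print(swapped)  # same degree sequence, different wiring
```

Because every node keeps its degree, repeating this step many times yields a random network that is directly comparable to the original, which is exactly what makes the modularity comparison fair.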

We see that the mean and standard deviation of the modularity are effectively 0, which is to be expected: the networks are random, so the genre partition should not capture any meaningful community structure.

To get an overview of the genre partition and the Louvain algorithm partition, we will now plot the distribution of the configuration model's modularity alongside the genre partition's modularity and the Louvain algorithm partition's modularity.

Looking at the figure above, we see that both partitioning methods lead to a modularity significantly different from 0 and larger than any of those from the random networks. However, as touched upon previously, the modularity of the network partitioned using the Louvain algorithm is much larger than that of the genre partition. To understand what this partition looks like, we will visualise the graph with the Louvain partitioning.

As with the previous Louvain graph, the algorithm manages to group the main clumps of nodes together quite well.

Let's see how many groups we have in this partitioning:

We here see that the Louvain algorithm partitions the network into 740 groups, far fewer than the 3328 of the last Louvain partition. The number of communities is reduced by 3328 - 740 = 2588, and since we lost 4802 - 2218 = 2584 nodes when removing singletons, we again see that the Louvain algorithm gives practically every singleton node its own community.

Retrieving statistics and visualisations for the remaining genres

For the remaining genres (rap, rock, R&B, country, soul, ballad, hip-hop, trap, singer-songwriter and funk), we will gather statistics and make visualisations of the networks with and without singletons, using both the genre community partition and the Louvain community partition, as this information will be used on the website. However, these results will not be shown here in the notebook, as they would simply take up too much space.

The following function takes a genre and a graph, then computes and saves statistics and a network visualisation for both the genre partition and the Louvain partition, for the graph with and without singletons.

Text Analysis

This part of the notebook will contain different analyses of the song lyrics. The main methods are TF-IDF scores (used to create word clouds), sentiment analysis, dispersion plots, and lastly LSA, which will be performed to calculate similarities between artists. Most of these methods will be applied in multiple scenarios. In general, the songs will be analysed with respect to the decade in which they were released and also according to the genre to which they belong.

Preprocessing lyrics

Before conducting any analysis, the lyrics are preprocessed to prepare the data. All lyrics are tokenized and lemmatized using nltk, and all tokens containing a non-alphabetic character are removed. All characters are made lowercase, and for every song, each word is only counted once. This is done since it is typical for songs to contain a lot of repetition (as it makes the lyrics easier to remember).
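A simplified sketch of that pipeline, with a plain `split` standing in for nltk's tokenizer and the lemmatization step omitted; the lyric snippet is made up:

```python
# Toy lyric line (not from the dataset) run through a simplified
# version of the preprocessing: lowercase, alphabetic tokens only,
# each word counted once per song.
lyrics = "Oh baby, baby / Oh baby, baby / How was I supposed to know?"

tokens = [w.lower() for w in lyrics.replace("/", " ").split()]
tokens = [w.strip(",?.!") for w in tokens]   # crude punctuation stripping
tokens = [w for w in tokens if w.isalpha()]  # drop non-alphabetic tokens
unique_tokens = sorted(set(tokens))          # one count per word per song
print(unique_tokens)
```

Deduplicating per song is what keeps a chorus repeated eight times from dominating the term frequencies.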

Fraction of genres pr. decade

Since the data stems from the Billboard hot 100 chart, it is possible to show how dominant some genres have been over time. The figure below shows how much of the music on the chart was labelled as the given genre in each decade. Note that most songs have plenty of genre tags, so the ratios do not sum to 1 (also, only the most popular genres are shown).

This graph and the table above illustrate a clear trend. Pop has been dominating for a long time, but since 2010 rap has overtaken the throne. Nowadays, even a "subgenre" of rap, namely trap, has become more popular than pop. Another interesting fact is that rock has almost completely vanished from the charts in the last decade, whereas folk has remained consistent throughout time. This graph also illustrates when rap started gaining traction in the US around the eighties.

TF-IDF & Wordcloud

The TF-IDF (term frequency, inverse document frequency) score measures how characteristic a term is of a document. In this study, terms are words in the song lyrics, and documents can be decades, genres or artists, depending on the scenario we are interested in analysing. The TF is simply how many times a given term occurs in the document, and the IDF is a measure of how unique the term is, given by:

\begin{equation} \text{idf}(t, D) = \log\left(\frac{N}{|\{d\in D : t\in d\}|}\right) \end{equation}

where $t$ is a term and $D$ is the set of documents, denoted as the corpus. The TF-IDF is the product of TF and IDF, meaning that terms are most important if they frequently occur in the given document while appearing in few other documents.
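As a worked toy example of the formula (the corpus below is invented, not the song data): TF is the raw count of the term in a document, and IDF is computed exactly as above.

```python
import math

# Three tiny "documents" (genres) as word lists; purely illustrative.
docs = {
    "pop":  ["love", "love", "dance", "night"],
    "rap":  ["money", "love", "street"],
    "rock": ["guitar", "night", "street"],
}
N = len(docs)  # number of documents in the corpus

def tfidf(term, doc):
    tf = docs[doc].count(term)                       # raw term frequency
    df = sum(1 for d in docs.values() if term in d)  # document frequency
    return tf * math.log(N / df)

print(round(tfidf("love", "pop"), 3))   # frequent, but shared with rap
print(round(tfidf("dance", "pop"), 3))  # occurs once, but unique to pop
```

Note how "dance" can outscore a more frequent word once the IDF penalty for appearing in other documents kicks in; that is the effect that surfaces genre-specific vocabulary in the word clouds.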

Genres

The data contains 582 genres. Many of these are sub-genres of the main genres we all know and love. Notably, many songs are tagged with several different genres. This is handled by assigning the song to all genres it is tagged with. This creates some overlap between the genres, but this is only an issue for subgenres. Using all genres is thus not desirable, since it is not relevant how pop relates to dance-pop or alternative-pop, but it is relevant how pop relates to rap and rock. Therefore, the genres which constitute the corpus were hand-selected from those which appear the most from 1960 to 2022.

NOTE: The next section's output has been limited so as not to clutter the notebook too much. If you want to see the full output, you can view it under the Genres part of the Text Analysis section on the webpage.

As is evident by the output above, the TF-IDF scores succeed in highlighting a lot of the characteristics of the different genres.

Wordclouds are useful for illustrating the important terms, since the importance corresponds to the font size of the term. This makes for a nice visual representation which grants a much clearer overview of the similarities and differences between documents (in this case, genres).

As a small note, the word clouds are displayed with masks of well-known musicians from the given genre. The original images are transparently overlaid to aid the image's clarity. These images are used on the website, and to avoid any ugly background, a helper function for removing backgrounds is implemented.

The masks have been chosen somewhat arbitrarily, but hopefully some artists are recognisable. Looking at the word cloud for country, an extremely clear tendency is evident: all terms with a significant TF-IDF score describe everyday activities relevant to farmers in the US and the like. The UK word cloud contains a lot of British slang such as mum, paigons, blud and ting, and the rap word cloud is all about the harsh language the genre is known for today.

Decade

The same procedure is then done while instead dividing the songs according to the decade in which they were released.

NOTE: The next section's output has been limited so as not to clutter the notebook too much. If you want to see the full output, you can view it under the Decades part of the Text Analysis section on the webpage.

In the '60s, '70s and '80s, most words are completely normal words which everyone might use in their everyday life. Some are perhaps more expressive than ordinary speech, but they are still real words. Also, some quite romantic words like tenderly are used. In the '60s, the word watusi was used a lot, as it is the name of a popular dance at the time. In the '70s, doggone is used a lot; in more recent times, it has been completely replaced with the term damn. In the seventies, the term nigger also has a high TF-IDF score, which is surprising, but the reason is that five different songs mention the word in the '70s and it is never mentioned in any other decade. In most of these songs, it is used to provoke.

The '90s almost seem like a transitioning time from the old school to the new school of mainstream music. That is when rap entered the music scene for good. In the '00s, mostly slang words fill the word cloud. These slang words are mainly attributed to the rap/hip-hop artists. Some examples are shawty and swag. Also, some of the most influential artists and producers appear, such as Ludacris and Darkchild.

Lastly, in the '10s and '20s, the word clouds are filled with ad-libs such as skrrt, brrt, ayy and baow, and modern slang/shorthands like opp meaning opponent, and hunnid meaning hundred.

Artists

Since there are 7855 artists in the dataset, the artists who will be considered in the corpus will be those who have managed to appear on the hot 100 chart at least ten times. This is done to achieve documents that actually can have different term frequencies for each term and also to show how well-known artists differ from each other in their use of words. Identically to the genres, some songs are shared by multiple artists (thank god for that; otherwise, there would be no network). This is handled in the same way, meaning that if two artists collaborate on a song, they both are assigned all the words in the song. This seems fair since putting one's name on a song automatically means you are associated with the whole song.

This is still quite a lot of musicians, so some of the most well-known artists have been selected for investigation. In total, there are 41 selected artists for whom a picture is available, making the word clouds nicer to look at! These artists are:

NOTE: The output of the next section has been limited in order not to clutter the notebook too much. If you want to see the full output, you can view it under the Artists part of the Text Analysis section on the webpage.

These word clouds tell much the same story as those of the genres and the decades. Musicians from the sixties and seventies (although also regarded as pop artists) use vastly different language than the musicians who thrive in today's mainstream music scene. One example is Frank Sinatra, who uses many long and very expressive words such as inconceivable or reminding. Another word which shows signs of the time when Frank Sinatra published his music is the word musical, which certainly was a thing which was more popular back in the day.

The mainstream rappers such as Juice Wrld use many swearwords and ad-libs. Juice Wrld died of an overdose at a very young age, and it is no secret that he was an addict. This makes sense, since his word cloud is overrun with drug references.

Another good comparison is that Elvis uses the word darling a lot, whereas popular pop and rap artists nowadays use the words bitch and hoe A LOT more. It is also clear that the audience has changed significantly through the years.

Dispersion plot

Dispersion plots are interesting as they can give an indication of when certain words were used in music throughout time. As the data table is sorted according to the release date, it is simple to create a dispersion plot of all the songs. A small modification to the nltk dispersion_plot function had to be implemented to allow for the xticks to be the decades. The function for plotting dispersion plots with custom xticks is shown below with the appertaining dispersion plot of certain handpicked words, which illustrate a shift in the language of the mainstream music scene.
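The data behind such a plot is simple: for each target word, the indices of the (date-sorted) songs that contain it. A toy sketch with made-up songs, not the actual chart data:

```python
# Songs sorted by release date, each reduced to its set of words.
songs = [
    ("1965", {"darling", "love"}),
    ("1978", {"boogie", "funky"}),
    ("1995", {"bitch", "love"}),
    ("2018", {"skrrt"}),
]

targets = ["darling", "bitch", "boogie"]

# For each target word, the positions (song indices) where it occurs;
# these positions are what the dispersion plot draws as tick marks.
positions = {
    w: [i for i, (_, words) in enumerate(songs) if w in words]
    for w in targets
}
print(positions)
```

Mapping the song indices back to release dates is what the custom xticks modification accomplishes in the notebook.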

One can spend an endless amount of time on interesting terms that define specific periods. Thus, the dispersion plot above is far from an exhaustive account of the trends which came and went throughout the last six decades. However, it tells an interesting story and illustrates the beginning and end of eras.

For example, it seems almost as if the sweet word darling was phased out during the nineties and replaced with the more degrading word bitch. boogie and funky also illustrate the rise and fall of funk music. It almost seems from the plot that it died out a bit in the late eighties and then came back in the nineties.

As rap hit the mainstream in the early nineties, the word nigga became a fixed part of the rap songs made by black rappers. The words swag and shorty followed around 2000-2010 but have become less used in recent years.

The word watusi is included as it is the name of a specific dance which was popular in the sixties. That is also easy to see in the dispersion plot; it is rarely used after 1970.

Sentiment analysis

Next, the sentiment of the genres, decades and artists is investigated. Here the labMT Hedonometer data from class is used as a lookup table for the sentiment of terms. The sentiment score ranges from 0 to 10, where 0 is extremely negative and 10 is extremely positive. The words are stored in a dictionary with their corresponding sentiment scores to allow for fast lookups. Lastly, the sentiment of a document is computed as a weighted average of the sentiment of all words in the given document which have a sentiment score in the Hedonometer data frame. All other words are removed so that they do not count towards the average sentiment score; otherwise, they would count as 0, i.e. as the most negative word one could imagine. Another option is to set those words to have sentiment 5 (the middle of the scale), but that may create a bias, since the actual average of the sentiment scores in the Hedonometer data is not 5.
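A minimal sketch of that computation, with a tiny invented stand-in for the labMT table (the real scores differ) and a plain average over the scored words; note how words without a score are skipped rather than counted as 0:

```python
# Hypothetical mini lookup table; the real labMT data has ~10,000 words.
labmt = {"love": 8.42, "happy": 8.30, "alone": 3.24, "night": 5.84}

def avg_sentiment(words):
    """Average sentiment over the words that have a labMT score."""
    scored = [labmt[w] for w in words if w in labmt]  # skip unknown words
    return sum(scored) / len(scored) if scored else None

doc = ["love", "skrrt", "alone", "night"]  # "skrrt" has no labMT score
print(round(avg_sentiment(doc), 2))
```

Returning `None` for documents with no scored words makes the missing-data case explicit instead of silently biasing the average.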

Genre

Once again, the focus is on the genres previously defined as being the most popular through time.

The results of the sentiment analysis are not very surprising. Most genres have about equal sentiment, but rap and trap have the lowest sentiment scores, albeit still above the average sentiment of all the words in the Hedonometer data. The happiest genres are jazz, soul, funk and country, closely followed by pop.

Decade

The same procedure is carried out now, focusing on the decades. However, the sentiment for each month is also calculated, along with a rolling one-year average to illustrate the finer nuances of the trend in sentiment. The rolling average is a moving mean over a window of the plotted points; it essentially smooths the curve to highlight the general trend of the data.
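The rolling mean can be sketched in a few lines (the notebook uses pandas' `Series.rolling` on the real monthly sentiment; the values below are made up, with a window of 3 standing in for the 12 months of a year):

```python
def rolling_mean(values, window):
    """Moving mean over a fixed-size window; output is shorter than the
    input by window - 1, as the first full window starts at index window - 1."""
    out = []
    for i in range(window - 1, len(values)):
        chunk = values[i - window + 1 : i + 1]
        out.append(sum(chunk) / window)
    return out

monthly = [5.0, 6.0, 5.5, 6.5, 5.0, 6.0]  # invented monthly sentiments
print(rolling_mean(monthly, 3))
```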

The plot displays what has already been established: lyrics seem to have become less happy through time, especially in recent years. Of course, this can also be linked to the rise of angrier genres such as rap and its offspring trap. An example was seen in the dispersion plot, where darling was used until the nineties, when bitch replaced it.

Artist

NOTE: The output of the next section has been limited in order not to clutter the notebook too much. If you want to see the full output, you can view it under the Artists part of the Text Analysis section on the webpage.

The distribution in light blue is over all 7855 artists. The green distribution is only over the 735 top artists. The plots show the tendency of old pop artists such as The Beatles and Frank Sinatra to have happier lyrics. In contrast, rappers fall within the left part of the distribution with the lowest average sentiment. In the middle, we see a lot of popular pop artists from the last two decades.

LSA

Latent semantic analysis is a method for processing text in which the relationship between documents and terms is analysed. In particular, it will be used to compute similarity scores between artists. The aim is to uncover which artists are most alike and which are least alike. Perhaps it will even indicate artists who have used the same ghost-writers. Since songs with collaborations are assigned to all collaborating artists, collaborators will be a lot more likely to be similar. That does not, however, mean that the results will be uninteresting. Also, as mentioned before, one should think twice about putting their name on a song with lyrics that do not fit their agenda. Cosine similarity is used since all artists are mapped into a D-dimensional space, where D corresponds to the total number of words in the vocabulary. In this case, D = 50,697, which is a lot!
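Cosine similarity itself reduces to a dot product over normalised vectors. A toy sketch with a four-word vocabulary (in the notebook the vectors live in the full 50,697-dimensional space, and LSA reduces that dimensionality first):

```python
import math

def cosine(a, b):
    """Cosine of the angle between two word-count vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb)

# Hypothetical word counts over a shared four-word vocabulary.
artist_a = [3, 1, 0, 2]
artist_b = [2, 1, 0, 3]
artist_c = [0, 0, 5, 0]

print(round(cosine(artist_a, artist_b), 3))  # similar word usage
print(round(cosine(artist_a, artist_c), 3))  # no shared words -> 0.0
```

Because cosine similarity ignores vector length, a prolific artist and a one-hit artist can still score as similar if they favour the same words.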

To illustrate what can be done with this technique, the five artists most and least similar to Justin Bieber are shown above. The most similar artists are pop artists. Chris Brown and Drake belong to r&b and rap, respectively. However, it can indeed be argued that they are quite "pop-y". It should also be noted that Taylor Swift and Justin Bieber have not collaborated on a song, so the bias is not completely ruining the similarity scores. Looking at the least similar artists, it is a mix of different genres. K.A.A.N. is a rapper, and Kali Uchis is a modern r&b artist.

NOTE: The output of the next section has been limited in order not to clutter the notebook too much. If you want to see the full output, you can view it under the Artists part of the Text Analysis section on the webpage.

4. Discussion

Overall, we are quite satisfied with the results of the project. We have been able to find interesting attributes for collaborations of artists via our network analysis, and our text analysis shows how the language of the songs we listen to has changed throughout the years, but also from artist to artist and genre to genre.

The custom styling for the website that we created had a huge role in displaying the networks and text analysis parts without overwhelming the reader with a mile-long page. If time had permitted it, we would have liked to delve even deeper into the website, adding small features and making the layout even better. One such feature would be to search for artists by name and have their label appear in the networks.

Using the network theory from the course, we have created thorough analyses of the different networks for each genre. Furthermore, we expanded on the course material by calculating the betweenness centrality of the networks to see which artists were more collaborative or central than others. In addition to this, we also investigated LSA to find similar and dissimilar artists. One element of the network analysis which became apparent later on in the project was that the edges in the networks are not genre specific. This means that in the ballad network, Kanye West and Lil Wayne will have a connection since they have collaborated, but it was not necessarily on a song with the ballad tag. There is no doubt this would reduce the number of edges in the networks of genres which are not rap, pop and r&b, and that would also be interesting to investigate. Still, since the decision was made to scale nodes by the number of songs with the given genre, the significant nodes for a given genre still stand out the most.

Unfortunately, an early look into the lexical diversity of the lyrics did not show much. Thus, it was not prioritised as highly as the other text analysis aspects. Given more time, it would be interesting to look into this thoroughly. The tendency throughout all comparisons of lexical diversity was that artists, decades and genres were heavily dependent on the varying length of the appertaining documents.

Another interesting point is that the genres themselves might have changed through the decades. This is not something we have looked into, but it could be done with our data; at least the pop genre has a significant number of songs throughout all decades. For example, pop music has changed a lot through the years, starting as a mix of rock and R&B, becoming more disco-oriented, and today being heavily influenced by rap and electronic music. Our analysis only looked at mainstream music through each decade, which leaves out some information about how the individual genres evolved. The change in rap word clouds from the genre's origins in the 1980s to 2020 is certainly also an interesting topic to look into.