Skip to content

Latest commit

 

History

History
281 lines (207 loc) · 8.94 KB

README.md

File metadata and controls

281 lines (207 loc) · 8.94 KB

Python-Lyrics-Visualizer

A Python program which helps to extract data from the Genius API and conduct data analysis and visualisation.

Initialization

I get the lyrics and data from the API offered by genius.com- is the world's biggest collection of song lyrics and musical knowledge. There is already a python binding for the genius API called the lyrics genius which helps you to achieve this but i wanted more control and understanding of the process so i decided to directly use the original API.

Once you register for the api the user will get 3 client keys, use them to start the API.

client_id = 'CLIENT_ID'
client_secret = 'CLIENT_SECRET'
client_token = 'CLIENT_TOKEN'

# Basic variables
orig_url = 'https://genius.com'
base_url = 'https://api.genius.com'
path = 'search/'
search = '/search?q='
header = {'Authorization':'Bearer '+client_token}
pattern =  '(?m)^\[.*\n?'
corpus_root = ''
file  ='song.txt'

song_list = []
album_list=[]
sort_type=[]
ssid=[]

Helpful projects like - https://github.com/dlarsen5/PyRap/blob/master/Retrieve_Lyrics.py and https://gist.github.com/imdkm/a60247b59ff1881fa4bb8846a9b44c96 were useful as i used some functions from there but modified them to fit my project.

Fetching Lyrics

There are 2 main functions whihc deal with the genius.com api and its resulting JSON data. properly nagviating it to get the appropriate information. in this section

def lyrics_fetch(song_api):
    try:
        song_url = base_url+song_api
        response = requests.get(song_url,headers=header)
        data = response.json()
        path = data['response']['song']['path']
        page_url = orig_url+path
        page = requests.get(page_url)

        path2 = data['response']['song']['album']['url']
       # print(path2)

        parsed = json.dumps(response.json(), indent=4)
        #print(parsed)

        #print(page_url)
        html  = bs(page.text,"html.parser")
        [h.extract() for h in html('script')]
        lyrics = html.find('div', class_='lyrics')


        song = html.find('h1')
        song =  song.get_text()
        name = html.find('h2')
        name = name.get_text()
        feat = html.find('h3')
        feat =  feat.get_text()
        details = song + name  + feat
        song_list.append(song)
        lyrics = details+ lyrics.get_text()
        #lyrics = lyrics.get_text()
        return lyrics
    except:
        print('error')



def get_lyrics(artist,num):
    try:
        artist_id = ''
        search_url = base_url + "/search?q=" + artist
        response = requests.get(search_url, headers=header)
        json = response.json()
        num_songs = num #input('Number of songs: ')
        for hit in json['response']['hits']:
            if hit['result']['primary_artist']['name'] == artist:
                artist_id = hit['result']['primary_artist']['api_path']
                break
        artist_url = base_url + artist_id + '/songs?sort=popularity&per_page=%s' % num_songs
        #artist_url = base_url + artist_id + '/songs?sort=title&per_page=%s' % num_songs
        #artist_url = base_url + artist_id + '/songs?sort=release_date&per_page=%s' % num_songs
        #print(artist_url)
        artist_response = requests.get(artist_url, headers=header)
        artist_json = artist_response.json()
        song_paths = {}
        song_lyrics = {}
        i = 0
        k=0
        lyrics_p = []
        lyrics =[]
        for song in artist_json['response']['songs']:
            song_paths[song['title_with_featured']] = song['api_path']
            if i < int(num_songs):
                lyrics_p.append( song['api_path'])
                i = i +  1
        for song,song_path in song_paths.items():
            #song_lyrics[song] = lyrics_fetch(song_path)
            lyrics.append(lyrics_fetch(song_path))

            img = image_fetch(artist,song_path,k)
           # print(img)
            get_album(song_path)
            k+=1
        return lyrics
    except:
        print('error')

These functions get the artist id ....

Getting Album Data

def get_album(song_api):
    try:
        song_url = base_url+song_api
        response = requests.get(song_url,headers=header)
        data = response.json()
        page_url = data['response']['song']['album']['url']
        page =   requests.get(page_url)

        #print(page_url)
        html  = bs(page.text,"html.parser")

        #[h.extract() for h in html('script')]

        lyrics = html.findAll('h3', class_='chart_row-content-title')
        #print(lyrics.get_text())
        for i in lyrics:
            lyr = i.get_text().split()
            lyr.remove('Lyrics')
            l = ' '.join(lyr)
           # print(l)



        img = html.findAll('img', class_='cover_art-image')
        #for i in img:
           # print(i['src'])

        #print(img)


        return img
    except:
        print('error')

Save album image

def image_fetch(artist,song_api,k):
    try:
        song_url = base_url+song_api
        response = requests.get(song_url,headers=header)
        json = response.json()
        path = json['response']['song']['song_art_image_url']
        artist =artist
        k=str(k)
        img_name = artist+k+'.jpg'
        file_path = 'static/images/'
        f = open(file_path+img_name,'wb')
        f.write(requests.get(path).content)
        f.close()

        #page = requests.get(page_url)
        return path
    except:
        print('error')

Getting song data

def _get(path, params=None, headers=None):

    # generate request URL
    requrl = '/'.join([base_url, path])
    token = "Bearer {}".format(client_token)
    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}

    response = requests.get(url=requrl, params=params, headers=headers)
    response.raise_for_status()

    return response.json()







def get_artist_songs(artist_id):
    # initialize variables & a list.
    current_page = 1
    next_page = True
    songs = []

    # main loop
    while next_page:

        path = "artists/{}/songs/".format(artist_id)
        params = {'page': current_page}
        data = _get(path=path, params=params)

        page_songs = data['response']['songs']

        if page_songs:
            # add all the songs of current page,
            # and increment current_page value for next loop.
            songs += page_songs
            current_page += 1
        else:
            # if page_songs is empty, quit.
            next_page = False

    # get all the song ids, excluding not-primary-artist songs.
    songs = [song["id"] for song in songs
             if song["primary_artist"]["id"] == artist_id]
    print(songs)

    return songs





def get_song_information(song_ids):
    # initialize a dictionary.
    song_list = {}

    # main loop
    for i, song_id in enumerate(song_ids):
        #print("id:" + str(song_id) + " start. ->")

        path = "songs/{}".format(song_id)
        data = _get(path=path)["response"]["song"]
        #print(data)

        song_list.update({
        i: {
            "title": data["title"],
            "album": data["album"]["name"] if data["album"] else "<single>",
            "release_date": data["release_date"] if data["release_date"] else "unidentified",
            "featured_artists":
                [feat["name"] if data["featured_artists"] else "" for feat in data["featured_artists"]],
            "producer_artists":
                [feat["name"] if data["producer_artists"] else "" for feat in data["producer_artists"]],
            "writer_artists":
                [feat["name"] if data["writer_artists"] else "" for feat in data["writer_artists"]],
            "genius_track_id": song_id,
            "genius_album_id": data["album"]["id"] if data["album"] else "none"}
        })
        #print(song_list)
        #print("-> id:" + str(song_id) + " is finished. \n")
        if song_list[i]['album'] == 'The Slim Shady LP':
            print(song_list[i]['title'] + str(song_id))
            ssid.append(song_id)
    return song_list





# find artist id from given data.
find_id = _get("search", {'q': artist})
for hit in find_id["response"]["hits"]:
   if hit["result"]["primary_artist"]["name"] == artist:
       artist_id = hit["result"]["primary_artist"]["id"]

       break