-
Notifications
You must be signed in to change notification settings - Fork 0
/
Billboard.py
160 lines (93 loc) · 4.73 KB
/
Billboard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python
# coding: utf-8
# # Spotify Streaming Pattern Analysis
#
# Author: Arun Ganapathy <br>
# Dated: 9th December 2022
#
# In this project, we aim to analyze Billboard chart data and Spotify track information to gain insights into popular music trends. The project involves the following key steps:
#
# 1. **Data Collection**: We retrieve historical Billboard chart data for pop songs from 2008 to 2021. This data includes details about artists, track titles, and years of charting.
#
# 2. **Data Cleaning and Transformation**: We preprocess the collected data by cleaning it, converting data types, and handling any discrepancies.
#
# 3. **Exploratory Analysis**: We explore the dataset to identify patterns, trends, and interesting insights related to artists, genres, and charting over the years.
#
# 4. **Genre Correction**: We correct and standardize the genre information using text processing techniques.
#
# 5. **Integration with Spotify Data**: We integrate Spotify track information with the Billboard data to enhance our analysis. This includes identifying popular artists and tracks based on Spotify metrics.
#
# 6. **Data Export**: After processing and analyzing the data, we export the final dataset to a CSV file for further use.
#
# Through this project, we aim to uncover how music popularity has evolved over the years, identify dominant genres, and highlight noteworthy artists and tracks within the given time frame.
#
# In[1]:
import billboard
import pandas as pd
from tqdm import tqdm
# In[2]:
# Inclusive range of years whose 'pop-songs' chart is collected.
FIRST_YEAR = 2008
LAST_YEAR = 2021
# In[3]:
# Column layout of the collected chart table: one row per chart entry.
BILLBOARD_COLUMNS = ['Artist', 'Title', 'Year']
# In[4]:
# Gather every (artist, title, year) record in a plain list and build the
# DataFrame once at the end. Appending with `df.loc[len(df)] = ...` grows the
# frame one row at a time, which gets slower as the table grows.
chart_rows = []
for year in tqdm(range(FIRST_YEAR, LAST_YEAR + 1)):
    # One request per year. No separate fetch is made before the loop because
    # `chart` is rebound here on the first iteration anyway.
    chart = billboard.ChartData('pop-songs', year=year)
    chart_rows.extend(
        (entry.artist, entry.title, year) for entry in chart.entries
    )
# Rows get the default 0..n-1 index, matching what the row-by-row append produced.
df = pd.DataFrame(chart_rows, columns=BILLBOARD_COLUMNS)
# In[5]:
# Export the chart table; this billboard data is later used during visual
# development. The row index is written as the first CSV column because
# `index=False` is not passed.
df.to_csv('data/billboard_08_21.csv')
# ## Data Cleaning
# In[6]:
# Load the Spotify track table that the cleaning steps below operate on.
data = pd.read_csv('data/final_data.csv')
data.head()
# In[7]:
# Cast the three text columns to str so the string operations and comparisons
# that follow see a uniform type.
text_columns = ("track_name", "genre", "artist_name")
data = data.astype(dict.fromkeys(text_columns, str))
# In[8]:
# Drop every track whose name contains a literal "?"; regex=False makes the
# question mark a plain character instead of a regex quantifier.
has_question_mark = data["track_name"].str.contains("?", regex=False)
data = data[~has_question_mark]
# In[9]:
# Collapse duplicate (track_name, artist_name) pairs into a single row.
# GroupBy.first() keeps, for each remaining column, the first non-null value
# found in the group; reset_index() then turns the two grouping keys back into
# ordinary columns.
tracks_by_name_and_artist = data.groupby(['track_name', 'artist_name'])
data = tracks_by_name_and_artist.first()
data = data.reset_index()
# In[10]:
# Pull out the rows credited to 'Cardi B' for a quick look at one artist.
is_cardi_b = data["artist_name"] == 'Cardi B'
cardi_b_data = data[is_cardi_b]
cardi_b_data.head()
# In[11]:
# Replace the typographic apostrophe in 'Children’s Music' with a plain one.
# regex=False states explicitly that this is a literal substring replacement,
# so the result does not depend on the pandas version's default for `regex`.
data['genre'] = data['genre'].str.replace(
    'Children’s Music', "Children's Music", regex=False
)
# In[12]:
# Importing the 're' module for working with regular expressions.
import re
# Relabel every row whose genre is exactly 'Reggae' as 'Reggaeton' with one
# masked assignment instead of visiting the rows one by one with iterrows().
data.loc[data['genre'] == 'Reggae', 'genre'] = 'Reggaeton'
# In[13]:
# Show the distinct genre labels that remain after cleaning.
unique_genres = set(data['genre'])
print(unique_genres)
# In[14]:
data.head(5)
# In[15]:
# Write the cleaned table to 'spotify_data.csv'; index=False keeps the row
# index out of the file.
data.to_csv('data/spotify_data.csv', index=False)
# ### End of data processing