Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions merge_csv_top200.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

import pandas as pd
import os
from tqdm import tqdm
from datetime import datetime, timedelta

def _parse_chart_filename(filename):
    """Extract ``(country_code, chart_date)`` from a chart CSV file name.

    The name is expected to consist of six dash-separated fields,
    ``<prefix>-<country>-<period>-<YYYY>-<MM>-<DD>.csv``; only the country and
    the three date fields are used.

    Returns:
        A ``(country, datetime)`` tuple, or ``None`` when the name does not
        follow that layout (wrong number of fields or a non-numeric/invalid
        date), so the caller can skip the file instead of crashing.
    """
    parts = filename.split('-')
    if len(parts) != 6:
        return None
    _, country, _, year, month, day = parts
    try:
        # The day field still carries the extension (e.g. "05.csv").
        chart_date = datetime(int(year), int(month), int(day.split('.')[0]))
    except ValueError:
        return None
    return country, chart_date


def merge_csv(source, destination):
    """Merge every chart CSV found in *source* into one CSV at *destination*.

    For each ``*.csv`` file whose name matches the layout understood by
    ``_parse_chart_filename``, the ``uri`` column is reduced to its last
    colon-separated segment and renamed to ``track_id``, and two columns are
    added from the file name: ``country_code`` and ``date``.  Files with an
    unexpected name are reported and skipped.

    Args:
        source: Directory containing the per-chart CSV files.
        destination: Path of the merged CSV to write (UTF-8 with BOM, no
            index column).  Its parent directory is created if missing.

    Raises:
        KeyError: If a matching CSV file has no ``uri`` column.
    """
    # Column order of the merged file; columns not listed here are appended
    # after these in the order they first appear.
    header = ['rank','track_id','artist_names','track_name','source','peak_rank','previous_rank','weeks_on_chart','streams', 'country_code','date']

    # Collect the frames and concatenate once at the end: concatenating inside
    # the loop re-copies the accumulated data on every iteration.
    frames = []

    # Sorted so the row order of the output does not depend on the
    # arbitrary order returned by os.listdir.
    for filename in tqdm(sorted(os.listdir(source))):
        if not filename.endswith('.csv'):
            continue

        # Country and date come from the file name.
        parsed = _parse_chart_filename(filename)
        if parsed is None:
            print(f'skipped (unexpected file name): {filename}')
            continue
        country, chart_date = parsed

        # Load the original data.
        df = pd.read_csv(os.path.join(source, filename), encoding='utf-8')

        # Keep only the track id from the uri and store it as 'track_id'.
        df['uri'] = df['uri'].str.split(':').str[-1]
        df = df.rename(columns={'uri': 'track_id'})

        # Add country and date information.
        df['country_code'] = country
        df['date'] = chart_date

        frames.append(df)
        print(filename)

    if frames:
        combined_df = pd.concat(frames, ignore_index=True)
        extra_columns = [col for col in combined_df.columns if col not in header]
        # Header columns first (created empty if no file provided them).
        combined_df = combined_df.reindex(columns=header + extra_columns)
    else:
        # Nothing to merge: still write a file containing just the header.
        combined_df = pd.DataFrame(columns=header)

    # Save the merged data to a new file.
    out_dir = os.path.dirname(destination)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    combined_df.to_csv(destination, index=False, encoding='utf-8-sig')

# Folder holding the individual chart CSV files to merge.
source = 'C:/Users/Yeojun/Downloads/'

# Folder and file name of the merged output.
directory = 'C:/Users/Yeojun/s4tify/weekly_Top_Songs/'
new_filename = "weekly_top200_combined.csv"
destination = os.path.join(directory, new_filename)

# Runs at module level, i.e. whenever this file is executed or imported.
merge_csv(source=source, destination=destination)
52 changes: 52 additions & 0 deletions weekly_top200_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time, os
from datetime import datetime, timedelta
from dotenv import load_dotenv
load_dotenv("C://Users/Yeojun/.env")
from tqdm import tqdm

def _weekly_dates(start, years):
    """Return ``YYYY-MM-DD`` strings from *start* backwards in one-week steps.

    The list begins with *start* itself and stops once a date would fall
    earlier than ``start - years * 365`` days.
    """
    oldest = start - timedelta(days=(years * 365))
    dates = []
    current = start
    while current >= oldest:
        dates.append(current.strftime("%Y-%m-%d"))
        current -= timedelta(weeks=1)
    return dates


def get_weekly_top200_songs(countries, year=3):
    """Click the CSV download control on Spotify weekly chart pages.

    Opens a headless Chrome session, logs in with the ``SPOTIFY_ID`` /
    ``SPOTIFY_PASS`` environment variables, then visits
    ``regional-<country>-weekly/<date>`` for every country and every weekly
    date in the requested window and clicks the page's download element.
    Pages on which that element cannot be found are reported and skipped.

    Args:
        countries: Iterable of region codes used in the chart URL
            (e.g. ``'global'``, ``'kr'``).
        year: Size of the look-back window in years (365 days each).

    Returns:
        None.  Where the browser stores the downloaded files is not
        configured here (presumably Chrome's default download folder —
        TODO confirm).

    Raises:
        RuntimeError: If either credential environment variable is unset
            or empty.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import NoSuchElementException

    # Fail before launching a browser; send_keys(None) would otherwise fail
    # much later with an unrelated-looking error.
    spotify_id = os.getenv("SPOTIFY_ID")
    spotify_pass = os.getenv("SPOTIFY_PASS")
    if not spotify_id or not spotify_pass:
        raise RuntimeError("SPOTIFY_ID and SPOTIFY_PASS must be set in the environment")

    # NOTE(review): the first date is simply "today minus five days"; this
    # only matches a real chart URL if charts are keyed on that weekday —
    # confirm which weekday the script is meant to be run on.
    dates = _weekly_dates(datetime.now() - timedelta(days=5), year)

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
        driver.get("https://charts.spotify.com/charts/view/regional-global-weekly/latest")
        time.sleep(2)  # fixed pause to let the page render

        # Open the login form from the page header.
        driver.find_element(By.XPATH, '//*[@id="__next"]/div/div/header/div/div[2]/a/span[1]').click()

        # send_keys returns None, so there is nothing worth keeping a
        # reference to here.
        driver.find_element(By.XPATH, '//*[@id="login-username"]').send_keys(spotify_id)
        driver.find_element(By.XPATH, '//*[@id="login-password"]').send_keys(spotify_pass)
        driver.find_element(By.XPATH, '//*[@id="login-button"]').click()
        time.sleep(3)  # fixed pause for the login to complete

        for country in tqdm(countries):
            for date in dates:
                driver.get(f"https://charts.spotify.com/charts/view/regional-{country}-weekly/{date}")

                time.sleep(2)

                try:
                    csv_download_button = driver.find_element(By.XPATH, '//*[@id="__next"]/div/div[3]/div/div/div[2]/span')
                except NoSuchElementException:
                    # One page without a download control must not abort
                    # the remaining countries and dates.
                    print(f"no download button for {country} {date}; skipped")
                    continue
                csv_download_button.click()

                time.sleep(2)

# Region codes whose weekly charts are crawled; the look-back window is left
# at the function's default.
countries = [
    'global',
    'kr',
    'us',
]
get_weekly_top200_songs(countries)
Loading