"""Merge downloaded Spotify weekly top-200 chart CSVs into one combined CSV."""

import os
import re
from datetime import datetime, timedelta

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration so the
    # merge itself never depends on tqdm being installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable


# Column order of the combined file. 'track_id' is derived from the 'uri'
# column of each input file; 'country_code' and 'date' come from the filename.
_HEADER = [
    'rank', 'track_id', 'artist_names', 'track_name', 'source', 'peak_rank',
    'previous_rank', 'weeks_on_chart', 'streams', 'country_code', 'date',
]

# Input files are named "<prefix>-<country>-<period>-<year>-<month>-<day>.csv".
_CHART_FILENAME = re.compile(
    r'[^-]+-(?P<country>[^-]+)-[^-]+-'
    r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)\.csv'
)


def merge_csv(source: str, destination: str) -> None:
    """Combine every chart CSV found in *source* into a single CSV file.

    For each file whose name has the form
    ``<prefix>-<country>-<period>-<year>-<month>-<day>.csv`` the function

    * loads the file,
    * replaces the ``uri`` column with ``track_id`` (the text after the last
      ``:`` of the URI),
    * adds ``country_code`` and ``date`` columns taken from the filename.

    Files that do not follow that naming scheme are skipped instead of
    aborting the whole merge. Files are processed in sorted name order so the
    output is reproducible.

    Args:
        source: Directory containing the downloaded chart CSV files.
        destination: Path of the combined CSV to write. Its parent directory
            is created when missing.
    """
    frames = []

    for filename in tqdm(sorted(os.listdir(source))):
        match = _CHART_FILENAME.fullmatch(filename)
        if match is None:
            # Not a chart export (e.g. an unrelated CSV in the same folder).
            continue

        # Load the original data.
        df = pd.read_csv(os.path.join(source, filename), encoding='utf-8')

        # Keep only the track id from the uri and store it as 'track_id'.
        df['uri'] = df['uri'].str.split(':').str[-1]
        df = df.rename(columns={'uri': 'track_id'})

        # Add the country and chart date encoded in the filename.
        df['country_code'] = match['country']
        df['date'] = datetime(
            int(match['year']), int(match['month']), int(match['day'])
        )

        frames.append(df)
        print(filename)

    if frames:
        # Concatenate once: concatenating inside the loop copies the growing
        # frame on every iteration.
        combined_df = pd.concat(frames, ignore_index=True)
        extra = [col for col in combined_df.columns if col not in _HEADER]
        combined_df = combined_df.reindex(columns=_HEADER + extra)
    else:
        combined_df = pd.DataFrame(columns=_HEADER)

    parent = os.path.dirname(destination)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Save the merged data to a new file. 'utf-8-sig' prepends a BOM
    # (presumably so spreadsheet tools detect the encoding -- confirm).
    combined_df.to_csv(destination, index=False, encoding='utf-8-sig')


directory = 'C:/Users/Yeojun/s4tify/weekly_Top_Songs/'
new_filename = "weekly_top200_combined.csv"

source = 'C:/Users/Yeojun/Downloads/'
destination = os.path.join(directory, new_filename)

if __name__ == "__main__":
    merge_csv(source, destination)
"""Download Spotify weekly top-200 chart CSVs using a headless Chrome session."""

import os
import time
from datetime import datetime, timedelta

from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

load_dotenv("C://Users/Yeojun/.env")

_LOGIN_LINK_XPATH = '//*[@id="__next"]/div/div/header/div/div[2]/a/span[1]'
_DOWNLOAD_BUTTON_XPATH = '//*[@id="__next"]/div/div[3]/div/div/div[2]/span'
# Upper bound, in seconds, to wait for a page element to become clickable.
_ELEMENT_TIMEOUT = 15


def _weekly_dates(year, now=None):
    """Return "YYYY-MM-DD" strings stepping back one week at a time.

    The first date is five days before *now* (default: the current time) and
    the last is no older than ``year * 365`` days before that first date.
    """
    # NOTE(review): the 5-day offset is not explained in the source; confirm
    # it lands on a date for which a weekly chart page exists.
    current_date = (now or datetime.now()) - timedelta(days=5)
    end_date = current_date - timedelta(days=(year * 365))

    dates = []
    while current_date >= end_date:
        dates.append(current_date.strftime("%Y-%m-%d"))
        current_date -= timedelta(weeks=1)
    return dates


def _wait_clickable(driver, xpath):
    """Block until the element at *xpath* is clickable and return it."""
    return WebDriverWait(driver, _ELEMENT_TIMEOUT).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )


def get_weekly_top200_songs(countries, year=3):
    """Log in to charts.spotify.com and click the CSV download on each chart.

    One chart page is visited per country and per weekly date; the browser
    saves each CSV to its own download location.

    Args:
        countries: Iterable of chart region codes used in the chart URL
            (e.g. ``'global'``, ``'kr'``, ``'us'``).
        year: How many years (counted as 365 days each) to go back.

    Raises:
        RuntimeError: If the ``SPOTIFY_ID`` or ``SPOTIFY_PASS`` environment
            variable is not set.
    """
    # Fail before launching a browser rather than typing None into the form.
    spotify_id = os.getenv("SPOTIFY_ID")
    spotify_pass = os.getenv("SPOTIFY_PASS")
    if not spotify_id or not spotify_pass:
        raise RuntimeError(
            "SPOTIFY_ID and SPOTIFY_PASS environment variables must be set"
        )

    dates = _weekly_dates(year)

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    with webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) as driver:
        driver.get("https://charts.spotify.com/charts/view/regional-global-weekly/latest")

        # Explicit waits replace fixed sleeps: each element is used as soon
        # as it is ready, and slow page loads no longer break the lookup.
        _wait_clickable(driver, _LOGIN_LINK_XPATH).click()

        _wait_clickable(driver, '//*[@id="login-username"]').send_keys(spotify_id)
        _wait_clickable(driver, '//*[@id="login-password"]').send_keys(spotify_pass)
        _wait_clickable(driver, '//*[@id="login-button"]').click()
        # Give the post-login redirect time to complete.
        time.sleep(3)

        for country in tqdm(countries):
            for date in dates:
                driver.get(f"https://charts.spotify.com/charts/view/regional-{country}-weekly/{date}")

                try:
                    _wait_clickable(driver, _DOWNLOAD_BUTTON_XPATH).click()
                except TimeoutException:
                    # A single missing chart must not abort the whole crawl.
                    print(f"No download button for {country} {date}; skipping")
                    continue

                # Leave time for the download to start before navigating on.
                time.sleep(2)


countries = ['global', 'kr', 'us']

if __name__ == "__main__":
    get_weekly_top200_songs(countries)