-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
189 lines (156 loc) · 7.98 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os , re , requests,json
from time import sleep
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from selenium import webdriver
from get_chrome_driver import GetChromeDriver
from datetime import datetime
from markitdown import MarkItDown
import traceback
from typing import Literal
# Load variables from a .env file (if any) into the process environment.
load_dotenv()

# Settings for the DeepL translation request and the Discord webhook post.
DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")
DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL")
GLOSSARY_ID = os.getenv("GLOSSARY_ID")

# Values the scraper compares with the newest article title to decide
# whether there is anything new to post (None when the variable is unset).
SCRAPING_LOG = os.getenv("SCRAPING_LOG")
SCRAPING_BETA_LOG = os.getenv("SCRAPING_BETA_LOG")
class Scraper:
"""
Minecraftの更新情報をスクレイピングしてDiscordに投稿するクラスです
"""
def __init__(self, type: Literal["Release", "Beta-and-Preview"], username="Minecraft Release Changelog", avatar_url="https://raw.githubusercontent.com/AKHstudio/mc-update-discord/refs/heads/main/icon/command_block.png"):
"""
Minecraftの更新情報をスクレイピングしてDiscordに投稿するクラス
"""
self.type = type
if type == "Release":
self.url = "https://feedback.minecraft.net/hc/en-us/sections/360001186971-Release-Changelogs"
elif type == "Beta-and-Preview":
self.url = "https://feedback.minecraft.net/hc/en-us/sections/360001185332-Beta-and-Preview-Information-and-Changelogs"
else:
print("typeはReleaseかBeta-and-Previewを指定してください。")
exit(1)
self.username = username
self.avatar_url = avatar_url
# get_driver = GetChromeDriver()
# get_driver.install()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
# driver
self.driver = webdriver.Chrome(options=options)
# 定義
self.new_post = None
def deepl_translate(self,text: str | list[str]):
"""
DeepLを使って翻訳する関数
"""
source_lang = 'EN'
target_lang = 'JA'
url = "https://api-free.deepl.com/v2/translate"
headers = {
"Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}"
}
params = {
"text": text,
"source_lang": source_lang,
"target_lang": target_lang,
"glossary_id": GLOSSARY_ID
}
response = requests.post(url, headers=headers, data=params)
# print(response.json())
sleep(1)
return response.json()
def get(self):
"""
スクレイピングしてDiscordに投稿する関数
"""
try:
# スクレイピング
self.driver.get(self.url)
# ページのソースを取得
html = self.driver.page_source
# BeautifulSoupでパース
soup = BeautifulSoup(html, "html.parser")
# 新しい投稿を取得
if self.type == "Release":
self.new_post = soup.find(class_="article-list-link", string=re.compile("Bedrock"), href=True)
else:
self.new_post = soup.find(class_="article-list-link" ,href=True)
if not self.new_post:
print("投稿が見つかりませんでした。")
exit(1)
# ログと取得した投稿が一致しているか確認
if SCRAPING_LOG == self.new_post.text and self.type == "Release":
print("新しい投稿はありません。")
exit(0)
elif SCRAPING_BETA_LOG == self.new_post.text and self.type == "Beta-and-Preview":
print("新しい投稿はありません。")
exit(0)
else:
# 投稿をスクレイピング
self.driver.get('https://feedback.minecraft.net' + self.new_post["href"])
html = self.driver.page_source
soup = BeautifulSoup(html, "html.parser")
artical_body = soup.find(class_="article-body")
if not artical_body:
print("artical_bodyが取得できませんでした。")
exit(1)
for img in artical_body.find_all("img", src=True):
img["src"] = "https://feedback.minecraft.net" + img["src"]
# h1 タグを取得
features = artical_body.find("h1")
if not features:
print("featuresが取得できませんでした。")
exit(1)
# h1 の次の要素を取得し続ける
extracted_html = [features.prettify()]
current = features.find_next_sibling()
while current:
extracted_html.append(str(current))
current = current.find_next_sibling()
if current and current.name in ["h1" , "footer"]: # 次の h1 が来たら終了
break
extracted_html = "".join(extracted_html)
extracted_soup = BeautifulSoup(extracted_html , "html.parser")
text_elements = [tag for tag in extracted_soup.find_all(string=True) if tag.parent.name not in ["script", "style" , "code"] and tag.strip() and tag not in ["MCPE" , "©"]]
translate_texts = self.deepl_translate([text.strip() for text in text_elements])
for text , translate_text in zip(text_elements , translate_texts["translations"]):
text.replace_with(translate_text["text"])
with open("changelog.html" , "w" , encoding="UTF-8") as f:
f.write(extracted_soup.prettify())
markitdown = MarkItDown()
changelog = markitdown.convert("changelog.html")
farst_p = artical_body.find("p").text
if not farst_p:
print("farst_pが取得できませんでした。")
print(farst_p)
exit(1)
date_obj = datetime.strptime(farst_p.replace("Posted:" , "").strip(), '%d %B %Y')
date_str = date_obj.strftime('%Y/%m/%d')
discord_webhook_data = {
"content": f"# {self.new_post.text}\nhttps://feedback.minecraft.net{self.new_post['href']}\n\n**投稿日 : {date_str}**\n" + re.sub(r"[*+]" , "-" , changelog.text_content) + "\n### <:snail:1232230937681596426> and more...",
"username": self.username,
"avatar_url": self.avatar_url
}
# Discordに投稿
response = requests.post(DISCORD_WEBHOOK_URL, data=json.dumps(discord_webhook_data), headers={"Content-Type": "application/json"})
# レスポンス確認
if response.status_code == 204:
print("Message sent successfully!")
else:
print(f"Failed to send message. Status code: {response.status_code}")
# ログを更新
if self.type == "Release":
with open("scraping-release.log", "w" , encoding="UTF-8") as f:
f.write(self.new_post.text)
else:
with open("scraping-beta.log", "w" , encoding="UTF-8") as f:
f.write(self.new_post.text)
except Exception as e:
print(f"エラーが発生しました: {e}")
traceback.print_exc()
exit(1)
finally:
self.driver.close()