-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpttcrawler-bot_no-secret.py
153 lines (131 loc) · 5.91 KB
/
pttcrawler-bot_no-secret.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# coding: utf-8
from flask import Flask, request, abort
from linebot import LineBotApi, WebhookHandler
from linebot.exceptions import InvalidSignatureError, LineBotApiError
from linebot.models import MessageEvent, TextMessage, TextSendMessage
from bs4 import BeautifulSoup
import requests
# LINE channel credentials. Both values are placeholders and must be replaced
# with the real channel access token / channel secret before deployment.
channel_access_token = 'Put your token here' # author's token has been changed
channel_secret = 'Put your secret here' # author's secret has been changed
# Client used to send replies back through the LINE Messaging API.
line_bot_api = LineBotApi(channel_access_token)
# Webhook handler; dispatches incoming events to the functions registered
# with @handler.add and raises InvalidSignatureError on a bad signature.
handler = WebhookHandler(channel_secret)
# Flask application that exposes the webhook endpoints.
app = Flask(__name__)
@app.route('/')
def home():
    """Return a plain-text greeting showing that the service is reachable."""
    greeting = 'Hello! This is the Line Bot webhook service.'
    return greeting
# Listen for every POST request sent to /callback.
@app.route("/callback", methods=['POST'])
def callback():
    """Receive a LINE webhook call and hand it to the event handler.

    Returns:
        The string ``'OK'`` once the payload has been handled. The request
        is aborted with HTTP 400 when the handler reports an invalid
        signature.
    """
    # Signature sent by LINE in the X-Line-Signature header.
    line_signature = request.headers['X-Line-Signature']
    # Raw request body, decoded to text.
    payload = request.get_data(as_text=True)
    app.logger.info(f"Request body: {payload}")
    # Dispatch the webhook payload to the registered handlers.
    try:
        handler.handle(payload, line_signature)
    except InvalidSignatureError:
        abort(400)
    return 'OK'
# Crawler: fetch the latest article listing of a PTT board.
def crawl_ptt(board):
    """Scrape the index page of a PTT board.

    Args:
        board: Board name as it appears in the PTT URL (e.g. ``Stock``).

    Returns:
        A ``(ok, payload)`` tuple. When ``ok`` is True, ``payload`` is a list
        of dicts with the keys ``title``, ``link``, ``push_count`` and
        ``date``. When ``ok`` is False, ``payload`` is a user-facing error
        message (str).
    """
    try:
        app.logger.info(f"開始爬取 PTT {board}...")
        url = f'https://www.ptt.cc/bbs/{board}/index.html'
        headers = {'User-Agent': 'Mozilla/5.0'}
        cookies = {'over18': '1'}  # cookie that skips the over-18 confirmation
        # Bound the wait so a stalled server cannot hang the webhook worker.
        response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        # Anything other than 200 is reported as a failed crawl.
        if response.status_code != 200:
            app.logger.error(f"爬取失敗,狀態碼: {response.status_code}")
            return False, "爬取失敗,請檢查看板名稱,或是稍後再試"
        soup = BeautifulSoup(response.text, 'html.parser')
        posts = []  # collected article records
        # Each div.r-ent holds the title, push count and date of one article.
        for post in soup.select('div.r-ent'):
            title_elem = post.select_one('.title a')
            if title_elem:
                title = title_elem.text
                link = 'https://www.ptt.cc' + title_elem['href']
            else:
                # A deleted article has no link element.
                title = "(本文已被刪除)"
                link = "(無連結)"
            # Push count; a missing element or empty text is reported as '0'
            # instead of aborting the whole listing.
            nrec_elem = post.select_one('.nrec')
            push_count = nrec_elem.text.strip() if nrec_elem else ''
            push_count = push_count or '0'
            # Article date; tolerate a missing element for the same reason.
            date_elem = post.select_one('.meta .date')
            date = date_elem.text.strip() if date_elem else ''
            posts.append({
                'title': title,
                'link': link,
                'push_count': push_count,
                'date': date,
            })
        if not posts:
            app.logger.info(f"無法從 {board} 中抓取任何文章")
            return False, "目前沒有找到任何文章"
        # Record how many entries were collected.
        app.logger.info(f"抓取到的標題數量: {len(posts)}")
        return True, posts
    except Exception:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and log the traceback so the cause is visible.
        app.logger.exception("爬蟲過程中發生錯誤:")
        return False, "爬蟲過程中發生錯誤,請稍後再試"
def crawl_hotboards(k:int=10):
    """Scrape the PTT hot-boards page and describe the first ``k`` boards.

    Args:
        k: Maximum number of boards to return. Values below zero are
            treated as zero.

    Returns:
        A list of strings, one per board, containing the board name
        (left-aligned to 15 columns), the user count (right-aligned to
        4 columns) and the board class.
    """
    url = 'https://www.ptt.cc/bbs/hotboards.html'
    headers = {'User-Agent': 'Mozilla/5.0'}
    cookies = {'over18': '1'}  # cookie that skips the over-18 confirmation
    # Bound the wait so a stalled server cannot hang the webhook worker.
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Clamp k: a negative slice bound would silently mean "all but the last
    # |k| boards" rather than "no boards".
    limit = max(k, 0)
    my_boards = []
    for board in soup.select(".board")[:limit]:
        name = board.select_one(".board-name").text
        nuser = board.select_one(".board-nuser").text
        class_ = board.select_one(".board-class").text
        my_boards.append(f"{name:15s} {nuser.rjust(4)} {class_}")
    return my_boards
@handler.add(MessageEvent, message=TextMessage)
def handle_message(event):
    """Reply to a LINE text message with crawl results or usage help.

    Supported inputs:
        * ``爬蟲<board>``: list the latest articles of the given PTT board.
        * a message containing ``熱門`` plus a number: list that many hot
          boards (falls back to 10 when the number is missing or invalid).
        * anything else: reply with usage instructions.

    Args:
        event: The LINE message event; ``event.message.text`` and
            ``event.reply_token`` are read.
    """
    user_message = event.message.text
    try:
        if user_message.startswith("爬蟲"):
            # Everything after the keyword is the board name. Splitting on
            # whitespace drops leading/trailing blanks (so "爬蟲 Stock" does
            # not become "_Stock") and joins inner words with underscores.
            board = '_'.join(user_message[len("爬蟲"):].split())
            crawling_status, posts = crawl_ptt(board)
            if crawling_status:
                # Format one paragraph per article.
                reply = "".join(
                    f"{post['title']}\n{post['link']}\n"
                    f"推文數: {post['push_count']}\n日期: {post['date']}\n\n"
                    for post in posts
                )
                text = f"PTT {board} 最新文章標題:\n{reply}"
            else:
                # On failure crawl_ptt returns the error message itself.
                text = posts
        elif "熱門" in user_message:
            try:
                k = int(user_message.replace("熱門", ''))
            except ValueError:
                k = 0  # marks the count as invalid
            if k > 0:
                text = ""
            else:
                # Missing, non-numeric, zero or negative count: use default.
                k = 10
                text = "讀取個數輸入有誤,預設讀取最熱門10個看板\n"
            try:
                boards = crawl_hotboards(k)
            except Exception:
                # crawl_hotboards does no error handling of its own; make
                # sure the user still gets a reply when the crawl fails.
                app.logger.exception("爬取熱門看板時發生錯誤")
                text = "爬蟲過程中發生錯誤,請稍後再試"
            else:
                text += f"目前最熱門的{k}個PTT看板\n---------------------\n"
                text += "\n".join(boards)
        else:
            text = "若要進行PTT爬蟲,請輸入「爬蟲{看板英文名稱}」\n範例1:爬蟲Soft_Job\n範例2:爬蟲Stock"
            text += "\n查看目前熱門看板,請輸入「熱門{欲查看熱門看板數量}」\n範例:熱門15"
        line_bot_api.reply_message(
            event.reply_token,
            TextSendMessage(text=text)
        )
    except LineBotApiError as e:
        app.logger.error(f"LineBotApiError: {e}")
import os
if __name__ == "__main__":
    # Listen on all interfaces; the port is read from the PORT environment
    # variable and defaults to 5000 when it is not set.
    port = int(os.getenv('PORT', 5000))
    app.run(port=port, host='0.0.0.0')