Add unregister with email smtp #452

Merged · 18 commits · Feb 21, 2024
1 change: 1 addition & 0 deletions .env.example
@@ -12,3 +12,4 @@ PORTAL_2FA_KEY=/[2-7A-Z]{16}/
DOCKERHUB_USERNAME=
DOCKERHUB_PASSWORD=
SENTRY_DSN=
PORTAL_JSESSIONID=
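The new PORTAL_JSESSIONID entry presumably holds the portal session cookie that the crawler attaches to every request (the COOKIES used throughout portal_crawler.py below); it is left blank here since .env.example only documents the expected keys.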
10 changes: 10 additions & 0 deletions apps/core/management/commands/crawl_portal_view.py
@@ -0,0 +1,10 @@
from django.core.management import BaseCommand

from apps.core.management.scripts.portal_crawler import crawl_view


class Command(BaseCommand):
help = "Crawl the view counts of portal notices"

def handle(self, *args, **options):
crawl_view()
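As a standard Django management command, this can be run manually or from a scheduler through the usual entry point (the command name is derived from the module's file name):

python manage.py crawl_portal_view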
189 changes: 161 additions & 28 deletions apps/core/management/scripts/portal_crawler.py
@@ -1,8 +1,7 @@
import hashlib
import re
import uuid
from datetime import datetime
from pytz import timezone as pytz_timezone
from datetime import datetime, timedelta

import boto3
import requests
@@ -11,10 +10,13 @@
from django.db import transaction
from django.utils import timezone
from django.utils.translation import gettext
from pytz import timezone as pytz_timezone
from tqdm import tqdm

from apps.core.models import Article
from apps.core.models.portal_view_count import PortalViewCount
from apps.user.models import UserProfile
from ara.log import log
from ara.settings import (
AWS_S3_BUCKET_NAME,
PORTAL_ID,
@@ -40,6 +42,7 @@
BASE_URL = "https://portal.kaist.ac.kr"

KST = pytz_timezone("Asia/Seoul")
PORTAL_NOTICE_BOARD_ID = 1


def _login_kaist_portal():
@@ -48,11 +51,18 @@ def _login_kaist_portal():
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page=1&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
print("_login_kaist_portal status code: ", response.status_code)
log.info(f"_login_kaist_portal status code: {response.status_code}")
return session


def _get_article(url, session):
def _list_link_to_full_link(link):
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
return full_link
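# e.g. a hypothetical list href of "/board/today_notice/12345" resolves to
# "https://portal.kaist.ac.kr/board/read.brd?cmd=READ&boardId=today_notice&bltnNo=12345&lang_knd=ko"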


def _get_portal_article(url, session):
def _already_hyperlinked(html):
soup = bs(html, "lxml")
tagged_links = []
@@ -92,6 +102,10 @@ def _enable_hyperlink(s):
def _get_new_url_and_save_to_s3(url, session):
if url.startswith("data:") or "." in url.split("/")[-1]: # not a portal image
return url

if url.startswith("/board"):
return f"https://{BASE_URL}/${url}"

enc = hashlib.md5()
enc.update(url.encode())
hash = enc.hexdigest()[:20]
@@ -128,18 +142,21 @@ def _save_portal_image(html, session):
.contents[0]
.strip()
)
created_at_str = (
soup.find("th", text="작성일(조회수)")
.findNext("td")
.contents[0]
.strip()
.split("(")[0]

created_at_view_count_str = (
soup.find("th", text="작성일(조회수)").findNext("td").contents[0].strip()
)

created_at_str = created_at_view_count_str.split("(")[0]
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S")
.astimezone(KST)
.astimezone(timezone.utc)
)

view_count_str = created_at_view_count_str.split("(")[1].split(")")[0]
view_count = int(view_count_str)
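# e.g. a hypothetical cell value of "2024.02.21 10:00:00(123)" yields
# created_at_str == "2024.02.21 10:00:00" and view_count == 123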

title = soup.select("table > tbody > tr > td.req_first")[0].contents[0]

trs = soup.select("table > tbody > tr")
@@ -170,13 +187,15 @@ def _save_portal_image(html, session):
"content": html,
"writer": writer,
"created_at": created_at,
"view_count": view_count,
}


def crawl_hour(day=None):
# If today() were evaluated directly in the parameter default, it would be computed once and cached, so the date would never update
if day is None:
day = timezone.datetime.today().date()
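# (a signature of `def crawl_hour(day=timezone.datetime.today().date())` would
# evaluate the default once, at import time, and reuse the stale date forever)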
log.info(f"crawl_hour running for day {day}")

session = _login_kaist_portal()

@@ -192,9 +211,9 @@ def _get_board_today(page_num):
dates = soup.select("table > tbody > tr > td:nth-child(5)")

if links:
print("------- portal login success!")
log.info("------- portal login success!")
else:
print("------- portal login failed!")
log.info("------- portal login failed!")

today_date = str(day).replace("-", ".")
for link, date in zip(links, dates):
@@ -224,7 +243,7 @@ def _get_board_today(page_num):

last_portal_article_in_db = (
Article.objects.filter(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
)
.order_by("-created_at")
.first()
@@ -234,11 +253,8 @@
prev_title = ""

for link in links:
link = link["link"]
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link["link"])
info = _get_portal_article(full_link, session)

# Since the list is time-ordered, subsequent articles were posted more than an hour ago.

@@ -269,21 +285,25 @@
)

article = Article(
parent_board_id=1,
parent_board_id=PORTAL_NOTICE_BOARD_ID,
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
created_at=created_at_utc,
url=full_link,
latest_portal_view_count=info["view_count"],
)

new_articles.append(article)

prev_title = article.title

# Compare the last portal article in the DB with the earliest of the just-crawled articles
if not new_articles:
log.info("no new articles")
return

earliest_new_article = new_articles[-1]
is_same_day = (
last_portal_article_in_db.created_at.date()
@@ -299,8 +319,21 @@

created_articles = Article.objects.bulk_create(new_articles)

new_portal_view_counts = []

for article in created_articles:
portal_view_count = PortalViewCount(
article=article,
view_count=article.latest_portal_view_count,
)
new_portal_view_counts.append(portal_view_count)

PortalViewCount.objects.bulk_create(new_portal_view_counts)

for i in range(len(created_articles)):
print(f"crawled article: {created_articles[i].title}")
log.info(f"crawled article: {created_articles[i].title}")

log.info(f"created {len(created_articles)} articles")


def list_contains_article(articles, article_info):
@@ -332,18 +365,16 @@ def _get_board(page_num):
page_num = 1

while True:
print("page_num:", page_num)
log.info("page_num:", page_num)
links = []
link = _get_board(page_num)
if link:
links.extend(link)

with transaction.atomic():
for link in tqdm(links):
board_id = link.split("/")[-2]
num = link.split("/")[-1]
full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko"
info = _get_article(full_link, session)
full_link = _list_link_to_full_link(link)
info = _get_portal_article(full_link, session)

user_exist = UserProfile.objects.filter(
nickname=info["writer"], is_newara=False
@@ -361,24 +392,126 @@
picture="user_profiles/default_pictures/KAIST-logo.png",
)

a, created = Article.objects.get_or_create(
parent_board_id=1,  # portal notice board
a, article_created = Article.objects.get_or_create(
parent_board_id=PORTAL_NOTICE_BOARD_ID,  # portal notice board
title=info["title"],
content=info["content"],
content_text=info["content_text"],
created_by=user,
url=full_link,
)

if created:
if article_created:
a.created_at = info["created_at"]
a.save()

log.info(info["view_count"])

# without a `defaults` argument both fields act as lookup keys, so a new
# (article, view_count) row is inserted only when that exact count has not
# been recorded yet
PortalViewCount.objects.update_or_create(
article=a,
view_count=info["view_count"],
)

page_num += 1

else:
break


def crawl_view():
"""
update all portal_view_count of portal articles
from a week ago until now
"""
now = timezone.datetime.today().date()
log.info(f"crawl_view running on {now}")

week_ago = (
(datetime.today() - timedelta(days=7)).astimezone(KST).astimezone(timezone.utc)
)

session = _login_kaist_portal()

def _get_board_week(page_num):
board_req = session.get(
f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC",
cookies=COOKIES,
)
soup = bs(board_req.text, "lxml")
table = soup.select(".req_tbl_01")[0]
info_list_per_page = []

for row in table.find("tbody").find_all("tr"):
cells = row.find_all("td")
created_at_str = cells[4].text.strip()
created_at = (
datetime.strptime(created_at_str, "%Y.%m.%d")
.astimezone(KST)
.astimezone(timezone.utc)
)

# the board is sorted by REG_DATIM descending, so once a row is older
# than a week every later row is too; tell the caller to stop paginating
if week_ago > created_at:
return info_list_per_page, True  # stop

info = {
"title": cells[0].text.strip(),
"view_count": int(cells[3].text.strip()),
"link": cells[0].find("a").attrs["href"],
"created_at": created_at,
}

info_list_per_page.append(info)

return info_list_per_page, False

info_list = []
page_num = 1

while True:
info_list_per_page, stop = _get_board_week(page_num)
info_list.extend(info_list_per_page)
if stop:
break

page_num += 1

if len(info_list) == 0:
log.info("no portal notice in a week")
return

articles = Article.objects.filter(
created_at__gte=week_ago, parent_board_id=PORTAL_NOTICE_BOARD_ID
)
article_dict = {}

for a in articles:
article_dict[a.url] = a

new_portal_view_counts = []
updated_articles = []

for info in info_list:
full_link = _list_link_to_full_link(info["link"])

if full_link not in article_dict:
continue

article = article_dict[full_link]

portal_view_count = PortalViewCount(
article=article,
view_count=info["view_count"],
)

new_portal_view_counts.append(portal_view_count)

article.latest_portal_view_count = info["view_count"]
updated_articles.append(article)

# write everything back in bulk: a bounded number of queries instead of
# one UPDATE/INSERT per crawled notice
Article.objects.bulk_update(updated_articles, ["latest_portal_view_count"])
PortalViewCount.objects.bulk_create(new_portal_view_counts)
log.info(f"crawled view count of {len(new_portal_view_counts)} portal notices")


if __name__ == "__main__":
_login_kaist_portal()
3 changes: 2 additions & 1 deletion apps/core/management/tasks.py
@@ -1,14 +1,15 @@
import time
from collections import defaultdict

from apps.core.management.scripts.portal_crawler import crawl_hour
from apps.core.management.scripts.portal_crawler import crawl_hour, crawl_view
from apps.core.management.scripts.reminder_email_for_reply import send_email
from apps.core.models import BestArticle
from ara import celery_app, redis


@celery_app.task
def crawl_portal():
crawl_view()
crawl_hour()
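Because crawl_portal is registered as a Celery task, it can also be triggered on demand via the standard Celery API, e.g. crawl_portal.delay(); the periodic schedule that fires it is assumed to live in the project's existing Celery beat configuration.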

