From 245d9fc71186a6ef34f074ab01a17f323afa5595 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Wed, 18 Jan 2023 14:47:18 +0000 Subject: [PATCH 01/16] Refactor to save portal view count for new article --- .../core/management/scripts/portal_crawler.py | 45 ++++++++++++++++--- apps/core/management/tasks.py | 3 +- apps/core/migrations/0044_portalviewcount.py | 33 ++++++++++++++ apps/core/models/__init__.py | 1 + apps/core/models/portal_view_count.py | 19 ++++++++ 5 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 apps/core/migrations/0044_portalviewcount.py create mode 100644 apps/core/models/portal_view_count.py diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 641d64c1..c030cede 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -14,6 +14,7 @@ from tqdm import tqdm from apps.core.models import Article +from apps.core.models.portal_view_count import PortalViewCount from apps.user.models import UserProfile from ara.settings import ( AWS_S3_BUCKET_NAME, @@ -52,7 +53,7 @@ def _login_kaist_portal(): return session -def _get_article(url, session): +def _get_portal_article(url, session): def _already_hyperlinked(html): soup = bs(html, "lxml") tagged_links = [] @@ -123,18 +124,24 @@ def _save_portal_image(html, session): .contents[0] .strip() ) - created_at_str = ( + + created_at_view_count_str = ( soup.find("th", text="작성일(조회수)") .findNext("td") .contents[0] .strip() - .split("(")[0] ) + + created_at_str = created_at_view_count_str.split("(")[0] created_at = ( datetime.strptime(created_at_str, "%Y.%m.%d %H:%M:%S") .astimezone(KST) .astimezone(timezone.utc) ) + + view_count_str = created_at_view_count_str.split("(")[1].split(")")[0] + view_count = int(view_count_str) + title = soup.select("table > tbody > tr > td.req_first")[0].contents[0] trs = soup.select("table > tbody > tr") @@ -165,6 +172,7 @@ def _save_portal_image(html, 
session): "content": html, "writer": writer, "created_at": created_at, + "view_count": view_count, } @@ -226,6 +234,7 @@ def _get_board_today(page_num): ) new_articles = [] + new_portal_view_counts = [] prev_title = "" for link in links: @@ -233,7 +242,7 @@ def _get_board_today(page_num): board_id = link.split("/")[-2] num = link.split("/")[-1] full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko" - info = _get_article(full_link, session) + info = _get_portal_article(full_link, session) # Since it is time ordered, consequent ones have been posted more than 1 hour ago. @@ -273,7 +282,15 @@ def _get_board_today(page_num): url=full_link, ) + portal_view_count = PortalViewCount( + article=article, + view_count=info["view_count"], + ) + + new_articles.append(article) + new_portal_view_counts.append(portal_view_count) + prev_title = article.title # DB의 마지막 포탈글과 방금 크롤링한 글 중 가장 이른 글을 비교 @@ -293,6 +310,7 @@ def _get_board_today(page_num): new_articles.pop() created_articles = Article.objects.bulk_create(new_articles) + PortalViewCount.objects.bulk_create(new_portal_view_counts) for i in range(len(created_articles)): print(f"crawled article: {created_articles[i].title}") @@ -338,7 +356,7 @@ def _get_board(page_num): board_id = link.split("/")[-2] num = link.split("/")[-1] full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko" - info = _get_article(full_link, session) + info = _get_portal_article(full_link, session) user_exist = UserProfile.objects.filter( nickname=info["writer"], is_newara=False @@ -356,7 +374,7 @@ def _get_board(page_num): picture="user_profiles/default_pictures/KAIST-logo.png", ) - a, created = Article.objects.get_or_create( + a, article_created = Article.objects.get_or_create( parent_board_id=1, # 포탈공지 게시판 title=info["title"], content=info["content"], @@ -365,15 +383,28 @@ def _get_board(page_num): url=full_link, ) - if created: + if article_created: a.created_at = 
info["created_at"] a.save() + print(info["view_count"]) + + PortalViewCount.objects.update_or_create( + article=a, + view_count=info["view_count"], + ) + + page_num += 1 else: break +def crawl_view(): + # TODO: update all portal_view_count of portal articles + # from a week ago until now + pass + if __name__ == "__main__": _login_kaist_portal() diff --git a/apps/core/management/tasks.py b/apps/core/management/tasks.py index b88efe52..736d79f7 100644 --- a/apps/core/management/tasks.py +++ b/apps/core/management/tasks.py @@ -1,7 +1,7 @@ import time from collections import defaultdict -from apps.core.management.scripts.portal_crawler import crawl_hour +from apps.core.management.scripts.portal_crawler import crawl_hour, crawl_view from apps.core.management.scripts.reminder_email_for_reply import send_email from apps.core.models import BestArticle from ara import celery_app, redis @@ -9,6 +9,7 @@ @celery_app.task def crawl_portal(): + crawl_view() crawl_hour() diff --git a/apps/core/migrations/0044_portalviewcount.py b/apps/core/migrations/0044_portalviewcount.py new file mode 100644 index 00000000..bebfd697 --- /dev/null +++ b/apps/core/migrations/0044_portalviewcount.py @@ -0,0 +1,33 @@ +# Generated by Django 3.2.16 on 2023-01-18 14:20 + +import datetime +from django.db import migrations, models +import django.db.models.deletion +from django.utils.timezone import utc +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0043_board_comment_access_mask'), + ] + + operations = [ + migrations.CreateModel( + name='PortalViewCount', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now, verbose_name='생성 시간')), + ('updated_at', models.DateTimeField(auto_now=True, db_index=True, verbose_name='수정 시간')), + ('deleted_at', models.DateTimeField(db_index=True, 
default=datetime.datetime(1, 1, 1, 0, 0, tzinfo=utc), verbose_name='삭제 시간')), + ('view_count', models.IntegerField(default=0, verbose_name='조회수 값')), + ('article', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='게시물', to='core.article')), + ], + options={ + 'verbose_name': '포탈 조회 기록', + 'ordering': ('-created_at',), + 'abstract': False, + }, + ), + ] diff --git a/apps/core/models/__init__.py b/apps/core/models/__init__.py index 52eaaa10..0c04bbff 100644 --- a/apps/core/models/__init__.py +++ b/apps/core/models/__init__.py @@ -18,3 +18,4 @@ from .signals import * from .topic import * from .vote import * +from .portal_view_count import * diff --git a/apps/core/models/portal_view_count.py b/apps/core/models/portal_view_count.py new file mode 100644 index 00000000..634d2b18 --- /dev/null +++ b/apps/core/models/portal_view_count.py @@ -0,0 +1,19 @@ +from django.db import models +from ara.db.models import MetaDataModel + + +class PortalViewCount(MetaDataModel): + class Meta(MetaDataModel.Meta): + verbose_name = "포탈 조회 기록" + + article = models.ForeignKey( + on_delete=models.CASCADE, + to="core.Article", + null=False, + related_name="게시물", + ) + + view_count = models.IntegerField( + default=0, + verbose_name="조회수 값", + ) From c2c8f1e0bfcb627895c5c273a31067d337301cf3 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Wed, 8 Feb 2023 11:51:28 +0000 Subject: [PATCH 02/16] Add PORTAL_JSESSIONID to env example --- .env.example | 1 + 1 file changed, 1 insertion(+) diff --git a/.env.example b/.env.example index f3eacf15..0ded02ac 100644 --- a/.env.example +++ b/.env.example @@ -12,3 +12,4 @@ PORTAL_2FA_KEY=/[2-7A-Z]{16}/ DOCKERHUB_USERNAME= DOCKERHUB_PASSWORD= SENTRY_DSN= +PORTAL_JSESSIONID= \ No newline at end of file From d7e30d2be3922bbab31b2d7ceed484f08b44fecb Mon Sep 17 00:00:00 2001 From: retroinspect Date: Wed, 8 Feb 2023 14:13:52 +0000 Subject: [PATCH 03/16] Add hourly viewcount updating crawler - Hourly update viewcount of portal articles in a 
week - Save latest viewcount in Article model for fast retrieval --- .../core/management/scripts/portal_crawler.py | 128 +++++++++++++++--- ...45_article_add_latest_portal_view_count.py | 22 +++ apps/core/models/article.py | 11 ++ apps/core/models/portal_view_count.py | 12 +- 4 files changed, 150 insertions(+), 23 deletions(-) create mode 100644 apps/core/migrations/0045_article_add_latest_portal_view_count.py diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index c030cede..84f65955 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -1,8 +1,8 @@ import hashlib import re import uuid -from datetime import datetime from pytz import timezone as pytz_timezone +from datetime import timedelta, datetime import boto3 import requests @@ -53,6 +53,13 @@ def _login_kaist_portal(): return session +def _list_link_to_full_link(link): + board_id = link.split("/")[-2] + num = link.split("/")[-1] + full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko" + return full_link + + def _get_portal_article(url, session): def _already_hyperlinked(html): soup = bs(html, "lxml") @@ -181,6 +188,8 @@ def crawl_hour(day=None): if day is None: day = timezone.datetime.today().date() + print(f"crawl_hour running for day {day}") + session = _login_kaist_portal() def _get_board_today(page_num): @@ -238,10 +247,7 @@ def _get_board_today(page_num): prev_title = "" for link in links: - link = link["link"] - board_id = link.split("/")[-2] - num = link.split("/")[-1] - full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko" + full_link = _list_link_to_full_link(link["link"]) info = _get_portal_article(full_link, session) # Since it is time ordered, consequent ones have been posted more than 1 hour ago. 
@@ -280,22 +286,18 @@ def _get_board_today(page_num): created_by=user, created_at=created_at_utc, url=full_link, + latest_portal_view_count=info["view_count"], ) - portal_view_count = PortalViewCount( - article=article, - view_count=info["view_count"], - ) - - new_articles.append(article) - new_portal_view_counts.append(portal_view_count) prev_title = article.title # DB의 마지막 포탈글과 방금 크롤링한 글 중 가장 이른 글을 비교 if not new_articles: + print('no new articles') return + earliest_new_article = new_articles[-1] is_same_day = ( last_portal_article_in_db.created_at.date() @@ -310,11 +312,20 @@ def _get_board_today(page_num): new_articles.pop() created_articles = Article.objects.bulk_create(new_articles) + + for article in created_articles: + portal_view_count = PortalViewCount( + article=article, + view_count=article.latest_portal_view_count, + ) + new_portal_view_counts.append(portal_view_count) + PortalViewCount.objects.bulk_create(new_portal_view_counts) for i in range(len(created_articles)): print(f"crawled article: {created_articles[i].title}") + print(f"created {len(created_articles)} articles") def list_contains_article(articles, article_info): for a in articles: @@ -353,9 +364,7 @@ def _get_board(page_num): with transaction.atomic(): for link in tqdm(links): - board_id = link.split("/")[-2] - num = link.split("/")[-1] - full_link = f"{BASE_URL}/board/read.brd?cmd=READ&boardId={board_id}&bltnNo={num}&lang_knd=ko" + full_link = _list_link_to_full_link(link) info = _get_portal_article(full_link, session) user_exist = UserProfile.objects.filter( @@ -400,11 +409,92 @@ def _get_board(page_num): else: break - def crawl_view(): - # TODO: update all portal_view_count of portal articles - # from a week ago until now - pass + ''' + update all portal_view_count of portal articles + from a week ago until now + ''' + print(f"crawl_view running on {timezone.datetime.today().date()}") + + week_ago = timezone.get_current_timezone().localize(datetime.today() - timedelta(days=7)) + + 
session = _login_kaist_portal() + + def _get_board_week(page_num): + board_req = session.get( + f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page={page_num}&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC", + cookies=COOKIES, + ) + soup = bs(board_req.text, "lxml") + table = soup.select('.req_tbl_01')[0] + info_list_per_page = [] + + for row in table.find('tbody').find_all('tr'): + cells = row.find_all("td") + created_at_str = cells[4].text.strip() + created_at = timezone.get_current_timezone().localize( + datetime.strptime(created_at_str, "%Y.%m.%d") + ) + + if week_ago > created_at: + return info_list_per_page, True # stop + + info = { + "title": cells[0].text.strip(), + "view_count": int(cells[3].text.strip()), + "link": cells[0].find('a').attrs["href"], + "created_at": created_at, + } + + info_list_per_page.append(info) + + return info_list_per_page, False + + info_list = [] + page_num = 1 + + while True: + info_list_per_page, stop = _get_board_week(page_num) + info_list.extend(info_list_per_page) + if stop: + break + + page_num += 1 + + if len(info_list) == 0: + print("no portal notice in a week") + return + + articles = Article.objects.filter(created_at__gte=week_ago, parent_board_id=1) + article_dict = dict() + + for a in articles: + article_dict[a.url] = a + + new_portal_view_counts = [] + updated_articles = [] + + for info in info_list: + full_link = _list_link_to_full_link(info['link']) + + if full_link not in article_dict.keys(): + continue + + article = article_dict[full_link] + + portal_view_count = PortalViewCount( + article=article, + view_count=info["view_count"], + ) + + new_portal_view_counts.append(portal_view_count) + + article.latest_portal_view_count = info["view_count"] + updated_articles.append(article) + + Article.objects.bulk_update(updated_articles, ["latest_portal_view_count"]) + PortalViewCount.objects.bulk_create(new_portal_view_counts) + print(f"crawled view count of 
{len(new_portal_view_counts)} portal notices") if __name__ == "__main__": _login_kaist_portal() diff --git a/apps/core/migrations/0045_article_add_latest_portal_view_count.py b/apps/core/migrations/0045_article_add_latest_portal_view_count.py new file mode 100644 index 00000000..540c541a --- /dev/null +++ b/apps/core/migrations/0045_article_add_latest_portal_view_count.py @@ -0,0 +1,22 @@ +# Generated by Django 3.2.16 on 2023-02-08 13:31 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0044_portalviewcount'), + ] + + operations = [ + migrations.AlterModelOptions( + name='portalviewcount', + options={'ordering': ('-created_at',)}, + ), + migrations.AddField( + model_name='article', + name='latest_portal_view_count', + field=models.IntegerField(null=True, verbose_name='포탈조회수'), + ), + ] diff --git a/apps/core/models/article.py b/apps/core/models/article.py index 4525bc4e..8a55629f 100644 --- a/apps/core/models/article.py +++ b/apps/core/models/article.py @@ -133,6 +133,17 @@ class Article(MetaDataModel): verbose_name="포탈 링크", max_length=200, blank=True, + default=None, + verbose_name="링크", + # TODO: length 제한 후 index 걸기 + ) + + latest_portal_view_count = models.IntegerField( + null=True, + verbose_name="포탈조회수", + ) + + content_updated_at = models.DateTimeField( null=True, default=None, ) diff --git a/apps/core/models/portal_view_count.py b/apps/core/models/portal_view_count.py index 634d2b18..eefd025d 100644 --- a/apps/core/models/portal_view_count.py +++ b/apps/core/models/portal_view_count.py @@ -1,11 +1,8 @@ from django.db import models from ara.db.models import MetaDataModel - +from django.utils import timezone class PortalViewCount(MetaDataModel): - class Meta(MetaDataModel.Meta): - verbose_name = "포탈 조회 기록" - article = models.ForeignKey( on_delete=models.CASCADE, to="core.Article", @@ -17,3 +14,10 @@ class Meta(MetaDataModel.Meta): default=0, verbose_name="조회수 값", ) + + + created_at = 
models.DateTimeField( + default=timezone.now, + db_index=True, + verbose_name="생성 시간", + ) From 609f82d1b4f9c22f0f5e93065385fac3c375e57a Mon Sep 17 00:00:00 2001 From: retroinspect Date: Wed, 8 Feb 2023 14:43:45 +0000 Subject: [PATCH 04/16] Fix formatting --- .env.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env.example b/.env.example index 0ded02ac..4262afbe 100644 --- a/.env.example +++ b/.env.example @@ -12,4 +12,4 @@ PORTAL_2FA_KEY=/[2-7A-Z]{16}/ DOCKERHUB_USERNAME= DOCKERHUB_PASSWORD= SENTRY_DSN= -PORTAL_JSESSIONID= \ No newline at end of file +PORTAL_JSESSIONID= From 146158f49f5253df1a628c384dd9d5078e811f2d Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 16 Mar 2023 14:32:18 +0000 Subject: [PATCH 05/16] Fix model according to review --- .../migrations/0046_auto_20230316_2303.py | 29 +++++++++++++++++++ apps/core/models/article.py | 13 +++++---- apps/core/models/portal_view_count.py | 3 +- 3 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 apps/core/migrations/0046_auto_20230316_2303.py diff --git a/apps/core/migrations/0046_auto_20230316_2303.py b/apps/core/migrations/0046_auto_20230316_2303.py new file mode 100644 index 00000000..08605876 --- /dev/null +++ b/apps/core/migrations/0046_auto_20230316_2303.py @@ -0,0 +1,29 @@ +# Generated by Django 3.2.16 on 2023-03-16 14:03 + +from django.db import migrations, models +import django.utils.timezone + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0045_article_add_latest_portal_view_count'), + ] + + operations = [ + migrations.AlterField( + model_name='article', + name='latest_portal_view_count', + field=models.PositiveIntegerField(blank=True, default=None, null=True, verbose_name='포탈조회수'), + ), + migrations.AlterField( + model_name='article', + name='url', + field=models.URLField(blank=True, db_index=True, default=None, max_length=256, null=True, verbose_name='링크'), + ), + migrations.AlterField( + model_name='portalviewcount', + 
name='created_at', + field=models.DateTimeField(default=django.utils.timezone.now, verbose_name='생성 시간'), + ), + ] diff --git a/apps/core/models/article.py b/apps/core/models/article.py index 8a55629f..f2928574 100644 --- a/apps/core/models/article.py +++ b/apps/core/models/article.py @@ -129,17 +129,20 @@ class Article(MetaDataModel): null=True, default=None, ) + url = models.URLField( - verbose_name="포탈 링크", - max_length=200, + null=True, blank=True, default=None, - verbose_name="링크", - # TODO: length 제한 후 index 걸기 + verbose_name="포탈 링크", + max_length=256, + db_index=True, ) - latest_portal_view_count = models.IntegerField( + latest_portal_view_count = models.PositiveIntegerField( null=True, + blank=True, + default=None, verbose_name="포탈조회수", ) diff --git a/apps/core/models/portal_view_count.py b/apps/core/models/portal_view_count.py index eefd025d..7360bfdb 100644 --- a/apps/core/models/portal_view_count.py +++ b/apps/core/models/portal_view_count.py @@ -2,6 +2,7 @@ from ara.db.models import MetaDataModel from django.utils import timezone + class PortalViewCount(MetaDataModel): article = models.ForeignKey( on_delete=models.CASCADE, @@ -15,9 +16,7 @@ class PortalViewCount(MetaDataModel): verbose_name="조회수 값", ) - created_at = models.DateTimeField( default=timezone.now, - db_index=True, verbose_name="생성 시간", ) From 5a41d207cbe2bfe3bd7af9cc3c82d379453abdb6 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 16 Mar 2023 14:47:29 +0000 Subject: [PATCH 06/16] Fix to use logger to log --- .../core/management/scripts/portal_crawler.py | 59 ++++++++++--------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 84f65955..e372c5c2 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -22,6 +22,8 @@ PORTAL_JSESSIONID, PORTAL_PASSWORD, ) +from ara.log import log + LOGIN_INFO_SSO2 = { "userid": 
PORTAL_ID, @@ -49,7 +51,7 @@ def _login_kaist_portal(): f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page=1&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC", cookies=COOKIES, ) - print("_login_kaist_portal status code: ", response.status_code) + log.info("_login_kaist_portal status code: %s", response.status_code) return session @@ -133,10 +135,7 @@ def _save_portal_image(html, session): ) created_at_view_count_str = ( - soup.find("th", text="작성일(조회수)") - .findNext("td") - .contents[0] - .strip() + soup.find("th", text="작성일(조회수)").findNext("td").contents[0].strip() ) created_at_str = created_at_view_count_str.split("(")[0] @@ -187,8 +186,7 @@ def crawl_hour(day=None): # parameter에서 default로 바로 today()하면, 캐싱되어서 업데이트가 안됨 if day is None: day = timezone.datetime.today().date() - - print(f"crawl_hour running for day {day}") + logging.log(f"crawl_hour running for day {day}") session = _login_kaist_portal() @@ -204,9 +202,9 @@ def _get_board_today(page_num): dates = soup.select("table > tbody > tr > td:nth-child(5)") if links: - print("------- portal login success!") + log.info("------- portal login success!") else: - print("------- portal login failed!") + log.info("------- portal login failed!") today_date = str(day).replace("-", ".") for link, date in zip(links, dates): @@ -295,9 +293,9 @@ def _get_board_today(page_num): # DB의 마지막 포탈글과 방금 크롤링한 글 중 가장 이른 글을 비교 if not new_articles: - print('no new articles') + log.info("no new articles") return - + earliest_new_article = new_articles[-1] is_same_day = ( last_portal_article_in_db.created_at.date() @@ -323,9 +321,10 @@ def _get_board_today(page_num): PortalViewCount.objects.bulk_create(new_portal_view_counts) for i in range(len(created_articles)): - print(f"crawled article: {created_articles[i].title}") + log.info(f"crawled article: {created_articles[i].title}") + + log.info(f"created {len(created_articles)} articles") - print(f"created 
{len(created_articles)} articles") def list_contains_article(articles, article_info): for a in articles: @@ -356,7 +355,7 @@ def _get_board(page_num): page_num = 1 while True: - print("page_num:", page_num) + log.info("page_num:", page_num) links = [] link = _get_board(page_num) if link: @@ -364,7 +363,7 @@ def _get_board(page_num): with transaction.atomic(): for link in tqdm(links): - full_link = _list_link_to_full_link(link) + full_link = _list_link_to_full_link(link) info = _get_portal_article(full_link, session) user_exist = UserProfile.objects.filter( @@ -396,27 +395,30 @@ def _get_board(page_num): a.created_at = info["created_at"] a.save() - print(info["view_count"]) + log.info(info["view_count"]) PortalViewCount.objects.update_or_create( article=a, view_count=info["view_count"], ) - page_num += 1 else: break + def crawl_view(): - ''' + """ update all portal_view_count of portal articles from a week ago until now - ''' - print(f"crawl_view running on {timezone.datetime.today().date()}") + """ + now = timezone.datetime.today().date() + log.info(f"crawl_view running on {now}") - week_ago = timezone.get_current_timezone().localize(datetime.today() - timedelta(days=7)) + week_ago = timezone.get_current_timezone().localize( + datetime.today() - timedelta(days=7) + ) session = _login_kaist_portal() @@ -426,10 +428,10 @@ def _get_board_week(page_num): cookies=COOKIES, ) soup = bs(board_req.text, "lxml") - table = soup.select('.req_tbl_01')[0] + table = soup.select(".req_tbl_01")[0] info_list_per_page = [] - for row in table.find('tbody').find_all('tr'): + for row in table.find("tbody").find_all("tr"): cells = row.find_all("td") created_at_str = cells[4].text.strip() created_at = timezone.get_current_timezone().localize( @@ -437,12 +439,12 @@ def _get_board_week(page_num): ) if week_ago > created_at: - return info_list_per_page, True # stop + return info_list_per_page, True # stop info = { "title": cells[0].text.strip(), "view_count": int(cells[3].text.strip()), - 
"link": cells[0].find('a').attrs["href"], + "link": cells[0].find("a").attrs["href"], "created_at": created_at, } @@ -462,7 +464,7 @@ def _get_board_week(page_num): page_num += 1 if len(info_list) == 0: - print("no portal notice in a week") + log.info("no portal notice in a week") return articles = Article.objects.filter(created_at__gte=week_ago, parent_board_id=1) @@ -475,7 +477,7 @@ def _get_board_week(page_num): updated_articles = [] for info in info_list: - full_link = _list_link_to_full_link(info['link']) + full_link = _list_link_to_full_link(info["link"]) if full_link not in article_dict.keys(): continue @@ -494,7 +496,8 @@ def _get_board_week(page_num): Article.objects.bulk_update(updated_articles, ["latest_portal_view_count"]) PortalViewCount.objects.bulk_create(new_portal_view_counts) - print(f"crawled view count of {len(new_portal_view_counts)} portal notices") + log.info(f"crawled view count of {len(new_portal_view_counts)} portal notices") + if __name__ == "__main__": _login_kaist_portal() From ad820b255a0cf5b45f1c6cf0a2c8d7c674b42bfe Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 16 Mar 2023 15:01:41 +0000 Subject: [PATCH 07/16] Apply misc. 
code review --- apps/core/management/scripts/portal_crawler.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index e372c5c2..d3a91030 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -43,7 +43,7 @@ BASE_URL = "https://portal.kaist.ac.kr" KST = pytz_timezone("Asia/Seoul") - +PORTAL_NOTICE_BOARD_ID = 1 def _login_kaist_portal(): session = requests.Session() @@ -234,14 +234,13 @@ def _get_board_today(page_num): last_portal_article_in_db = ( Article.objects.filter( - parent_board_id=1, + parent_board_id=PORTAL_NOTICE_BOARD_ID, ) .order_by("-created_at") .first() ) new_articles = [] - new_portal_view_counts = [] prev_title = "" for link in links: @@ -277,7 +276,7 @@ def _get_board_today(page_num): ) article = Article( - parent_board_id=1, + parent_board_id=PORTAL_NOTICE_BOARD_ID, title=info["title"], content=info["content"], content_text=info["content_text"], @@ -311,6 +310,8 @@ def _get_board_today(page_num): created_articles = Article.objects.bulk_create(new_articles) + new_portal_view_counts = [] + for article in created_articles: portal_view_count = PortalViewCount( article=article, @@ -383,7 +384,7 @@ def _get_board(page_num): ) a, article_created = Article.objects.get_or_create( - parent_board_id=1, # 포탈공지 게시판 + parent_board_id=PORTAL_NOTICE_BOARD_ID, # 포탈공지 게시판 title=info["title"], content=info["content"], content_text=info["content_text"], @@ -467,8 +468,10 @@ def _get_board_week(page_num): log.info("no portal notice in a week") return - articles = Article.objects.filter(created_at__gte=week_ago, parent_board_id=1) - article_dict = dict() + articles = Article.objects.filter( + created_at__gte=week_ago, parent_board_id=PORTAL_NOTICE_BOARD_ID + ) + article_dict = {} for a in articles: article_dict[a.url] = a From f6eb67acebd214df242a543ed4fd43024546652f Mon Sep 17 
00:00:00 2001 From: retroinspect Date: Thu, 23 Mar 2023 14:36:01 +0000 Subject: [PATCH 08/16] Change migration file name --- ..._20230316_2303.py => 0046_alter_portal_crawl_related_field.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename apps/core/migrations/{0046_auto_20230316_2303.py => 0046_alter_portal_crawl_related_field.py} (100%) diff --git a/apps/core/migrations/0046_auto_20230316_2303.py b/apps/core/migrations/0046_alter_portal_crawl_related_field.py similarity index 100% rename from apps/core/migrations/0046_auto_20230316_2303.py rename to apps/core/migrations/0046_alter_portal_crawl_related_field.py From 64f1ee2b90521049b513b60777cf71cb4cb828b9 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 23 Mar 2023 14:41:51 +0000 Subject: [PATCH 09/16] Add command for new crawler --- apps/core/management/commands/crawl_portal_view.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 apps/core/management/commands/crawl_portal_view.py diff --git a/apps/core/management/commands/crawl_portal_view.py b/apps/core/management/commands/crawl_portal_view.py new file mode 100644 index 00000000..134c03e6 --- /dev/null +++ b/apps/core/management/commands/crawl_portal_view.py @@ -0,0 +1,10 @@ +from django.core.management import BaseCommand + +from apps.core.management.scripts.portal_crawler import crawl_view + + +class Command(BaseCommand): + help = "포탈 공지글의 조회수를 크롤링합니다" + + def handle(self, *args, **options): + crawl_view() From 0668bf77d1ad858c07a962b216829cfb55455ec0 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 23 Mar 2023 14:45:56 +0000 Subject: [PATCH 10/16] Add exception for handling new image format --- apps/core/management/scripts/portal_crawler.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index d3a91030..8740810f 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ 
b/apps/core/management/scripts/portal_crawler.py @@ -102,6 +102,10 @@ def _enable_hyperlink(s): def _get_new_url_and_save_to_s3(url, session): if url.startswith("data:") or "." in url.split("/")[-1]: # not a portal image return url + + if url.startswith("/board"): + return f"{BASE_URL}{url}" + enc = hashlib.md5() enc.update(url.encode()) hash = enc.hexdigest()[:20] @@ -118,8 +122,12 @@ def _save_portal_image(html, session): soup = bs(html, "lxml") for child in soup.find_all("img", {}): old_url = child.attrs.get("src") - new_url = _get_new_url_and_save_to_s3(old_url, session) - child["src"] = new_url + try: + new_url = _get_new_url_and_save_to_s3(old_url, session) + child["src"] = new_url + except Exception as exc: + log.info(child) + raise exc return str(soup) From 63628a8b727139f92e8c7c4a77a1fb984914acec Mon Sep 17 00:00:00 2001 From: retroinspect Date: Thu, 23 Mar 2023 14:59:11 +0000 Subject: [PATCH 11/16] Fix log handler to handle int --- ara/log/handler.py | 6 +++++- ara/settings/log.py | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ara/log/handler.py b/ara/log/handler.py index 7d09c3cf..b0a42700 100644 --- a/ara/log/handler.py +++ b/ara/log/handler.py @@ -13,7 +13,11 @@ class LogMiddlewareHandler(logging.Handler): @staticmethod def message_from_record(record): - if isinstance(record.msg, dict) or isinstance(record.msg, str): + if ( + isinstance(record.msg, dict) + or isinstance(record.msg, str) + or isinstance(record.msg, int) + ): message = {"raw": record.msg} elif isinstance(record.msg, Exception): message = ErrorLogObject.format_exception(record.msg) diff --git a/ara/settings/log.py b/ara/settings/log.py index 72b92c67..13974d73 100644 --- a/ara/settings/log.py +++ b/ara/settings/log.py @@ -33,9 +33,13 @@ }, }, "loggers": { - "default": {"handlers": ["default"], "level": "DEBUG", "propagate": True}, + "default": { + "handlers": ["default"], + "level": "DEBUG", + "propagate": True, + }, "ara_logger": { - "handlers": 
["rotating_file"], + "handlers": ["rotating_file", "console"], "level": "DEBUG", "propagate": False, }, From 35003d8b892ef44b2153fb1ca5c2a2f5cba06666 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Tue, 9 Jan 2024 14:33:43 +0000 Subject: [PATCH 12/16] chore: Merge migration script --- apps/core/migrations/0058_merge_20240109_2331.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 apps/core/migrations/0058_merge_20240109_2331.py diff --git a/apps/core/migrations/0058_merge_20240109_2331.py b/apps/core/migrations/0058_merge_20240109_2331.py new file mode 100644 index 00000000..3f5420fe --- /dev/null +++ b/apps/core/migrations/0058_merge_20240109_2331.py @@ -0,0 +1,12 @@ +# Generated by Django 4.2.7 on 2024-01-09 14:31 + +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("core", "0046_alter_portal_crawl_related_field"), + ("core", "0057_alter_article_name_type_and_more"), + ] + + operations = [] From c6a0ae69ef8adf3413289f290b29d648eef54327 Mon Sep 17 00:00:00 2001 From: retroinspect Date: Tue, 9 Jan 2024 15:22:15 +0000 Subject: [PATCH 13/16] fix: timezone for crawl_view --- apps/core/management/scripts/portal_crawler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 8740810f..31b5963c 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -1,8 +1,7 @@ import hashlib import re import uuid -from pytz import timezone as pytz_timezone -from datetime import timedelta, datetime +from datetime import datetime, timedelta import boto3 import requests @@ -11,19 +10,19 @@ from django.db import transaction from django.utils import timezone from django.utils.translation import gettext +from pytz import timezone as pytz_timezone from tqdm import tqdm from apps.core.models import Article from 
apps.core.models.portal_view_count import PortalViewCount from apps.user.models import UserProfile +from ara.log import log from ara.settings import ( AWS_S3_BUCKET_NAME, PORTAL_ID, PORTAL_JSESSIONID, PORTAL_PASSWORD, ) -from ara.log import log - LOGIN_INFO_SSO2 = { "userid": PORTAL_ID, @@ -45,6 +44,7 @@ KST = pytz_timezone("Asia/Seoul") PORTAL_NOTICE_BOARD_ID = 1 + def _login_kaist_portal(): session = requests.Session() response = session.get( @@ -425,8 +425,8 @@ def crawl_view(): now = timezone.datetime.today().date() log.info(f"crawl_view running on {now}") - week_ago = timezone.get_current_timezone().localize( - datetime.today() - timedelta(days=7) + week_ago = ( + (datetime.today() - timedelta(days=7)).astimezone(KST).astimezone(timezone.utc) ) session = _login_kaist_portal() @@ -443,8 +443,10 @@ def _get_board_week(page_num): for row in table.find("tbody").find_all("tr"): cells = row.find_all("td") created_at_str = cells[4].text.strip() - created_at = timezone.get_current_timezone().localize( + created_at = ( datetime.strptime(created_at_str, "%Y.%m.%d") + .astimezone(KST) + .astimezone(timezone.utc) ) if week_ago > created_at: From 5c4f19b5622e79e06cd8a4cb37f0631a6141f759 Mon Sep 17 00:00:00 2001 From: retroinspect <48230029+retroinspect@users.noreply.github.com> Date: Tue, 6 Feb 2024 21:13:14 +0900 Subject: [PATCH 14/16] fix: Apply suggestions from code review Co-authored-by: Giyeong Kim --- apps/core/management/scripts/portal_crawler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/core/management/scripts/portal_crawler.py b/apps/core/management/scripts/portal_crawler.py index 31b5963c..dd527a1b 100644 --- a/apps/core/management/scripts/portal_crawler.py +++ b/apps/core/management/scripts/portal_crawler.py @@ -51,7 +51,7 @@ def _login_kaist_portal(): f"{BASE_URL}/board/list.brd?boardId=today_notice&lang_knd=ko&userAgent=Chrome&isMobile=false&page=1&userAgent=Chrome&isMobile=False&sortColumn=REG_DATIM&sortMethod=DESC", 
cookies=COOKIES, ) - log.info("_login_kaist_portal status code: %s", response.status_code) + log.info(f"_login_kaist_portal status code: {response.status_code}") return session @@ -194,7 +194,7 @@ def crawl_hour(day=None): # parameter에서 default로 바로 today()하면, 캐싱되어서 업데이트가 안됨 if day is None: day = timezone.datetime.today().date() - logging.log(f"crawl_hour running for day {day}") + log.info(f"crawl_hour running for day {day}") session = _login_kaist_portal() From 7f8ddc7de398e3ca527ef426e9ac22b137687a97 Mon Sep 17 00:00:00 2001 From: DoyunShin Date: Wed, 21 Feb 2024 07:47:00 +0000 Subject: [PATCH 15/16] feat(unregister): Add unregister with email smtp --- apps/user/urls.py | 2 + apps/user/views/unregister.py | 82 +++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 apps/user/views/unregister.py diff --git a/apps/user/urls.py b/apps/user/urls.py index 9281562a..4b6852e0 100644 --- a/apps/user/urls.py +++ b/apps/user/urls.py @@ -3,9 +3,11 @@ from apps.user.views.fcmtoken import FCMTokenView from apps.user.views.me import MeView from apps.user.views.router import router +from apps.user.views.unregister import Unregister urlpatterns = [ path("api/", include(router.urls)), path("api/me", MeView.as_view(), name="me"), + path("api/unregister", Unregister.as_view(), name="unregister"), path("api/fcm_token/", FCMTokenView.as_view(), name="fcm"), ] diff --git a/apps/user/views/unregister.py b/apps/user/views/unregister.py new file mode 100644 index 00000000..a5d427d5 --- /dev/null +++ b/apps/user/views/unregister.py @@ -0,0 +1,82 @@ +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +from django.utils.decorators import method_decorator +from django.views.decorators.csrf import ensure_csrf_cookie +from rest_framework import status +from rest_framework.response import Response +from rest_framework.views import APIView + + +def create_msg( + title: str, sender_mail: str, message: str, 
receiver_mail: str +) -> MIMEMultipart: + msg = MIMEMultipart() + msg["Subject"] = title + msg["From"] = sender_mail + msg.attach(MIMEText(message, "plain")) # TODO: Use HTML instead of plain text + msg["To"] = receiver_mail + return msg + + +def smtp_send( + title: str, + message: str, + sender_mail: str, + mailing_list: list[str], + each: bool = True, +): + """ + Send email using SMTP relay gmail server. + + each True: Send email to each receiver. Receivers cannot see other receivers. + each False: Send email to all receivers. Receivers can see other receivers. + """ + allowed_mail_domain = ["@sparcs.org"] + + if not sender_mail.endswith(tuple(allowed_mail_domain)): + raise ValueError("Invalid email domain") + + smtp = smtplib.SMTP("smtp-relay.gmail.com", 587) + smtp.starttls() + # smtp.login("", "") # TODO: Use ID, PW instead of IP Address Authentication + smtp.ehlo() + + if each: + for receiver in mailing_list: + # print(f"[{mailing_list.index(receiver) + 1}/{len(mailing_list)}] Sending email to [{receiver}]") # FOR DEBUG + msg = create_msg(title, sender_mail, message, receiver) + smtp.sendmail(sender_mail, receiver, msg.as_string()) + else: + receivers = ", ".join(mailing_list) + msg = create_msg(title, sender_mail, message, receivers) + smtp.sendmail(sender_mail, mailing_list, msg.as_string()) + + smtp.quit() + + +class Unregister(APIView): + """ + Request to unregister + """ + + @method_decorator(ensure_csrf_cookie) + def get(self, request): + if not request.user.is_authenticated: + return Response(status=status.HTTP_401_UNAUTHORIZED) + message = f"\n\nuserid: {request.user.id}\nuseremail: {request.user.email}\n\n" + try: + smtp_send( + "NewAra 회원 탈퇴 요청", + message, + "new-ara@sparcs.org", + ["new-ara@sparcs.org"], + False, + ) + rtn = {"message": "탈퇴 요청이 접수되었습니다. 확인 후 처리하겠습니다."} + return Response(rtn) + except Exception as e: + print("ERROR:", e) + rtn = {"message": "탈퇴 요청 중 오류가 발생했습니다. 
관리자에게 문의해주세요."} + return Response(rtn, status=status.HTTP_500_INTERNAL_SERVER_ERROR) From e61c73982027d2b98f5f05aed1603fcfd4d38f05 Mon Sep 17 00:00:00 2001 From: yuwol Date: Wed, 21 Feb 2024 18:50:23 +0900 Subject: [PATCH 16/16] refactor(unregister): use tuple for allowed mail domains --- apps/user/views/unregister.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/user/views/unregister.py b/apps/user/views/unregister.py index a5d427d5..bf006f76 100644 --- a/apps/user/views/unregister.py +++ b/apps/user/views/unregister.py @@ -33,9 +33,9 @@ def smtp_send( each True: Send email to each receiver. Receivers cannot see other receivers. each False: Send email to all receivers. Receivers can see other receivers. """ - allowed_mail_domain = ["@sparcs.org"] + allowed_mail_domains = ("@sparcs.org",) - if not sender_mail.endswith(tuple(allowed_mail_domain)): + if not sender_mail.endswith(allowed_mail_domains): raise ValueError("Invalid email domain") smtp = smtplib.SMTP("smtp-relay.gmail.com", 587)