
Commit

Filter crawler requests out of page view counts
hopetree committed Jan 17, 2024
1 parent 20c8590 commit 7171988
Showing 2 changed files with 59 additions and 31 deletions.
58 changes: 40 additions & 18 deletions apps/blog/utils.py
@@ -145,6 +145,27 @@ class RedisKeys:
     feed_hub_data = 'feed.hub.data.{hour}'  # feed data
 
 
+def check_request_headers(headers_obj):
+    """
+    Validate request headers, e.g. inspect the User-Agent, so crawler requests can be filtered out.
+    @param headers_obj: the request.headers object
+    @return:
+    use: flag = check_request_headers(request.headers)
+    """
+    # Keywords found in the User-Agents of common search-engine crawlers, plus Python clients.
+    # Return False when there is no User-Agent or it contains a crawler keyword; otherwise return True.
+    user_agent_black_keys = ['spider', 'bot', 'python']
+    if not headers_obj.get('user-agent'):
+        return False
+    else:
+        user_agent = str(headers_obj.get('user-agent')).lower()
+        for key in user_agent_black_keys:
+            if key in user_agent:
+                logger.warning(f'Bot/Spider request user-agent:{user_agent}')
+                return False
+    return True
+
+
 def add_views(url, name=None, is_cache=True):
     """
     View-function decorator that counts page views for a single page.
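For reference, a minimal sketch (not part of the commit) of how the new blacklist check behaves, assuming a plain dict stands in for Django's request.headers; the helper name and the sample User-Agent strings are illustrative only:

    def is_human_request(headers):
        # Same blacklist logic as check_request_headers, without the logging.
        user_agent = str(headers.get('user-agent', '')).lower()
        if not user_agent:
            return False
        return not any(key in user_agent for key in ('spider', 'bot', 'python'))

    print(is_human_request({'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}))  # True
    print(is_human_request({'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1)'}))    # False: contains 'bot'
    print(is_human_request({'user-agent': 'python-requests/2.31.0'}))                     # False: contains 'python'
    print(is_human_request({}))                                                           # False: no User-Agent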
@@ -163,27 +184,28 @@ def wrapper(request, *args, **kwargs):
         # Only count real page visits: API calls and superuser visits are excluded.
         if request.method == "GET" and not request.is_ajax() and not request.user.is_superuser:
             # Get or create a PageView instance
-            logger.info(request.headers.items())
-            page_views = PageView.objects.filter(url=url)
-            if page_views:
-                obj = page_views.first()
-            else:
-                obj = PageView(url=url, name=name, views=0)
-                obj.save()
-
-            if is_cache:  # when caching is enabled, store the read state
-                cache_key = f'page_views:read:{url}'
-                is_read_time = request.session.get(cache_key)
-                if not is_read_time:
-                    obj.update_views()
-                    request.session[cache_key] = time.time()
-                else:
-                    t = time.time() - is_read_time
-                    if t > 60 * 30:
-                        obj.update_views()
-                        request.session[cache_key] = time.time()
-            else:
-                obj.update_views()
+            # logger.info(request.headers.items())
+            if check_request_headers(request.headers):
+                page_views = PageView.objects.filter(url=url)
+                if page_views:
+                    obj = page_views.first()
+                else:
+                    obj = PageView(url=url, name=name, views=0)
+                    obj.save()
+
+                if is_cache:  # when caching is enabled, store the read state
+                    cache_key = f'page_views:read:{url}'
+                    is_read_time = request.session.get(cache_key)
+                    if not is_read_time:
+                        obj.update_views()
+                        request.session[cache_key] = time.time()
+                    else:
+                        t = time.time() - is_read_time
+                        if t > 60 * 30:
+                            obj.update_views()
+                            request.session[cache_key] = time.time()
+                else:
+                    obj.update_views()
         # ******* view-count increment logic *******
 
         return result
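The diff does not show how the decorator is applied; a hedged sketch of typical usage, assuming a function-based view, example url/name values, and the apps.blog.utils import path:

    from django.shortcuts import render

    from apps.blog.utils import add_views

    @add_views(url='/about/', name='About')
    def about_view(request):
        # A GET by a non-superuser human increments the PageView row for '/about/';
        # requests whose User-Agent matches the blacklist are now skipped entirely.
        return render(request, 'blog/about.html')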
32 changes: 19 additions & 13 deletions apps/blog/views.py
Expand Up @@ -21,7 +21,12 @@
from markdown.extensions.toc import TocExtension # 锚点的拓展

from .models import Article, Tag, Category, Timeline, Silian, AboutBlog, FriendLink, Subject
from .utils import site_full_url, CustomHtmlFormatter, ApiResponse, ErrorApiResponse, add_views
from .utils import (site_full_url,
CustomHtmlFormatter,
ApiResponse,
ErrorApiResponse,
add_views,
check_request_headers)


# Create your views here.
@@ -84,21 +89,22 @@ def get_object(self, queryset=None):
         obj = super().get_object()
         # Time-based view counting: a second view of the same article only counts after half an hour; views by the author are ignored.
         u = self.request.user
-        ses = self.request.session
-        the_key = self.context_object_name + ':read:{}'.format(obj.id)
-        is_read_time = ses.get(the_key)
-        if u == obj.author or u.is_superuser:
-            pass
-        else:
-            if not is_read_time:
-                obj.update_views()
-                ses[the_key] = time.time()
-            else:
-                now_time = time.time()
-                t = now_time - is_read_time
-                if t > 60 * 30:
-                    obj.update_views()
-                    ses[the_key] = time.time()
+        if check_request_headers(self.request.headers):  # only count the view when the header check passes
+            ses = self.request.session
+            the_key = self.context_object_name + ':read:{}'.format(obj.id)
+            is_read_time = ses.get(the_key)
+            if u == obj.author or u.is_superuser:
+                pass
+            else:
+                if not is_read_time:
+                    obj.update_views()
+                    ses[the_key] = time.time()
+                else:
+                    now_time = time.time()
+                    t = now_time - is_read_time
+                    if t > 60 * 30:
+                        obj.update_views()
+                        ses[the_key] = time.time()
         # Use the article's update time to decide whether to fetch the rendered markdown from cache, avoiding a conversion on every request.
         ud = obj.update_date.strftime("%Y%m%d%H%M%S")
         md_key = self.context_object_name + ':markdown:{}:{}'.format(obj.id, ud)
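The same 30-minute session throttle now appears in both add_views and get_object; a minimal sketch of that shared logic as a standalone helper (the helper name and signature are hypothetical, not part of the commit):

    import time

    READ_GAP = 60 * 30  # 30 minutes, the same window used above

    def should_count(session, key, now=None):
        # Return True and record the timestamp when this view should be counted,
        # i.e. no recorded read yet or the last read is older than READ_GAP.
        now = now or time.time()
        last = session.get(key)
        if last and now - last <= READ_GAP:
            return False  # viewed recently; don't count again
        session[key] = now
        return True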
