Skip to content

Commit

Permalink
✨ 新增基础去重;
Browse files Browse the repository at this point in the history
✨ 新增状态记录
✏️ 修改错别字
⬆️ 升级依赖包
🔥 删除多余文件
  • Loading branch information
Sitoi committed Sep 7, 2021
1 parent d9dc875 commit fd0659b
Show file tree
Hide file tree
Showing 18 changed files with 444 additions and 197 deletions.
1 change: 0 additions & 1 deletion CNAME

This file was deleted.

29 changes: 18 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,23 @@
![GitHub last commit](https://img.shields.io/github/last-commit/crawlmap/scrapy-redis-sentinel)
![PyPI - Downloads](https://img.shields.io/pypi/dw/scrapy-redis-sentinel)

本项目基于原项目 [scrpy-redis](https://github.com/rmax/scrapy-redis)
本项目基于原项目 [scrapy-redis](https://github.com/rmax/scrapy-redis)

进行修改,修改内容如下:

1. 添加了 `Redis` 哨兵连接支持
2. 添加了 `Redis` 集群连接支持
3. 添加了 `Bloomfilter` 去重

## 安装
## 安装

```bash
pip install scrapy-redis-sentinel --user
```

## 配置示例

> 原版本 scrpy-redis 的所有配置都支持, 优先级:哨兵模式 > 集群模式 > 单机模式
> 原版本 scrapy-redis 的所有配置都支持, 优先级:哨兵模式 > 集群模式 > 单机模式
```python
# ----------------------------------------Bloomfilter 配置-------------------------------------
Expand Down Expand Up @@ -55,8 +55,8 @@ REDIS_SENTINELS = [
]

# REDIS_SENTINEL_PARAMS 哨兵模式配置参数。
REDIS_SENTINEL_PARAMS= {
"service_name":"mymaster",
REDIS_SENTINEL_PARAMS = {
"service_name": "mymaster",
"password": "password",
"db": 0
}
Expand All @@ -71,18 +71,23 @@ REDIS_STARTUP_NODES = [
]

# REDIS_CLUSTER_PARAMS 集群模式配置参数
REDIS_CLUSTER_PARAMS= {
REDIS_CLUSTER_PARAMS = {
"password": "password"
}

# ----------------------------------------Scrapy 其他参数-------------------------------------

# 在 redis 中保持 scrapy-redis 用到的各个队列,从而允许暂停和暂停后恢复,也就是不清理 redis queues
SCHEDULER_PERSIST = True
SCHEDULER_PERSIST = True
# 调度队列
SCHEDULER = "scrapy_redis_sentinel.scheduler.Scheduler"
# 去重
DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis_sentinel.scheduler.Scheduler"
# 基础去重
DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RedisDupeFilter"
# BloomFilter
# DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RedisBloomFilter"

# 启用基于 Redis 统计信息
STATS_CLASS = "scrapy_redis_sentinel.stats.RedisStatsCollector"

# 指定排序爬取地址时使用的队列
# 默认按优先级排序(Scrapy 默认),由 sorted set 实现,既不是 FIFO 也不是 LIFO。
Expand All @@ -99,11 +104,12 @@ DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RFPDupeFilter"

**修改 RedisSpider 引入方式**

原版本 `scrpy-redis` 使用方式
原版本 `scrapy-redis` 使用方式

```python
from scrapy_redis.spiders import RedisSpider


class Spider(RedisSpider):
...

Expand All @@ -114,6 +120,7 @@ class Spider(RedisSpider):
```python
from scrapy_redis_sentinel.spiders import RedisSpider


class Spider(RedisSpider):
...

Expand Down
125 changes: 0 additions & 125 deletions README.rst

This file was deleted.

9 changes: 6 additions & 3 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
.PHONY: sdist upload
.PHONY: clean sdist upload

sdist:
python setup.py sdist
python setup.py sdist bdist_wheel --universal

upload:
twine upload dist/*
python setup.py upload

clean:
rm -rf build scrapy_redis_sentinel.egg-info dist
7 changes: 3 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
scrapy
redis==3.0.1
six>=1.5.2
redis-py-cluster==2.0.0
redis==3.5.3
redis-py-cluster==2.1.3
Scrapy
2 changes: 1 addition & 1 deletion scrapy_redis_sentinel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
__original_author__ = "Rolando Espinoza"
__author__ = "Shi Tao"
__email__ = "shitao0418@gmail.com"
__version__ = "0.6.8"
__version__ = "0.7.1"
1 change: 1 addition & 0 deletions scrapy_redis_sentinel/bloomfilter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
class HashMap(object):
def __init__(self, m, seed):
self.m = m
Expand Down
7 changes: 7 additions & 0 deletions scrapy_redis_sentinel/connection.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# -*- coding: utf-8 -*-
import sys

import six
from scrapy.utils.misc import load_object

Expand All @@ -8,9 +11,13 @@
"REDIS_URL": "url",
"REDIS_HOST": "host",
"REDIS_PORT": "port",
"REDIS_DB": "db",
"REDIS_ENCODING": "encoding"
}

if sys.version_info > (3,):
SETTINGS_PARAMS_MAP["REDIS_DECODE_RESPONSES"] = "decode_responses"


def get_redis_from_settings(settings):
"""Returns a redis client instance from given Scrapy settings object.
Expand Down
15 changes: 10 additions & 5 deletions scrapy_redis_sentinel/defaults.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
# -*- coding: utf-8 -*-
import redis

# For standalone use.
import rediscluster
from redis.sentinel import Sentinel

DUPEFILTER_KEY = "dupefilter:%(timestamp)s"

PIPELINE_KEY = "%(spider)s:items"

STATS_KEY = '%(spider)s:stats'

REDIS_CLS = redis.StrictRedis
REDIS_CLUSTER_CLS = rediscluster.RedisCluster
REDIS_SENTINEL_CLS = Sentinel

REDIS_ENCODING = "utf-8"
# Sane connection defaults.
REDIS_PARAMS = {
Expand All @@ -21,10 +26,10 @@
SCHEDULER_QUEUE_KEY = "%(spider)s:requests"
SCHEDULER_QUEUE_CLASS = "scrapy_redis_sentinel.queue.PriorityQueue"
SCHEDULER_DUPEFILTER_KEY = "%(spider)s:dupefilter"
SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RFPDupeFilter"
SCHEDULER_DUPEFILTER_CLASS = "scrapy_redis_sentinel.dupefilter.RedisDupeFilter"

SCHEDULER_PERSIST = False

START_URLS_KEY = "%(name)s:start_urls"
START_URLS_AS_SET = False

REDIS_CLUSTER_CLS = rediscluster.RedisCluster
REDIS_SENTINEL_CLS = Sentinel
START_URLS_AS_ZSET = False
Loading

0 comments on commit fd0659b

Please sign in to comment.