
Commit 170e0f0

synced branch with master
2 parents: b182885 + 6eb8e06

File tree

7 files changed: +191, -1 lines changed


frontera/contrib/scrapy/middlewares/seeds/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 from __future__ import absolute_import
-from scrapy.http.request import Request
 
 
 class SeedLoader(object):

requirements/tests.txt

Lines changed: 2 additions & 0 deletions
@@ -10,4 +10,6 @@ msgpack-python
 kafka-python<=0.9.5
 pytest-cov
 happybase>=1.0.0
+mock
+boto>=2.42.0
 -r logging.txt

setup.py

Lines changed: 2 additions & 0 deletions
@@ -82,6 +82,8 @@
         "tldextract>=1.5.1",
         "SQLAlchemy>=1.0.0",
         "cachetools",
+        "mock",
+        "boto>=2.42.0",
         "colorlog>=2.4.0",
         "python-json-logger>=0.1.5"
     ]

tests/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
+try:
+    import unittest.mock as mock
+except ImportError:
+    import mock
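
This shim gives the test suite a single mock name on both interpreters: unittest.mock from the standard library on Python 3, the backported mock package on Python 2. A minimal sketch of how the tests below consume it (the patch target is the connect_s3 name used by the S3 seed loader tests in this commit):

from tests import mock

# The same patching API works regardless of interpreter.
with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3') as connect:
    connect.return_value = None  # stand-in for a real S3 connection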

tests/mocks/boto.py

Lines changed: 52 additions & 0 deletions
import six


class Content(object):
    """Wraps key data so get_contents_as_string().split() yields the seed lines."""

    def __init__(self, obj):
        self.obj = obj

    def split(self):
        return self.obj


class MockKey(object):
    """Minimal stand-in for boto's Key."""

    def __init__(self, name, data):
        self.name = name
        self.content = Content(data)

    def get_contents_as_string(self, *args, **kwargs):
        return self.content


class MockBucket(object):
    """Minimal stand-in for boto's Bucket."""

    def __init__(self):
        self.keys = {}

    def list(self, prefix):
        return [key for name, key in six.iteritems(self.keys)
                if name.startswith(prefix)]

    def add_key(self, name, data):
        if name in self.keys:
            raise Exception('key: %s already exists' % name)
        self.keys[name] = MockKey(name, data)


class MockConnection(object):
    """Minimal stand-in for the connection object returned by boto.connect_s3()."""

    def __init__(self):
        self.buckets = {}

    def get_bucket(self, bucket_name):
        try:
            return self.buckets[bucket_name]
        except KeyError:
            raise Exception('Bucket: %s not found' % bucket_name)

    def create_bucket(self, name):
        if name in self.buckets:
            raise Exception('Bucket: %s already exists' % name)
        self.buckets[name] = MockBucket()
        return self.buckets[name]
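
These mocks mirror only the slice of boto's S3 API that the seed loader exercises: connect, fetch a bucket, list keys by prefix, read key contents. A quick sketch of how they fit together (the key name and data here are illustrative):

from tests.mocks.boto import MockConnection

conn = MockConnection()
bucket = conn.create_bucket('some-bucket')
bucket.add_key('seeds-folder/seeds.txt', ['https://www.example.com'])

# list() filters keys by prefix like boto's Bucket.list();
# get_contents_as_string() returns a Content wrapper whose split()
# hands back the original data.
for key in bucket.list('seeds-folder/'):
    print(key.name, key.get_contents_as_string().split())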

tests/test_seed_loader.py

Lines changed: 127 additions & 0 deletions
import os
import unittest
from shutil import rmtree
from tempfile import mkdtemp

from scrapy.spiders import Spider

from frontera.settings import Settings
from frontera.contrib.scrapy.middlewares.seeds.file import FileSeedLoader, NotConfigured
from frontera.contrib.scrapy.middlewares.seeds.s3 import S3SeedLoader

from tests.mocks.boto import MockConnection
from tests import mock


class TestFileSeedLoader(unittest.TestCase):

    def setUp(self):
        self.tmp_path = mkdtemp()

    def tearDown(self):
        rmtree(self.tmp_path)

    def seed_loader_setup(self, seeds_content=None):
        seed_path = os.path.join(self.tmp_path, 'seeds.txt')
        default_content = """
https://www.example.com
https://www.scrapy.org
"""
        seeds_content = seeds_content or default_content
        with open(seed_path, 'wb') as tmpl_file:
            tmpl_file.write(seeds_content.encode('utf-8'))
        assert os.path.isfile(seed_path)  # Failure of test itself
        settings = Settings()
        settings.SEEDS_SOURCE = seed_path
        crawler = type('crawler', (object,), {})
        crawler.settings = settings
        return FileSeedLoader(crawler)

    def test_seeds_not_configured(self):
        crawler = type('crawler', (object,), {})
        crawler.settings = Settings()
        self.assertRaises(NotConfigured, FileSeedLoader, crawler)

    def test_load_seeds(self):
        seed_loader = self.seed_loader_setup()
        seeds = seed_loader.load_seeds()
        self.assertEqual(seeds, ['https://www.example.com', 'https://www.scrapy.org'])

    def test_process_start_requests(self):
        seed_loader = self.seed_loader_setup()
        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
        self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])

    def test_process_start_requests_ignore_comments(self):
        seeds_content = """
https://www.example.com
# https://www.dmoz.org
https://www.scrapy.org
# https://www.test.com
"""
        seed_loader = self.seed_loader_setup(seeds_content)
        requests = seed_loader.process_start_requests(None, Spider(name='spider'))
        self.assertEqual([r.url for r in requests], ['https://www.example.com', 'https://www.scrapy.org'])


class TestS3SeedLoader(unittest.TestCase):

    def setUp(self):
        self.tmp_path = mkdtemp()
        settings = Settings()
        settings.SEEDS_SOURCE = 's3://some-bucket/seeds-folder'
        settings.SEEDS_AWS_ACCESS_KEY = 'access_key'
        settings.SEEDS_AWS_SECRET_ACCESS_KEY = 'secret_key'
        crawler = type('crawler', (object,), {})
        crawler.settings = settings
        self.seed_path_1 = os.path.join(self.tmp_path, 'seeds1.txt')
        self.seed_path_2 = os.path.join(self.tmp_path, 'seeds2.txt')
        s1_content = """
https://www.example.com
https://www.scrapy.org
"""
        s2_content = """
https://www.dmoz.org
https://www.test.com
"""
        with open(self.seed_path_1, 'wb') as tmpl_file:
            tmpl_file.write(s1_content.encode('utf-8'))
        with open(self.seed_path_2, 'wb') as tmpl_file:
            tmpl_file.write(s2_content.encode('utf-8'))
        self.seed_loader = S3SeedLoader(crawler)

    def tearDown(self):
        rmtree(self.tmp_path)

    def test_invalid_s3_seed_source(self):
        crawler = type('crawler', (object,), {})
        settings = Settings()
        settings.SEEDS_SOURCE = 'invalid_url'
        crawler.settings = settings
        self.assertRaises(NotConfigured, S3SeedLoader, crawler)

    def test_process_start_requests(self):
        urls = ['https://www.example.com', 'https://www.scrapy.org',
                'https://www.dmoz.org', 'https://www.test.com']
        self.check_request_urls(urls)

    def test_s3_loader_ignores_non_txt_files(self):
        urls = []
        self.check_request_urls(urls, '.ini')

    def check_request_urls(self, urls, key_extension='.txt'):
        with open(self.seed_path_1, 'rU') as s1:
            with open(self.seed_path_2, 'rU') as s2:
                conn = MockConnection()
                bucket = conn.create_bucket('some-bucket')
                bucket.add_key('seeds-folder/seeds1%s' % key_extension, s1)
                bucket.add_key('seeds-folder/seeds2%s' % key_extension, s2)

                def mocked_connect_s3(*args, **kwargs):
                    return conn

                with mock.patch('frontera.contrib.scrapy.middlewares.seeds.s3.connect_s3',
                                side_effect=mocked_connect_s3):
                    requests = self.seed_loader.process_start_requests(None, Spider(name='spider'))
                    self.assertEqual(set([r.url for r in requests]), set(urls))

tox.ini

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ envlist = py27,flake8
 skip_missing_interpreters = True
 
 [testenv]
+# do not load /etc/boto.cfg with Python 3 incompatible plugin
+# https://github.com/travis-ci/travis-ci/issues/5246#issuecomment-166460882
+setenv =
+    BOTO_CONFIG = /tmp/nowhere
 deps =
     -r{toxinidir}/requirements.txt
     -r{toxinidir}/requirements/tests.txt
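
The BOTO_CONFIG override matters when running the suite outside tox as well: boto reads /etc/boto.cfg by default, and on Travis that file references a plugin that is not Python 3 compatible. A sketch of applying the same guard directly, assuming pytest as the runner:

import os
import subprocess

# Point boto at a nonexistent config file so /etc/boto.cfg is never loaded,
# mirroring the tox setenv above.
env = dict(os.environ, BOTO_CONFIG='/tmp/nowhere')
subprocess.call(['pytest', 'tests/test_seed_loader.py'], env=env)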
