-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawler.flux
88 lines (75 loc) · 1.93 KB
/
crawler.flux
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Flux topology definition; the topology is named "crawler".
name: "crawler"
# Configuration files merged into the topology configuration, listed in the
# order they appear here.
# NOTE(review): per the Flux include semantics, `resource: true` means the
# file is looked up on the classpath (false = filesystem path) and
# `override: true` lets its values replace ones already set -- confirm
# against the Flux docs for the Storm version in use.
includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false
  - resource: false
    file: "crawler-conf.yaml"
    override: true
# Spouts: a single MemorySpout instance with id "spout".
spouts:
  - id: "spout"
    className: "com.digitalpebble.stormcrawler.spout.MemorySpout"
    parallelism: 1
    # constructorArgs is a list of constructor arguments. There is exactly
    # one argument here, and that argument is itself a list of URLs -- hence
    # the nested "- -" sequence below (outer item = the argument, inner
    # items = the URLs).
    constructorArgs:
      - - "http://www.lequipe.fr/"
        - "http://www.mobie01.com/"
        - "http://www.lemonde.fr/"
        - "http://www.bbc.co.uk/"
        - "http://storm.apache.org/"
        - "http://digitalpebble.com/"
# Bolts: one entry per processing component, each with parallelism 1.
# The ids declared here are the names referenced by `from:` / `to:` in the
# streams section.
bolts:
  - id: "partitioner"
    className: "com.digitalpebble.stormcrawler.bolt.URLPartitionerBolt"
    parallelism: 1
  - id: "fetcher"
    className: "com.digitalpebble.stormcrawler.bolt.FetcherBolt"
    parallelism: 1
  - id: "sitemap"
    className: "com.digitalpebble.stormcrawler.bolt.SiteMapParserBolt"
    parallelism: 1
  - id: "parse"
    className: "com.digitalpebble.stormcrawler.bolt.JSoupParserBolt"
    parallelism: 1
  - id: "index"
    className: "com.digitalpebble.stormcrawler.indexing.StdOutIndexer"
    parallelism: 1
  - id: "status"
    className: "com.digitalpebble.stormcrawler.persistence.StdOutStatusUpdater"
    parallelism: 1
# Streams: the wiring between the spout and the bolts.
streams:
  # Main pipeline:
  #   spout -> partitioner -> fetcher -> sitemap -> parse -> index
  - from: "spout"
    to: "partitioner"
    grouping:
      type: SHUFFLE
  # The only FIELDS grouping in the topology: tuples are grouped on the
  # "key" field between the partitioner and the fetcher.
  - from: "partitioner"
    to: "fetcher"
    grouping:
      type: FIELDS
      args: ["key"]
  - from: "fetcher"
    to: "sitemap"
    grouping:
      type: LOCAL_OR_SHUFFLE
  - from: "sitemap"
    to: "parse"
    grouping:
      type: LOCAL_OR_SHUFFLE
  - from: "parse"
    to: "index"
    grouping:
      type: LOCAL_OR_SHUFFLE
  # Status side-channel: fetcher, sitemap, parse and index each also feed
  # the "status" bolt, subscribing on the stream named "status"
  # (`streamId` is a property of the grouping, not of the stream entry).
  - from: "fetcher"
    to: "status"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "status"
  - from: "sitemap"
    to: "status"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "status"
  - from: "parse"
    to: "status"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "status"
  - from: "index"
    to: "status"
    grouping:
      type: LOCAL_OR_SHUFFLE
      streamId: "status"