Skip to content

Commit 9292515

Browse files
authored
Merge pull request #6 from ipfs-shipyard/feat/use-dht-crawler
DHT Crawler
2 parents 6c30019 + 4bc3ab5 commit 9292515

File tree

16 files changed

+1526
-503
lines changed

16 files changed

+1526
-503
lines changed

.circleci/config.yml

Lines changed: 0 additions & 14 deletions
This file was deleted.

.github/workflows/go.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
name: Go
2+
3+
on:
4+
push:
5+
branches: [ master ]
6+
pull_request:
7+
branches: [ master ]
8+
9+
jobs:
10+
11+
build:
12+
name: Build
13+
runs-on: ubuntu-latest
14+
steps:
15+
16+
- name: Set up Go 1.x
17+
uses: actions/setup-go@v2
18+
with:
19+
go-version: ^1.15
20+
id: go
21+
22+
- name: Check out code into the Go module directory
23+
uses: actions/checkout@v2
24+
25+
- name: Get dependencies
26+
run: go get -v -t -d ./...
27+
28+
- name: Build
29+
run: CGO_ENABLED=0 go build -v ./...
30+
31+
- name: Run golangci-lint
32+
uses: actions-contrib/golangci-lint@v1
33+
env:
34+
GOROOT: ""
35+
with:
36+
args: "run"
37+
38+
- name: Test
39+
run: go test -v .

.github/workflows/package.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
name: Publish Docker image
2+
on:
3+
release:
4+
types: [published]
5+
jobs:
6+
push_to_registry:
7+
name: Push Docker image to GitHub Packages
8+
runs-on: ubuntu-latest
9+
steps:
10+
- name: Check out the repo
11+
uses: actions/checkout@v2
12+
- name: Push to GitHub Packages
13+
uses: docker/build-push-action@v1
14+
with:
15+
username: ${{ github.actor }}
16+
password: ${{ secrets.GITHUB_TOKEN }}
17+
registry: docker.pkg.github.com
18+
repository: ipfs-shipyard/ipfs-counter/ipfs-counter
19+
tags: latest
20+
tag_with_ref: true
21+
add_git_labels: true

.gitignore

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Binaries for programs and plugins
2+
*.exe
3+
*.exe~
4+
*.dll
5+
*.so
6+
*.dylib
7+
8+
# Test binary, built with `go test -c`
9+
*.test
10+
11+
# Output of the go coverage tool, specifically when used with LiteIDE
12+
*.out
13+
14+
# Dependency directories (remove the comment below to include it)
15+
# vendor/
16+
17+
*.json

Dockerfile

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
FROM golang:alpine AS builder
2+
3+
# Git is required for fetching the dependencies.
4+
# Ca-certificates is required to call HTTPS endpoints.
5+
RUN apk update && apk add --no-cache git ca-certificates && update-ca-certificates
6+
7+
# Create appuser
8+
ENV USER=appuser
9+
ENV UID=10001
10+
11+
# See https://stackoverflow.com/a/55757473/12429735RUN
12+
RUN adduser \
13+
--disabled-password \
14+
--gecos "" \
15+
--home "/nonexistent" \
16+
--shell "/sbin/nologin" \
17+
--no-create-home \
18+
--uid "${UID}" \
19+
"${USER}"
20+
21+
WORKDIR $GOPATH/src/github.com/ipfs-shipyard/ipfs-counter/
22+
23+
COPY . .
24+
RUN go mod download
25+
RUN go mod verify
26+
27+
RUN GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -ldflags="-w -s" -o /go/bin/ipfs-counter
28+
29+
# STEP 2 build a small image
30+
FROM scratch
31+
32+
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
33+
COPY --from=builder /etc/passwd /etc/passwd
34+
COPY --from=builder /etc/group /etc/group
35+
36+
COPY --from=builder /go/bin/ipfs-counter /go/bin/ipfs-counter
37+
38+
USER appuser:appuser
39+
40+
ENTRYPOINT ["/go/bin/ipfs-counter"]

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2020 Adin Schmahmann
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,11 @@ Crawls the IPFS DHT to give some metrics. Note: this project is a WIP and result
1515

1616
`go build`
1717

18-
### Usage
1918

20-
Running the application output prometheus metrics on the path `/metrics` with port 1234. Set the environment variable `IPFS_METRICS_PASSWORD` to control access to the prometheus metrics.
21-
22-
Network data is output into the `netdata` folder via levelDB
23-
24-
## Lead Maintainer
19+
## Lead Maintainers
2520

2621
[Adin Schmahmann](https://github.com/aschmahmann)
22+
[Will Scott](https://github.com/willscott)
2723

2824
## Contributing
2925

crawl.go

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"time"
6+
7+
logging "github.com/ipfs/go-log"
8+
"github.com/urfave/cli/v2"
9+
10+
"github.com/libp2p/go-libp2p"
11+
"github.com/libp2p/go-libp2p-core/crypto"
12+
"github.com/libp2p/go-libp2p-core/host"
13+
"github.com/libp2p/go-libp2p-core/peer"
14+
noise "github.com/libp2p/go-libp2p-noise"
15+
quic "github.com/libp2p/go-libp2p-quic-transport"
16+
secio "github.com/libp2p/go-libp2p-secio"
17+
tls "github.com/libp2p/go-libp2p-tls"
18+
19+
"github.com/libp2p/go-libp2p-kad-dht/crawler"
20+
"github.com/multiformats/go-multiaddr"
21+
)
22+
23+
var crawlFlags = []cli.Flag{
24+
&cli.StringFlag{
25+
Name: "output",
26+
TakesFile: true,
27+
Usage: "Output file location",
28+
Value: "crawl-output",
29+
},
30+
&cli.StringFlag{
31+
Name: "dataset",
32+
Usage: "Google biquery dataset ID for insertion",
33+
},
34+
&cli.StringFlag{
35+
Name: "table",
36+
Usage: "Google bigquery table prefix for insertion",
37+
},
38+
&cli.BoolFlag{
39+
Name: "create-tables",
40+
Usage: "To create bigquery tables if they do not exist",
41+
},
42+
&cli.StringFlag{
43+
Name: "seed-file",
44+
TakesFile: true,
45+
Usage: "Use peers from a file to seed crawling",
46+
},
47+
&cli.StringFlag{
48+
Name: "seed-table",
49+
Usage: "Use peers / multiaddrs from previous trial table to seed crawling",
50+
},
51+
&cli.DurationFlag{
52+
Name: "seed-table-duration",
53+
Usage: "when seeding from table, select date range for querying hosts",
54+
Value: 7 * 24 * time.Hour,
55+
},
56+
&cli.IntFlag{
57+
Name: "parallelism",
58+
Usage: "How many connections to open at once",
59+
Value: 1000,
60+
},
61+
&cli.DurationFlag{
62+
Name: "timeout",
63+
Usage: "How long to wait on dial attempts",
64+
Value: 5 * time.Second,
65+
},
66+
&cli.DurationFlag{
67+
Name: "crawltime",
68+
Usage: "How long to crawl for",
69+
Value: 20 * time.Hour,
70+
},
71+
&cli.BoolFlag{
72+
Name: "debug",
73+
Usage: "Print debugging messages",
74+
},
75+
}
76+
77+
func must(m multiaddr.Multiaddr, e error) multiaddr.Multiaddr {
78+
if e != nil {
79+
panic(e)
80+
}
81+
return m
82+
}
83+
84+
var bootstrapAddrs = []multiaddr.Multiaddr{
85+
must(multiaddr.NewMultiaddr("/ip4/139.178.89.189/tcp/4001/p2p/QmZa1sAxajnQjVM8WjWXoMbmPd7NsWhfKsPkErzpm9wGkp")),
86+
must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")),
87+
must(multiaddr.NewMultiaddr("/ip4/207.148.19.196/tcp/20074/p2p/12D3KooWGXBbSZ3ko3UvoekdnnSrdmuFic3XHuNKvGcZyrH1mVxr")),
88+
must(multiaddr.NewMultiaddr("/ip4/18.185.241.99/tcp/20001/p2p/12D3KooWA4NVc1GytssyhxGqaT22kJ9XwdhCpS2VwNPPMw59Ctf4")),
89+
must(multiaddr.NewMultiaddr("/ip4/64.225.116.25/tcp/30017/p2p/12D3KooWHHVPRYiXuWsVmATm8nduX7dXXpw3kC5Co1QSUYVLNXZN")),
90+
91+
must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/ipfs/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")), // mars.i.ipfs.io
92+
must(multiaddr.NewMultiaddr("/ip4/104.236.179.241/tcp/4001/ipfs/QmSoLPppuBtQSGwKDZT2M73ULpjvfd3aZ6ha4oFGL1KrGM")), // pluto.i.ipfs.io
93+
must(multiaddr.NewMultiaddr("/ip4/128.199.219.111/tcp/4001/ipfs/QmSoLSafTMBsPKadTEgaXctDQVcqN88CNLHXMkTNwMKPnu")), // saturn.i.ipfs.io
94+
must(multiaddr.NewMultiaddr("/ip4/104.236.76.40/tcp/4001/ipfs/QmSoLV4Bbm51jM9C4gDYZQ9Cy3U6aXMJDAbzgu2fzaDs64")), // venus.i.ipfs.io
95+
must(multiaddr.NewMultiaddr("/ip4/178.62.158.247/tcp/4001/ipfs/QmSoLer265NRgSp2LA3dPaeykiS1J6DifTC88f5uVQKNAd")), // earth.i.ipfs.io
96+
must(multiaddr.NewMultiaddr("/ip4/104.236.151.122/tcp/4001/ipfs/QmSoLju6m7xTh3DuokvT3886QRYqxAzb1kShaanJgW36yx")),
97+
must(multiaddr.NewMultiaddr("/ip4/188.40.114.11/tcp/4001/ipfs/QmZY7MtK8ZbG1suwrxc7xEYZ2hQLf1dAWPRHhjxC8rjq8E")),
98+
must(multiaddr.NewMultiaddr("/ip4/5.9.59.34/tcp/4001/ipfs/QmRv1GNseNP1krEwHDjaQMeQVJy41879QcDwpJVhY8SWve")),
99+
}
100+
101+
func makeHost(c *cli.Context, r *Recorder) (host.Host, error) {
102+
crypto.MinRsaKeyBits = 512
103+
104+
h, err := libp2p.New(c.Context,
105+
libp2p.ConnectionGater(r),
106+
libp2p.ListenAddrStrings("/ip4/0.0.0.0/tcp/4001"),
107+
libp2p.Transport(quic.NewTransport),
108+
libp2p.DefaultTransports,
109+
// libp2p.Transport(tcp.NewTCPTransport),
110+
// libp2p.Transport(ws.New),
111+
libp2p.Security(tls.ID, tls.New),
112+
libp2p.Security(noise.ID, noise.New),
113+
libp2p.Security(secio.ID, secio.New),
114+
)
115+
if err != nil {
116+
return nil, err
117+
}
118+
if err := r.setHost(h); err != nil {
119+
return nil, err
120+
}
121+
122+
return h, nil
123+
}
124+
125+
func crawl(c *cli.Context) error {
126+
ll := "info"
127+
if c.Bool("debug") {
128+
ll = "debug"
129+
}
130+
logger := logging.Logger("dht-crawler")
131+
if err := logging.SetLogLevel("dht-crawler", ll); err != nil {
132+
return err
133+
}
134+
135+
ctx := c.Context
136+
137+
r, err := NewRecorder(c)
138+
if err != nil {
139+
return err
140+
}
141+
142+
host, err := makeHost(c, r)
143+
if err != nil {
144+
return err
145+
}
146+
147+
pending := newMAList()
148+
149+
if c.IsSet("seed-file") {
150+
ok, err := pending.AddFile(c.String("seed-file"))
151+
if !ok {
152+
return err
153+
} else if err != nil {
154+
logger.Warnf("Some multiaddrs could not be parsed: %v", err)
155+
}
156+
} else if c.IsSet("seed-table") {
157+
addrs, err := r.getMultiAddrs(ctx, c.String("dataset"), c.String("seed-table"), c.Duration("seed-table-duration"))
158+
if err != nil {
159+
return err
160+
}
161+
ok, err := pending.AddStrings(addrs)
162+
if !ok {
163+
return err
164+
} else if err != nil {
165+
logger.Warnf("Some multiaddrs could not be parsed: %v", err)
166+
}
167+
}
168+
169+
for _, ma := range bootstrapAddrs {
170+
if err := pending.Add(ma); err != nil {
171+
logger.Warnf("Unable to parse address %s: %w", ma, err)
172+
continue
173+
}
174+
}
175+
logger.Infof("Seeding crawl with %d peer addresses", len(pending))
176+
177+
// populate host info
178+
peers := make([]*peer.AddrInfo, 0, len(pending))
179+
for _, p := range pending {
180+
pis, err := peer.AddrInfosFromP2pAddrs(p.Addrs...)
181+
if err != nil {
182+
logger.Warnf("Failed to parse addresses for %s: %w", p.ID, err)
183+
continue
184+
}
185+
for _, pi := range pis {
186+
peers = append(peers, &pi)
187+
}
188+
189+
nonIDAddrs := make([]multiaddr.Multiaddr, 0, len(p.Addrs))
190+
// Remove the /p2p/<id> portion of the addresses.
191+
for _, a := range p.Addrs {
192+
na, _ := multiaddr.SplitFunc(a, func(c multiaddr.Component) bool {
193+
return c.Protocol().Code == multiaddr.P_P2P
194+
})
195+
nonIDAddrs = append(nonIDAddrs, na)
196+
}
197+
host.Peerstore().AddAddrs(p.ID, nonIDAddrs, time.Hour)
198+
}
199+
200+
crawl, err := crawler.New(host,
201+
crawler.WithParallelism(c.Int("parallelism")),
202+
crawler.WithMsgTimeout(c.Duration("timeout")))
203+
if err != nil {
204+
panic(err)
205+
}
206+
207+
// TODO: configure timeout.
208+
short, c2 := context.WithTimeout(ctx, c.Duration("crawltime"))
209+
defer c2()
210+
crawl.Run(short, peers,
211+
r.onPeerSuccess,
212+
r.onPeerFailure)
213+
214+
logger.Info("Crawl complete. Collecting Output...")
215+
return Output(c.String("output"), r)
216+
}

0 commit comments

Comments
 (0)