|
| 1 | +package main |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "time" |
| 6 | + |
| 7 | + logging "github.com/ipfs/go-log" |
| 8 | + "github.com/urfave/cli/v2" |
| 9 | + |
| 10 | + "github.com/libp2p/go-libp2p" |
| 11 | + "github.com/libp2p/go-libp2p-core/crypto" |
| 12 | + "github.com/libp2p/go-libp2p-core/host" |
| 13 | + "github.com/libp2p/go-libp2p-core/peer" |
| 14 | + noise "github.com/libp2p/go-libp2p-noise" |
| 15 | + quic "github.com/libp2p/go-libp2p-quic-transport" |
| 16 | + secio "github.com/libp2p/go-libp2p-secio" |
| 17 | + tls "github.com/libp2p/go-libp2p-tls" |
| 18 | + |
| 19 | + "github.com/libp2p/go-libp2p-kad-dht/crawler" |
| 20 | + "github.com/multiformats/go-multiaddr" |
| 21 | +) |
| 22 | + |
| 23 | +var crawlFlags = []cli.Flag{ |
| 24 | + &cli.StringFlag{ |
| 25 | + Name: "output", |
| 26 | + TakesFile: true, |
| 27 | + Usage: "Output file location", |
| 28 | + Value: "crawl-output", |
| 29 | + }, |
| 30 | + &cli.StringFlag{ |
| 31 | + Name: "dataset", |
| 32 | + Usage: "Google biquery dataset ID for insertion", |
| 33 | + }, |
| 34 | + &cli.StringFlag{ |
| 35 | + Name: "table", |
| 36 | + Usage: "Google bigquery table prefix for insertion", |
| 37 | + }, |
| 38 | + &cli.BoolFlag{ |
| 39 | + Name: "create-tables", |
| 40 | + Usage: "To create bigquery tables if they do not exist", |
| 41 | + }, |
| 42 | + &cli.StringFlag{ |
| 43 | + Name: "seed-file", |
| 44 | + TakesFile: true, |
| 45 | + Usage: "Use peers from a file to seed crawling", |
| 46 | + }, |
| 47 | + &cli.StringFlag{ |
| 48 | + Name: "seed-table", |
| 49 | + Usage: "Use peers / multiaddrs from previous trial table to seed crawling", |
| 50 | + }, |
| 51 | + &cli.DurationFlag{ |
| 52 | + Name: "seed-table-duration", |
| 53 | + Usage: "when seeding from table, select date range for querying hosts", |
| 54 | + Value: 7 * 24 * time.Hour, |
| 55 | + }, |
| 56 | + &cli.IntFlag{ |
| 57 | + Name: "parallelism", |
| 58 | + Usage: "How many connections to open at once", |
| 59 | + Value: 1000, |
| 60 | + }, |
| 61 | + &cli.DurationFlag{ |
| 62 | + Name: "timeout", |
| 63 | + Usage: "How long to wait on dial attempts", |
| 64 | + Value: 5 * time.Second, |
| 65 | + }, |
| 66 | + &cli.DurationFlag{ |
| 67 | + Name: "crawltime", |
| 68 | + Usage: "How long to crawl for", |
| 69 | + Value: 20 * time.Hour, |
| 70 | + }, |
| 71 | + &cli.BoolFlag{ |
| 72 | + Name: "debug", |
| 73 | + Usage: "Print debugging messages", |
| 74 | + }, |
| 75 | +} |
| 76 | + |
| 77 | +func must(m multiaddr.Multiaddr, e error) multiaddr.Multiaddr { |
| 78 | + if e != nil { |
| 79 | + panic(e) |
| 80 | + } |
| 81 | + return m |
| 82 | +} |
| 83 | + |
// bootstrapAddrs is the hard-coded seed set used to start a crawl when no
// seed file/table is given (and is always added on top of any seeds).
// Mix of /p2p/ and legacy /ipfs/ address forms; the /ipfs/ entries are
// long-standing public IPFS bootstrappers (see trailing comments).
var bootstrapAddrs = []multiaddr.Multiaddr{
	must(multiaddr.NewMultiaddr("/ip4/139.178.89.189/tcp/4001/p2p/QmZa1sAxajnQjVM8WjWXoMbmPd7NsWhfKsPkErzpm9wGkp")),
	must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/p2p/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")),
	must(multiaddr.NewMultiaddr("/ip4/207.148.19.196/tcp/20074/p2p/12D3KooWGXBbSZ3ko3UvoekdnnSrdmuFic3XHuNKvGcZyrH1mVxr")),
	must(multiaddr.NewMultiaddr("/ip4/18.185.241.99/tcp/20001/p2p/12D3KooWA4NVc1GytssyhxGqaT22kJ9XwdhCpS2VwNPPMw59Ctf4")),
	must(multiaddr.NewMultiaddr("/ip4/64.225.116.25/tcp/30017/p2p/12D3KooWHHVPRYiXuWsVmATm8nduX7dXXpw3kC5Co1QSUYVLNXZN")),

	// NOTE(review): this entry repeats 104.131.131.82 from above under the
	// /ipfs/ form; dedup presumably happens downstream in the pending list.
	must(multiaddr.NewMultiaddr("/ip4/104.131.131.82/tcp/4001/ipfs/QmaCpDMGvV2BGHeYERUEnRQAwe3N8SzbUtfsmvsqQLuvuJ")), // mars.i.ipfs.io
	must(multiaddr.NewMultiaddr("/ip4/104.236.179.241/tcp/4001/ipfs/QmSoLPppuBtQSGwKDZT2M73ULpjvfd3aZ6ha4oFGL1KrGM")), // pluto.i.ipfs.io
	must(multiaddr.NewMultiaddr("/ip4/128.199.219.111/tcp/4001/ipfs/QmSoLSafTMBsPKadTEgaXctDQVcqN88CNLHXMkTNwMKPnu")), // saturn.i.ipfs.io
	must(multiaddr.NewMultiaddr("/ip4/104.236.76.40/tcp/4001/ipfs/QmSoLV4Bbm51jM9C4gDYZQ9Cy3U6aXMJDAbzgu2fzaDs64")),   // venus.i.ipfs.io
	must(multiaddr.NewMultiaddr("/ip4/178.62.158.247/tcp/4001/ipfs/QmSoLer265NRgSp2LA3dPaeykiS1J6DifTC88f5uVQKNAd")),  // earth.i.ipfs.io
	must(multiaddr.NewMultiaddr("/ip4/104.236.151.122/tcp/4001/ipfs/QmSoLju6m7xTh3DuokvT3886QRYqxAzb1kShaanJgW36yx")),
	must(multiaddr.NewMultiaddr("/ip4/188.40.114.11/tcp/4001/ipfs/QmZY7MtK8ZbG1suwrxc7xEYZ2hQLf1dAWPRHhjxC8rjq8E")),
	must(multiaddr.NewMultiaddr("/ip4/5.9.59.34/tcp/4001/ipfs/QmRv1GNseNP1krEwHDjaQMeQVJy41879QcDwpJVhY8SWve")),
}
| 100 | + |
// makeHost constructs the libp2p host used for crawling, wiring the
// Recorder in as a connection gater, and then hands the host back to the
// Recorder via setHost. Returns the host or any construction error.
func makeHost(c *cli.Context, r *Recorder) (host.Host, error) {
	// Lower the global minimum RSA key size so the crawler can still talk
	// to peers with weak keys — we want maximum reach, not security, here.
	// NOTE(review): this mutates package-global state in go-libp2p-core.
	crypto.MinRsaKeyBits = 512

	h, err := libp2p.New(c.Context,
		libp2p.ConnectionGater(r),
		libp2p.ListenAddrStrings("/ip4/0.0.0.0/tcp/4001"),
		libp2p.Transport(quic.NewTransport),
		libp2p.DefaultTransports,
		// libp2p.Transport(tcp.NewTCPTransport),
		// libp2p.Transport(ws.New),
		// Accept all common security protocols so we can handshake with as
		// many peers as possible. Order presumably sets negotiation
		// preference (tls, then noise, then secio) — confirm before reordering.
		libp2p.Security(tls.ID, tls.New),
		libp2p.Security(noise.ID, noise.New),
		libp2p.Security(secio.ID, secio.New),
	)
	if err != nil {
		return nil, err
	}
	// Give the Recorder a handle on the host (it needs one for recording;
	// see NewRecorder/setHost elsewhere in this package).
	if err := r.setHost(h); err != nil {
		return nil, err
	}

	return h, nil
}
| 124 | + |
| 125 | +func crawl(c *cli.Context) error { |
| 126 | + ll := "info" |
| 127 | + if c.Bool("debug") { |
| 128 | + ll = "debug" |
| 129 | + } |
| 130 | + logger := logging.Logger("dht-crawler") |
| 131 | + if err := logging.SetLogLevel("dht-crawler", ll); err != nil { |
| 132 | + return err |
| 133 | + } |
| 134 | + |
| 135 | + ctx := c.Context |
| 136 | + |
| 137 | + r, err := NewRecorder(c) |
| 138 | + if err != nil { |
| 139 | + return err |
| 140 | + } |
| 141 | + |
| 142 | + host, err := makeHost(c, r) |
| 143 | + if err != nil { |
| 144 | + return err |
| 145 | + } |
| 146 | + |
| 147 | + pending := newMAList() |
| 148 | + |
| 149 | + if c.IsSet("seed-file") { |
| 150 | + ok, err := pending.AddFile(c.String("seed-file")) |
| 151 | + if !ok { |
| 152 | + return err |
| 153 | + } else if err != nil { |
| 154 | + logger.Warnf("Some multiaddrs could not be parsed: %v", err) |
| 155 | + } |
| 156 | + } else if c.IsSet("seed-table") { |
| 157 | + addrs, err := r.getMultiAddrs(ctx, c.String("dataset"), c.String("seed-table"), c.Duration("seed-table-duration")) |
| 158 | + if err != nil { |
| 159 | + return err |
| 160 | + } |
| 161 | + ok, err := pending.AddStrings(addrs) |
| 162 | + if !ok { |
| 163 | + return err |
| 164 | + } else if err != nil { |
| 165 | + logger.Warnf("Some multiaddrs could not be parsed: %v", err) |
| 166 | + } |
| 167 | + } |
| 168 | + |
| 169 | + for _, ma := range bootstrapAddrs { |
| 170 | + if err := pending.Add(ma); err != nil { |
| 171 | + logger.Warnf("Unable to parse address %s: %w", ma, err) |
| 172 | + continue |
| 173 | + } |
| 174 | + } |
| 175 | + logger.Infof("Seeding crawl with %d peer addresses", len(pending)) |
| 176 | + |
| 177 | + // populate host info |
| 178 | + peers := make([]*peer.AddrInfo, 0, len(pending)) |
| 179 | + for _, p := range pending { |
| 180 | + pis, err := peer.AddrInfosFromP2pAddrs(p.Addrs...) |
| 181 | + if err != nil { |
| 182 | + logger.Warnf("Failed to parse addresses for %s: %w", p.ID, err) |
| 183 | + continue |
| 184 | + } |
| 185 | + for _, pi := range pis { |
| 186 | + peers = append(peers, &pi) |
| 187 | + } |
| 188 | + |
| 189 | + nonIDAddrs := make([]multiaddr.Multiaddr, 0, len(p.Addrs)) |
| 190 | + // Remove the /p2p/<id> portion of the addresses. |
| 191 | + for _, a := range p.Addrs { |
| 192 | + na, _ := multiaddr.SplitFunc(a, func(c multiaddr.Component) bool { |
| 193 | + return c.Protocol().Code == multiaddr.P_P2P |
| 194 | + }) |
| 195 | + nonIDAddrs = append(nonIDAddrs, na) |
| 196 | + } |
| 197 | + host.Peerstore().AddAddrs(p.ID, nonIDAddrs, time.Hour) |
| 198 | + } |
| 199 | + |
| 200 | + crawl, err := crawler.New(host, |
| 201 | + crawler.WithParallelism(c.Int("parallelism")), |
| 202 | + crawler.WithMsgTimeout(c.Duration("timeout"))) |
| 203 | + if err != nil { |
| 204 | + panic(err) |
| 205 | + } |
| 206 | + |
| 207 | + // TODO: configure timeout. |
| 208 | + short, c2 := context.WithTimeout(ctx, c.Duration("crawltime")) |
| 209 | + defer c2() |
| 210 | + crawl.Run(short, peers, |
| 211 | + r.onPeerSuccess, |
| 212 | + r.onPeerFailure) |
| 213 | + |
| 214 | + logger.Info("Crawl complete. Collecting Output...") |
| 215 | + return Output(c.String("output"), r) |
| 216 | +} |
0 commit comments