# First, let's settle on some widely used terminology - remember this, it's used everywhere ;)
# JOB(=domain, =set of TASKs)
# TASK(=page, =part of JOB)
#
host: crawler-1 # for metrics
log:
  # fancy some colors? - disable if output is redirected to a file
  ansi: false
  # base-line log level
  level: info
  # my_crate=info,my_crate::my_mod=debug,[my_span]=trace
  # see https://tracing.rs/tracing_subscriber/filter/struct.EnvFilter.html
  #"[task{name=Crusty::go}]=info", "[task{name=Crusty::job_reader}]=info"
  #filter: ["[task{name=TaskProcessor::go}]=warn", "[task{name=TaskScheduler::go}]=warn", "[task{name=Crawler::go}]=warn"]
# as soon as we see SIGTERM we stop consuming new jobs from the redis queue and switch to one of the shutdown states:
# graceful - we have time to spare
# 1) start counting down graceful_timeout
# 2) WAIT for a second SIGTERM or for graceful_timeout to elapse - either switches us to immediate shutdown mode
# 3) WAIT until all jobs are processed and everything is persisted to the DB, THEN exit normally
# immediate - we need to exit ASAP
# 1) stop all crawling activity, no matter its state
# 2) WAIT until everything is persisted to the DB, THEN exit normally
shutdown:
  # setting graceful_timeout to 0 basically means going straight to immediate shutdown mode
  graceful_timeout: 5s
# Redis settings (queue)
queue:
  redis:
    hosts: ["redis://172.16.2.1:6379/", "redis://172.16.2.2:6379/"]
  jobs:
    # min shard number we have access to (the so-called scoped shards)
    shard_min: 0
    # max shard number we have access to (exclusive) - with shard_min: 0 and shard_max: 2 the effective scoped shards are 0 and 1
    shard_max: 2
    # total number of shards; in a multi-node setup shard_total >= shard_max - shard_min
    shard_total: 2
    # scoped shards - shards we dequeue jobs from and update job status at
    # total shards - shards we insert into, always 0..shard_total
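    # For illustration (a hypothetical two-node layout, not part of this config):
    # node A: shard_min: 0, shard_max: 2, shard_total: 4 -> dequeues from shards 0 and 1
    # node B: shard_min: 2, shard_max: 4, shard_total: 4 -> dequeues from shards 2 and 3
    # both nodes insert newly discovered jobs into the full range 0..shard_total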
    # we resolve the IP of every discovered domain and calculate an addr_key:
    # 1. take both IPv4 and IPv6 addresses
    # 2. sort each list lexicographically (separately)
    # 3. concatenate the resulting lists, IPv4 always going first
    # 4. take the first IP and apply the appropriate mask
    # 5. addr_key = ip & mask (the mask is derived from addr_key_v4_mask / addr_key_v6_mask)
    # addr_key is then used in shard calculation, and we never select more than domain_top_n domains from a given addr_key -
    # this ensures we are being polite to websites with different domains hosted on the same IP (or subnet, depending on masking)
    # for now we use either ipv4 or ipv6 and assume the given addresses are always valid and available
    addr_key_v4_mask: 24 # read as /24, meaning the first 24 bits are significant while the last 8 are masked off
    addr_key_v6_mask: 24 # separate mask for ipv6
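    # For illustration (a worked example using a documentation IP):
    # with addr_key_v4_mask: 24 the netmask is 255.255.255.0, so for 203.0.113.57
    # addr_key = 203.0.113.57 & 255.255.255.0 = 203.0.113.0 -
    # all domains resolving into 203.0.113.0/24 share one addr_key and fall under the same domain_top_n cap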
    enqueue:
      options:
        # each added job has a TTL - we don't want jobs hanging in redis memory forever...
        # the nature of broad web crawling is that almost everything significant will be discovered eventually,
        # so dropping a job here and there is no biggie
        ttl: 10m
      driver: # those are relabuf settings, see https://github.com/let4be/relabuf
        soft_cap: 500 # release when we've buffered soft_cap items or more
        hard_cap: 1000 # never buffer more than hard_cap items, slow the producer down once hard_cap is reached
        release_after: 1s # release when release_after has passed since the last successful release (or since start)
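      # For illustration (how these three knobs interact, restating the rules above):
      # - 600 jobs buffered within 200ms -> released at once (soft_cap exceeded)
      # - 100 jobs buffered, then the stream goes quiet -> released ~1s later (release_after)
      # - redis is slow and the buffer hits 1000 -> the producer is slowed down (hard_cap)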
    finish:
      options:
        ttl: 10m
        # initial capacity of the in-history bloom filter - keep this big enough,
        # scaling the bloom filter is relatively expensive and will affect the performance of all future computations against it
        bf_initial_capacity: 10000000
        # a bf_error_rate fraction of discovered domains will be falsely treated as already seen... the price of a predictable memory footprint && efficiency
        bf_error_rate: 0.001
        # when the filter is at capacity, expand it by multiplying the current capacity by this factor
        bf_expansion_factor: 2
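        # For illustration (standard bloom filter sizing math - an assumption about the implementation, not a spec):
        # bits needed m = -n*ln(p)/(ln 2)^2; with n = 10,000,000 and p = 0.001
        # m ≈ 10,000,000 * 6.9 / 0.48 ≈ 1.4e8 bits ≈ 17 MiB for the initial filter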
      driver:
        soft_cap: 500
        hard_cap: 1000
        release_after: 1s
    dequeue:
      options:
        # dequeue up to limit jobs at once
        limit: 10000
        # dequeued jobs are put into an in-processing list with a TTL, in case of a sudden Crusty node explosion -
        # we do NOT want jobs getting stuck in the "in processing" state forever!
        # keep this strictly greater than default_crawling_settings.job_hard_timeout
        ttl: 10m
        # do not emit job permits more frequently than this
        emit_permit_delay: 1s
      # we reuse the same relabuf functionality when looking for new jobs -
      # the basic concept: we regularly emit so-called job look permits, and the usual relabuf rules apply for releasing those permits
      driver:
        # look for new jobs when the internal buffer gets empty AND we have at least 1 outstanding permit
        soft_cap: 1
        # do not keep more than hard_cap permits (backs the permit producer off)
        hard_cap: 2
        # look for new jobs also by time (if a permit is available)
        release_after: 1s
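      # For illustration (a sketch of the permit flow under the settings above - an interpretation, not a spec):
      # t=0s: a permit is emitted; the buffer is empty, so the permit is released -> we ask a shard for up to `limit` jobs
      # t=1s: the next permit is emitted (emit_permit_delay); with 2 permits already outstanding (hard_cap) the producer backs off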
    # select new jobs (domains) from the sharded queue stored in redis
    reader:
      # jobs are sharded - do not ask the same shard for jobs until shard_min_last_read has passed since we last asked it
      shard_min_last_read: 1s
      # initial list of seed URLs to start the broad crawl from; we additionally read seeds from the CRUSTY_SEEDS env. variable
      seeds: [ ]
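      # For illustration (a hypothetical invocation; the exact seed-list syntax may differ):
      # CRUSTY_SEEDS=https://example.com crusty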
# Clickhouse database settings (metrics)
metrics:
  clickhouse:
    url: http://172.16.1.1:8123
    username: default
    password: ""
    database: crusty
  # We monitor various internal queues and persist their status to the DB
  queue_monitor_interval: 1s
  # We persist various queue metrics
  metrics_queue:
    table_name: metrics_queue
    label: ""
    # we always try to write in bulk - buffer up to buffer_capacity items before writing
    buffer_capacity: 1000
    # while waiting for the buffer to fill, wake once in a while to check force_write_duration
    check_for_force_write_duration: 100ms
    # if force_write_duration has elapsed since the last write and we still haven't filled buffer_capacity, force the write anyway
    force_write_duration: 500ms
    concurrency: 3
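  # For illustration (the write cadence implied by the metrics_queue settings above):
  # - 1000 items buffered within 300ms -> written immediately (buffer_capacity reached)
  # - only 40 items trickle in -> the 100ms check fires repeatedly, and once 500ms has passed since the last write the 40 items are flushed anyway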
  # We persist some db metrics for further analysis
  metrics_db:
    table_name: metrics_db
    label: ""
    buffer_capacity: 1000
    check_for_force_write_duration: 100ms
    force_write_duration: 500ms
    concurrency: 3
  # We persist metrics and various meta-data for each completed TASK
  metrics_task:
    table_name: metrics_task
    label: ""
    buffer_capacity: 25000
    check_for_force_write_duration: 500ms
    force_write_duration: 5000ms
    concurrency: 3
  # We persist metrics and various meta-data for each finished JOB
  metrics_job:
    table_name: metrics_job
    label: ""
    buffer_capacity: 25000
    check_for_force_write_duration: 500ms
    force_write_duration: 5000ms
    concurrency: 3
  # We persist TOP-K domains (heavy hitters) from redis into clickhouse, built on top of RedisBloom's TOP-K
  topk:
    table_name: domain_topk
    label: ""
    buffer_capacity: 2500
    check_for_force_write_duration: 2500ms
    force_write_duration: 15000ms
    concurrency: 3
# topk settings
topk:
  redis:
    hosts: ["redis://172.16.3.1:6379/"]
  collect:
    # when set to true, collect info only about second-level domains (like example.com but not bla-bla.example.com)
    second_level_only: false
    options:
      name: "top-domains"
      k: 100
      width: 25000
      depth: 8
      decay: 0.92
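      # For illustration: these options look like a RedisBloom Top-K reservation, roughly equivalent to
      # TOPK.RESERVE top-domains 100 25000 8 0.92 (an assumption - the exact command is issued by the code)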
    consume_interval: 15s
    driver:
      soft_cap: 250
      hard_cap: 1000
      release_after: 10s
# crawling rules
rules:
  skip_no_follow_links: true
  # limit the number of links we may collect from a given JOB
  total_link_budget: 1000
  # limit the number of links we may collect from a given TASK
  links_per_task_budget: 100
  # do not go deeper than this level
  max_level: 25
  # should we respect robots.txt?
  robots_txt: true
  # after max_redirect redirects we stop trying to load a given page
  max_redirect: 5
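  # For illustration (how the two budgets interact under the settings above):
  # a page yielding 250 links contributes at most 100 of them (links_per_task_budget);
  # once a job has collected 1000 links in total (total_link_budget), no further links are collected for that domain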
# those settings relate to a crawler running on one particular domain
default_crawling_settings:
  # load up to N pages concurrently, keep this number low to avoid excess stress
  concurrency: 2
  internal_read_buffer_size: 32kib
  max_response_size: 2mib
  # 1s-5s is a safe bet to avoid putting extra stress on sites
  delay: 1s
  # vary the delay by this jitter (0..delay_jitter)
  delay_jitter: 1s
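  # For illustration (the assumed effective delay, given the two settings above):
  # effective delay = delay + rand(0..delay_jitter), i.e. between 1s and 2s per request here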
  # timeout for status loading (headers only)
  status_timeout: 5s
  # timeout for page loading and buffering (data)
  load_timeout: 10s
  # after the soft timeout elapses we no longer queue new tasks for the domain
  job_soft_timeout: 180s
  # after the hard timeout elapses we forcibly stop the crawling job for this domain
  job_hard_timeout: 300s
  # killing a lot of jobs at the same time may hurt performance on highly loaded setups, so we add a PRNG jitter
  job_hard_timeout_jitter: 60s
  # optional user-agent string, supports dynamic replacement of some variables
  user_agent: "crusty/{GIT_SHA}"
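  # For illustration: {GIT_SHA} is substituted at runtime, yielding something like "crusty/1a2b3c4" (hypothetical value)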
  compression: true
  # custom headers are supported
  custom_headers:
    accept:
      - "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
# various networking settings
networking:
  profile:
    # fancy local address binding for monster setups with several NICs (works around the local port limitation)
    bind_local_ipv4: []
    bind_local_ipv6: []
    # a task selects its source IP just once -
    # no matter the concurrency, requests for a particular task will always come from the same IP
    static_binding_within_the_task: true
    socket_read_buffer_size:
    socket_write_buffer_size:
    connect_timeout: 5s
  # dns resolver config && options, more are available (check the sources)
  resolver:
    config:
      name_servers:
        - socket_addr: 172.16.0.254:53
          protocol: udp
    options:
      timeout: 5s
  # optionally blacklist some nets; reserved ipv4 ranges are already blacklisted
  net_v4_blacklist: []
# domain discovery settings
domain_discovery:
  # LRU cache capacity; this cache helps ease the load on the redis queue (so we don't try to insert billions of duplicate records),
  # but since the cache is local, its effectiveness will drop as new Crusty nodes are added -
  # worry not though, the redis queue is so fast (and pre-sharded) that we could run a couple hundred Crusty nodes with almost no additional setup
  cap: 25000000
# resolver settings
# leave empty for auto-conf
resolver:
  # number of concurrent green threads for name resolution (be mindful of your DNS server's capacity) -
  # configure this carefully, a low setting will lead to job starvation (inability to satisfy the requested concurrency.domain_concurrency)
  concurrency: 150
  # how we select the address to connect to (in case several are available) -
  # an address always has to be selected before a task can be queued (we need it to calculate addr_key && shard),
  # and we currently do not handle per-task errors (such as connection refused or 5xx codes) -
  # for now we simply assume those will be purged from the queue externally && rediscovered eventually (not implemented yet)
  # Disabled - ignore ipv6 addresses
  # Preferred - prefer ipv6 over ipv4
  # Fallback - use ipv6 only when ipv4 is not available
  addr_ipv6_policy: Fallback
#parser:
# we parse HTML in a separate thread pool; the stack size is configurable -
# when doing full DOM parsing keep it huge, with the LoL parser (which is what we use) the stack can be left at its default
# stack_size: 128mib
# = N of physical cores by default
# concurrency: 8
# pin N parser threads to a core
# pin: 0
# vertical scaling point (make sure to bump resolver.concurrency accordingly)
concurrency:
  # we check multiple domains concurrently; set this to saturate your hardware (CPU/network bound),
  # in the future we should add adaptive auto-tuning
  domain_concurrency: 300