#!/usr/bin/env bash
#
# NAME
#
# urlgrep - print all HTTP(S) links from a URL
#
# SYNOPSIS
#
# urlgrep [--all|--chrome|--elinks|--wget] [--filter-web] <url>...
#
# DESCRIPTION
#
# urlgrep prints all the HTTP(S) links found at a given URL. To do so,
# we cannot simply run something like
#
#     curl <url> | grep <url_regex>
#
# because most links in a web page are relative and must be resolved
# against the page URL to produce absolute URLs. For that we need a
# backend. wget and elinks work as backends in many cases, but when the
# page is built dynamically with JavaScript, an actual browser has to
# do the job.
#
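# EXAMPLES
#
# Typical invocations (example.com stands in for any URL):
#
#     urlgrep https://example.com
#     urlgrep --wget --filter-web https://example.com
#     urlgrep --chrome https://example.com https://example.org
#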
# DEPENDENCIES
#
# wget(1), google-chrome(1), elinks(1)
#
fetch="fetch_all"
filter="filter_none"
while [[ "$1" =~ ^-- ]]
do
case "$1" in
--chrome|--elinks|--wget|--all)
fetch="fetch_${1#--}"
shift ;;
--filter-web)
filter="filter_web"
shift ;;
-*)
echo >&2 "${0##*/} unknown option $1"
exit 1 ;;
esac
done
tmpfile="/tmp/urlgrep.$RANDOM"
trap 'rm -rf "$tmpfile" &>/dev/null' EXIT
trap 'exit 2' HUP INT QUIT TERM
# Each fetch function must print the content of the URL given as $1 to stdout.
fetch_chrome() {
    # The below Chrome options were procured from various discussions on
    # the Internet. They're probably not all required.
    google-chrome \
        --disable-audio-output \
        --disable-dev-shm-usage \
        --disable-gpu \
        --headless \
        --incognito \
        --no-default-browser-check \
        --no-first-run \
        --no-sandbox \
        --single-process \
        --timeout=10000 \
        --virtual-time-budget=10000 \
        --dump-dom "$1" 2>/dev/null
}
fetch_elinks() {
    elinks -dump "$1"
}
fetch_wget() {
    wget \
        --convert-links \
        --execute robots=off \
        --no-config \
        --quiet \
        --retry-connrefused \
        --user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36' \
        --output-document "$tmpfile" "$1"
    cat "$tmpfile"
}
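# Run every backend and concatenate their output; duplicate links are
# removed later by the output filter.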
fetch_all() {
    fetch_chrome "$1"
    fetch_elinks "$1"
    fetch_wget "$1"
}
# Filter out URLs that point to files with common web extensions.
filter_web() {
    grep -v -E '\.((r|x|s)?html?|adp|ashx|asmx|aspx?|asx|atom|axd|ccss|cer|cgi|css|dtl|erb|hcsp|hss|hta|htc|js|jsp|less|php|pl|rjs|rss|sass|ts|woff|xml|yaws)(\?|$)' \
        | sed 's|/$||' | sort -u
}
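# Keep every URL; just sort and deduplicate.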
filter_none() {
    sort -u
}
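# For each URL argument, fetch the page with the selected backend, extract
# absolute http(s) links, and run them through the selected filter.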
for url
do
    "$fetch" "$url" \
        | grep -o -E "https?://[][[:alnum:]._~:/?#@!$&'()*+,;%=-]+" \
        | "$filter"
done