secraper.bash
#!/bin/bash
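# SEcraper - search engine scraper by zerobyte.id.
# Scrapes result URLs for a query from Yahoo, Bing and Ask, printing each
# hit and appending it to secraper-result.txt.
# Usage: bash secraper.bash "QUERY HERE"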
QUERY="${1}"
FILESAVE="secraper-result.txt"
echo ' ___ ___ ';
echo ' / __| __|__ _ _ __ _ _ __ ___ _ _ ';
echo ' \__ \ _|/ _| `_/ _` | `_ \/ -_) `_| ';
echo ' |___/___\__|_| \__,_| .__/\___|_| ';
echo ' by zerobyte.id |_| V. 2020.02 ';
echo ' ------ SEARCH ENGINE SCRAPER ------ ';
echo '';
if [[ -z ${QUERY} ]]; then
echo "ERROR: Query is empty"
echo "HINT: bash $0 \"QUERY HERE\""
exit 1
fi
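# URL-encode the query with Python so it can be embedded safely in a GET parameter.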
urlencode() {
python3 -c "import sys, urllib.parse as ul; print(ul.quote_plus(sys.argv[1]))" "$1"
}
##### SEARCH.YAHOO.COM #####
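# Yahoo pages through results with the "b" offset parameter; the next
# offset is scraped from the "next" link on each result page.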
PAGE="1"
i=0
while true
do
((i++))
YAHOO_SEARCH=$(curl -sk "https://search.yahoo.com/search?p=$(urlencode "${QUERY}")&b=${PAGE}")
echo " ======= YAHOO PAGE ${i} ======="
for URLs in $(echo "${YAHOO_SEARCH}" | grep -Po '<a class=" ac-algo fz-l ac-21th lh-24" href="\K.*?(?=")')
do
echo " => ${URLs}"
echo "${URLs}" >> ${FILESAVE}
done
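# An empty "next" offset means this was the last page of results.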
PAGE=$(echo "${YAHOO_SEARCH}" | grep -Po '<a class="next" href="(.*?)b=\K.*?(?=\&)')
if [[ -z ${PAGE} ]]; then
break
fi
done
##### BING.COM #####
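# Bing pages with the "first" offset parameter; LASTPAGE keeps the offset
# just requested so the loop can also stop if the next offset stops advancing.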
PAGE="1"
i=0
while true
do
((i++))
LASTPAGE=${PAGE}
BING_SEARCH=$(curl -sk "https://www.bing.com/search?q=$(urlencode "${QUERY}")&first=${PAGE}&FORM=PORE")
PAGE=$(echo "${BING_SEARCH}" | grep -Po 'title="Next page" href="(.*?)first=\K.*?(?=\&)')
echo " ======= BING PAGE ${i} ======="
for URLs in $(echo "${BING_SEARCH}" | grep -Po '<h2><a href="\K.*?(?=")')
do
echo " => ${URLs}"
echo "${URLs}" >> ${FILESAVE}
done
if [[ -z ${PAGE} ]]; then
break
elif [[ ${PAGE} -le ${LASTPAGE} ]]; then
break
fi
done
##### ASK.COM #####
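# Ask uses a plain "page" number; the next page is read from the link next to
# the "Next" pagination element, and the loop ends when none is found.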
PAGE="1"
i=0
while true
do
((i++))
ASK_SEARCH=$(curl -sk "https://www.ask.com/web?o=0&l=dir&qo=pagination&q=$(urlencode "${QUERY}")&qsrc=998&page=${PAGE}")
PAGE=$(echo "$ASK_SEARCH" | grep -B1 '<li class="PartialWebPagination-next">Next' | grep -Po '<a href="(.*?)page=\K.*?(?=")')
if [[ -z ${PAGE} ]]; then
break
fi
echo " ======= ASK PAGE ${i} ======="
for URLs in $(echo "${ASK_SEARCH}" | grep -Po "target=\"_blank\" href='\K.*?(?=')")
do
echo " => ${URLs}"
echo "${URLs}" >> ${FILESAVE}
done
done