This repository has been archived by the owner on Apr 25, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
webScraper
150 lines (123 loc) · 4.65 KB
/
webScraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#Purpose: Google Image Web Scraper
#Author: Rokawoo Woof
#Created: 4/02/2023
import math
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from discord_webhook import DiscordWebhook
def askIndex():
    '''Purpose: prompts for a start and end index until both are valid
    @param: none
    @return: tuple (start, end) where start >= 0 and end > start
    '''
    # Keep asking for the start index until it is a non-negative integer.
    while True:
        try:
            start = int(input('Enter Start Index: '))
        except ValueError:
            print('Error: An integer was expected.')
            continue
        if start >= 0:
            break
        print('Error: Start index is below 0.')

    # Keep asking for the end index until it is an integer strictly
    # greater than the start index.
    while True:
        try:
            end = int(input('Enter End Index: '))
        except ValueError:
            print('Error: An integer was expected.')
            continue
        if end > start:
            return start, end
        if end == start:
            print('Error: End index is equal to start index.')
        else:
            print('Error: End index is below start index.')
def askYN(user_answer):
    '''Purpose: turns a yes/no answer into 'on'/'off', re-asking until valid
    @param: user's answer of yes or no (case and surrounding spaces ignored)
    @return: 'on' for yes/y, 'off' for no/n
    '''
    yes_words = ('yes', 'y')
    no_words = ('no', 'n')
    answer = user_answer.lower().strip()
    # Re-prompt for as long as the answer is neither a yes nor a no word.
    while answer not in yes_words and answer not in no_words:
        answer = input(
            'Please enter a valid answer (yes or no): ').lower().strip()
    return 'on' if answer in yes_words else 'off'
# Discord webhook that receives the results. This is a placeholder: replace
# it with a real webhook URL before running the script.
WEBHOOK_URL = '[PUT DISCORD WEBHOOK URL HERE]'
# File that collects the scraped image URLs; it is cleared on every run.
RESULT_FILE = 'image_urls.txt'
REQUEST_TIMEOUT = 30    # seconds to wait for the search page before failing
RATE_LIMIT_WAIT = 30    # seconds to wait after the webhook answers 429
SEND_DELAY = .225       # seconds between two image messages
PAGE_DELAY = 5          # seconds between two search requests


def _page_windows(first, last):
    '''Purpose: splits the inclusive range first..last into request windows
    @param: first index and last index of the overall range
    @return: yields (start, end) tuples, both ends inclusive, without gaps
    '''
    start = first
    # The first window ends on the next multiple of 20 above the start index.
    end = min(((first // 20) + 1) * 20, last)
    # '<=' because both ends are inclusive: the last index itself is wanted.
    while start <= last:
        yield start, end
        # Continue right after the previous window so no index is skipped
        # when the first window was shorter than a full one.
        start = end + 1
        end = min(end + 21, last)


def _build_search_url(search_term, start, end, ijn, safe_search):
    '''Purpose: builds the image search URL for one window
    @param: raw search term, window start/end, ijn value, 'on'/'off' safe search
    @return: the request URL as a string
    '''
    # quote_plus turns spaces into '+' and escapes characters such as '&'
    # or '#' that would otherwise break the query string.
    query = quote_plus(search_term.strip().lower())
    return (f'https://www.google.com/search?q={query}&tbm=isch'
            f'&start={start}&end={end}&ijn={ijn}&safe={safe_search}')


def _image_urls(img_tags, limit):
    '''Purpose: collects the src of the first images that actually have one
    @param: iterable of img tags (anything with .get('src')), max number of URLs
    @return: list of at most `limit` image URLs
    '''
    urls = []
    for img in img_tags:
        if len(urls) >= limit:
            break
        src = img.get('src')  # .get avoids a KeyError on tags without src
        if src:
            urls.append(src)
    return urls


def _post_to_webhook(content):
    '''Purpose: sends one message to the webhook, retrying while rate limited
    @param: message text
    @return: the response of the last attempt
    '''
    webhook = DiscordWebhook(url=WEBHOOK_URL, content=content)
    while True:
        response = webhook.execute()
        if response.status_code != 429:
            return response
        print(f'Rate Limited: Retrying after {RATE_LIMIT_WAIT} seconds...')
        time.sleep(RATE_LIMIT_WAIT)  # Wait a bit before retrying


def _run_search():
    '''Purpose: asks for one search, scrapes it window by window and reports
    the image URLs to the webhook and to RESULT_FILE
    @param: none
    @return: none
    '''
    search_term = input('Enter your search tag(s): ')
    safe_search = askYN(input('Keep safe search? (y/n): '))
    first_index, last_index = askIndex()
    ijn = math.ceil(last_index / 100)  # ijn 1 = 100, 2 = 200... images

    # Opening in 'w' mode clears whatever a previous run left in the file.
    with open(RESULT_FILE, 'w', encoding='utf-8'):
        pass

    result_count = 0
    for start, end in _page_windows(first_index, last_index):
        wanted = end - start + 1
        url = _build_search_url(search_term, start, end, ijn, safe_search)
        # A timeout keeps the script from hanging forever on a dead connection.
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')
        img_tags = soup.find_all('img')

        # Extract once and reuse for both the webhook and the file, and count
        # what was really found instead of what was asked for.
        image_urls = _image_urls(img_tags, wanted)
        result_count += len(image_urls)

        page_summary = f"For '{search_term}' from index {start} to {end}, there are {len(img_tags)} results. Here are {len(image_urls)} results."
        _post_to_webhook(page_summary)
        for img_url in image_urls:
            _post_to_webhook(img_url)
            # Delay before sending the next image
            time.sleep(SEND_DELAY)

        with open(RESULT_FILE, 'a', encoding='utf-8') as f:
            f.write(page_summary + '\n\n')
            f.writelines(img_url + '\n' for img_url in image_urls)
            f.write('\n\n')

        time.sleep(PAGE_DELAY)

    overall_summary = f"For '{search_term}' from overall index {first_index} to {last_index}, there were {result_count} total results. SafeSearch was {safe_search}."
    _post_to_webhook(overall_summary)
    with open(RESULT_FILE, 'a', encoding='utf-8') as f:
        f.write('\n\n' + overall_summary)


def main():
    '''Purpose: runs searches until the user declines to run again
    @param: none
    @return: none
    '''
    bool_retry = 'on'
    while bool_retry == 'on':
        _run_search()
        bool_retry = askYN(input('Do you want to run the program again? (y/n): '))


if __name__ == '__main__':
    main()