# webScraper

#Purpose: Google Image Web Scraper
#Author: Rokawoo Woof
#Created: 4/02/2023

from bs4 import BeautifulSoup
from discord_webhook import DiscordWebhook
import requests
import time
import math


def askIndex():
  '''Purpose: prompts for a start and an end index until both are valid
  @param: none
  @return: tuple (start, end) with start >= 0 and end > start
  '''

  # Start index: re-prompt until a non-negative integer is entered
  start = None
  while start is None:
    try:
      candidate = int(input('Enter Start Index: '))
    except ValueError:
      print('Error: An integer was expected.')
      continue
    if candidate < 0:
      print('Error: Start index is below 0.')
      continue
    start = candidate

  # End index: re-prompt until an integer strictly above start is entered
  end = None
  while end is None:
    try:
      candidate = int(input('Enter End Index: '))
    except ValueError:
      print('Error: An integer was expected.')
      continue
    if candidate < start:
      print('Error: End index is below start index.')
    elif candidate == start:
      print('Error: End index is equal to start index.')
    else:
      end = candidate

  return start, end


def askYN(user_answer):
  '''Purpose: turns a yes/no answer into 'on'/'off', re-prompting until valid
  @param: user's initial answer (case and surrounding whitespace are ignored)
  @return: 'on' for yes/y, 'off' for no/n
  '''

  affirmative = ('yes', 'y')
  negative = ('no', 'n')

  answer = user_answer.lower().strip()
  # Anything that is not a recognised yes/no gets asked again
  while answer not in affirmative and answer not in negative:
    answer = input('Please enter a valid answer (yes or no): ')
    answer = answer.lower().strip()

  return 'on' if answer in affirmative else 'off'


# Discord webhook every message is posted to. This is a hard-coded placeholder:
# replace it with a real webhook URL before running the script.
webhook_url = '[PUT DISCORD WEBHOOK URL HERE]'

# File that mirrors everything sent to the webhook for the current run
output_path = 'image_urls.txt'


def _post_to_webhook(url, content):
  '''Purpose: posts one message to a Discord webhook, retrying while rate limited
  @param: webhook url, text content of the message
  @return: response of the first attempt that was not answered with HTTP 429
  '''

  webhook = DiscordWebhook(url=url, content=content)
  while True:
    response = webhook.execute()
    if response.status_code != 429:
      return response
    print('Rate Limited: Retrying after 30 seconds...')
    time.sleep(30)  # Wait a bit before retrying


bool_retry = 'on'
while bool_retry == 'on':
  # Set the tag word you want to search for
  search_term = input('Enter your search tag(s): ')
  # URL encoding is left to requests (see params= below) so that characters
  # such as '&', '#' or '+' in the tags cannot corrupt the query string.
  query = search_term.strip().lower()

  # Set safe mode
  safe_search = askYN(input('Keep safe search? (y/n): '))

  # Set search index (askIndex guarantees 0 <= first_index < last_index)
  first_index, last_index = askIndex()
  ijn = math.ceil(last_index / 100)  # ijn 1 = 100, 2 = 200... images

  # Opening in 'w' mode truncates the file left over from a previous search
  with open(output_path, 'w', encoding='utf-8'):
    pass

  # Walk the inclusive range [first_index, last_index] in chunks. The first
  # chunk ends on the next multiple of 20; later chunks span 21 indexes.
  start = first_index
  end = ((first_index // 20) + 1) * 20
  result_count = 0
  # '<=' because the range is inclusive: with '<' the last index was dropped
  # whenever it fell exactly on a chunk boundary.
  while start <= last_index:
    end = min(end, last_index)
    chunk_size = end - start + 1
    result_count += chunk_size

    # Send a GET request for this chunk; the timeout keeps a stalled
    # connection from hanging the script forever.
    response = requests.get(
      'https://www.google.com/search',
      params={
        'q': query,
        'tbm': 'isch',
        'start': start,
        'end': end,
        'ijn': ijn,
        'safe': safe_search,
      },
      timeout=30,
    )

    # Create a BeautifulSoup object from the HTML content of the response
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the image tags on the page
    img_tags = soup.find_all('img')

    # Keep at most chunk_size tags and drop those without a usable src
    # attribute (img['src'] would raise KeyError on such tags).
    img_urls = [img.get('src') for img in img_tags[:chunk_size]]
    img_urls = [img_url for img_url in img_urls if img_url]

    chunk_header = f"For '{search_term}' from index {start} to {end}, there are {len(img_tags)} results. Here are {chunk_size} results."
    _post_to_webhook(webhook_url, chunk_header)

    # Send each image URL to the Discord webhook
    for img_url in img_urls:
      _post_to_webhook(webhook_url, img_url)
      # Delay before sending the next image
      time.sleep(.225)

    # Append the same header and URLs to the output file
    with open(output_path, 'a', encoding='utf-8') as f:
      f.write(chunk_header + '\n\n')
      f.writelines(img_url + '\n' for img_url in img_urls)
      f.write('\n\n')

    # Continue right after the chunk just processed. A fixed 'start += 21'
    # skipped indexes whenever the first chunk was shorter than 21.
    start = end + 1
    end = start + 20
    time.sleep(5)

  summary = f"For '{search_term}' from overall index {first_index} to {last_index}, there were {result_count} total results. SafeSearch was {safe_search}."
  _post_to_webhook(webhook_url, summary)

  with open(output_path, 'a', encoding='utf-8') as f:
    f.write('\n\n' + summary)

  bool_retry = askYN(input('Do you want to run the program again? (y/n): '))