-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtwisearch.py
99 lines (94 loc) · 3.71 KB
/
twisearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from selenium.common.exceptions import TimeoutException, JavascriptException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
# Path of the chromedriver binary; twitterSearch() hands it to setDriver(),
# which forwards it to webdriver.Chrome as ``executable_path``.
EXECUTABLE_PATH = 'gecko/chromedriver'
def setDriver(executable_path, headless = False, maximize = True):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        executable_path: Path to the chromedriver binary.
        headless: When true, Chrome is started with ``--headless``.
        maximize: When true, Chrome is started with ``--start-maximized``.

    Returns:
        The new ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for shutting it down.
    """
    chrome_options = webdriver.ChromeOptions()
    # Content-setting value 2 is understood to mean "block", so the
    # notification permission prompt cannot cover the page being scraped.
    prefs = {"profile.default_content_setting_values.notifications" : 2}
    chrome_options.add_experimental_option("prefs", prefs)
    if maximize:
        chrome_options.add_argument("--start-maximized")
    if headless:
        chrome_options.add_argument("--headless")
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    return webdriver.Chrome(executable_path = executable_path, options = chrome_options)
def scroll(driver, numScrolls = 20000, fastScroll = False):
    """Scroll the page to the bottom repeatedly so lazy content gets loaded.

    Each successful scroll jumps to the current bottom of the document and
    then waits (up to 8 seconds, polling every 0.05 seconds) for
    ``document.body.scrollHeight`` to change. Scrolling stops when
    ``numScrolls`` successful scrolls have been made or when the height stops
    changing within the wait window.

    Args:
        driver: WebDriver whose current page is scrolled.
        numScrolls: Number of successful scrolls after which to stop.
        fastScroll: When true, the body is scaled down to 5% while scrolling
            and restored to 100% afterwards.

    Returns:
        None.
    """
    scroll_time = 8
    if fastScroll:
        driver.execute_script("document.body.style.transform = 'scale(0.05)'")
    try:
        current_scrolls = 0
        sleep(0.5)
        while current_scrolls != numScrolls:
            try:
                old_height = driver.execute_script("return document.body.scrollHeight")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                WebDriverWait(driver, scroll_time, 0.05).until(
                    lambda drv: check_height(drv, old_height)
                )
                current_scrolls += 1
            except JavascriptException:
                # Best effort: a failed script does not count as a scroll;
                # simply try again on the next iteration.
                continue
            except TimeoutException:
                # The height stopped growing: nothing more to load.
                break
    finally:
        # Undo the zoom on every exit path (limit reached, timeout, or an
        # unexpected error), and only if this call applied it.
        if fastScroll:
            driver.execute_script("document.body.style.transform = 'scale(1.00)'")
def check_height(driver, old_height):
    """Report whether the document body's scroll height differs from ``old_height``.

    Used as the wait condition while scrolling: a changed height means the
    page's scrollable extent is no longer what it was before the scroll.
    """
    return driver.execute_script("return document.body.scrollHeight") != old_height
def _parse_user_cell(cell):
    """Build one result record from a ``UserCell`` element of the parsed markup."""
    record = {
        "id": "",
        "username": "",
        "full_name": "",
        "picture_url": ""
    }

    profile_link = cell.find('a')
    if profile_link:
        profile_href = profile_link.get('href', '')
        if profile_href != '':
            # The handle is the last path segment of the profile link.
            record["username"] = profile_href.split('/')[-1]

        avatar = cell.find('img')
        if avatar:
            record["picture_url"] = avatar.get('src', '')

        # The display name is taken from the next link, but only when that
        # link points at the same profile as the first one.
        name_link = profile_link.find_next('a')
        if name_link and name_link.get('href', '') == profile_href:
            name_span = name_link.find('span')
            if name_span:
                record["full_name"] = name_span.text

    # The id is the text before the first '-' in the data-testid of the div
    # whose data-testid contains "-follow".
    follow_div = cell.find(
        lambda tag: tag.name == 'div' and '-follow' in tag.get('data-testid', '')
    )
    if follow_div:
        record["id"] = follow_div.get('data-testid', '').partition('-')[0]
    return record


def twitterSearch(query, scrollDepth = 3):
    """Search Twitter for users matching ``query`` and scrape the result list.

    Args:
        query: Search text. It is lower-cased and percent-encoded before being
            placed in the search URL.
        scrollDepth: Number of scrolls to perform on the results timeline
            before the user cells are collected.

    Returns:
        ``{"site": "twitter", "data": [...]}`` where each entry holds the keys
        ``id``, ``username``, ``full_name`` and ``picture_url`` (empty strings
        when a value was not found). If the results do not appear within the
        wait timeout, ``"Failed"`` is printed and an empty list is returned
        instead (kept as-is for backward compatibility).

    Raises:
        Exception: With message ``"Interrupted"`` when a KeyboardInterrupt
            arrives while waiting for or scrolling the results.
    """
    from urllib.parse import quote

    driver = setDriver(executable_path = EXECUTABLE_PATH)
    try:
        # Encode the query so spaces, '&', '#', etc. cannot break the URL.
        encoded_query = quote(query.lower(), safe='')
        url = f'https://twitter.com/search?q={encoded_query}&src=typed_query&f=user'
        wait = WebDriverWait(driver, 10)
        driver.get(url)
        try:
            element = wait.until(EC.presence_of_element_located((By.XPATH, '//div[@aria-label="Timeline: Search timeline"]')))
            scroll(driver, numScrolls = scrollDepth)
            wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@data-testid="UserCell"]')))
        except KeyboardInterrupt:
            raise Exception("Interrupted")
        except TimeoutException:
            print("Failed")
            return []

        soup = BeautifulSoup(element.get_attribute("innerHTML"), "html.parser")
        results = [
            _parse_user_cell(cell)
            for cell in soup.find_all('div', {'data-testid': 'UserCell'})
        ]
    finally:
        # Always end the browser session, including on the timeout and
        # interrupt paths; quit() also stops the driver process, which
        # close() does not.
        driver.quit()

    return {
        "site": "twitter",
        "data": results
    }