1+ import time
2+
13import requests
24from selenium import webdriver
35from selenium .webdriver .common .by import By
1113
1214class Person (Scraper ):
1315
14- __TOP_CARD = "pv-top-card "
16+ __TOP_CARD = "scaffold-layout__main "
1517 __WAIT_FOR_ELEMENT_TIMEOUT = 5
1618
1719 def __init__ (
@@ -113,13 +115,15 @@ def get_experiences(self):
113115 main = self .wait_for_element_to_load (by = By .TAG_NAME , name = "main" )
114116 self .scroll_to_half ()
115117 self .scroll_to_bottom ()
116- main_list = self .wait_for_element_to_load (name = "pvs-list " , base = main )
117- for position in main_list .find_elements (By .XPATH , "li " ):
118- position = position .find_element (By .CLASS_NAME , "pvs- entity" )
119- company_logo_elem , position_details = position .find_elements (By .XPATH ,"*" )
118+ main_list = self .wait_for_element_to_load (name = "pvs-list__container " , base = main )
119+ for position in main_list .find_elements (By .CLASS_NAME , "pvs-list__paged-list-item " ):
120+ position = position .find_element (By .CSS_SELECTOR , "div[data-view-name='profile-component- entity'] " )
121+ company_logo_elem , position_details = position .find_elements (By .XPATH , "*" )
120122
121123 # company elem
122124 company_linkedin_url = company_logo_elem .find_element (By .XPATH ,"*" ).get_attribute ("href" )
125+ if not company_linkedin_url :
126+ continue
123127
124128 # position details
125129 position_details_list = position_details .find_elements (By .XPATH ,"*" )
@@ -143,15 +147,26 @@ def get_experiences(self):
143147 company = outer_positions [0 ].find_element (By .TAG_NAME ,"span" ).text
144148 work_times = outer_positions [1 ].find_element (By .TAG_NAME ,"span" ).text
145149 location = outer_positions [2 ].find_element (By .TAG_NAME ,"span" ).text
150+ else :
151+ position_title = ""
152+ company = outer_positions [0 ].find_element (By .TAG_NAME ,"span" ).text
153+ work_times = ""
154+ location = ""
155+
146156
147157 times = work_times .split ("·" )[0 ].strip () if work_times else ""
148158 duration = work_times .split ("·" )[1 ].strip () if len (work_times .split ("·" )) > 1 else None
149159
150160 from_date = " " .join (times .split (" " )[:2 ]) if times else ""
151161 to_date = " " .join (times .split (" " )[3 :]) if times else ""
152-
153- if position_summary_text and len (position_summary_text .find_element (By .CLASS_NAME ,"pvs-list" ).find_element (By .CLASS_NAME ,"pvs-list" ).find_elements (By .XPATH ,"li" )) > 1 :
154- descriptions = position_summary_text .find_element (By .CLASS_NAME ,"pvs-list" ).find_element (By .CLASS_NAME ,"pvs-list" ).find_elements (By .XPATH ,"li" )
162+ if position_summary_text and any (element .get_attribute ("pvs-list__container" ) for element in position_summary_text .find_elements (By .TAG_NAME , "*" )):
163+ inner_positions = (position_summary_text .find_element (By .CLASS_NAME ,"pvs-list__container" )
164+ .find_element (By .XPATH ,"*" ).find_element (By .XPATH ,"*" ).find_element (By .XPATH ,"*" )
165+ .find_elements (By .CLASS_NAME ,"pvs-list__paged-list-item" ))
166+ else :
167+ inner_positions = []
168+ if len (inner_positions ) > 1 :
169+ descriptions = inner_positions
155170 for description in descriptions :
156171 res = description .find_element (By .TAG_NAME ,"a" ).find_elements (By .XPATH ,"*" )
157172 position_title_elem = res [0 ] if len (res ) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
200215 main = self .wait_for_element_to_load (by = By .TAG_NAME , name = "main" )
201216 self .scroll_to_half ()
202217 self .scroll_to_bottom ()
203- main_list = self .wait_for_element_to_load (name = "pvs-list" , base = main )
204- for position in main_list .find_elements (By .CLASS_NAME ,"pvs-entity" ):
218+ main_list = self .wait_for_element_to_load (name = "pvs-list__container" , base = main )
219+ for position in main_list .find_elements (By .CLASS_NAME ,"pvs-list__paged-list-item" ):
220+ position = position .find_element (By .XPATH ,"//div[@data-view-name='profile-component-entity']" )
205221 institution_logo_elem , position_details = position .find_elements (By .XPATH ,"*" )
206222
207223 # company elem
@@ -214,13 +230,17 @@ def get_educations(self):
214230 outer_positions = position_summary_details .find_element (By .XPATH ,"*" ).find_elements (By .XPATH ,"*" )
215231
216232 institution_name = outer_positions [0 ].find_element (By .TAG_NAME ,"span" ).text
217- degree = outer_positions [1 ].find_element (By .TAG_NAME ,"span" ).text
233+ if len (outer_positions ) > 1 :
234+ degree = outer_positions [1 ].find_element (By .TAG_NAME ,"span" ).text
235+ else :
236+ degree = None
218237
219238 if len (outer_positions ) > 2 :
220239 times = outer_positions [2 ].find_element (By .TAG_NAME ,"span" ).text
221240
222- from_date = " " .join (times .split (" " )[:2 ])
223- to_date = " " .join (times .split (" " )[3 :])
241+ if times != "" :
242+ from_date = times .split (" " )[times .split (" " ).index ("-" )- 1 ] if len (times .split (" " ))> 3 else times .split (" " )[0 ]
243+ to_date = times .split (" " )[- 1 ]
224244 else :
225245 from_date = None
226246 to_date = None
@@ -240,10 +260,9 @@ def get_educations(self):
240260 self .add_education (education )
241261
def get_name_and_location(self):
    """Read the profile's name and location from the top card.

    Sets ``self.name`` to the text of the ``<h1>`` inside the top-card
    container and ``self.location`` to the text of the location element.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the top-card
            container, the ``<h1>``, or the location element cannot be found.
    """
    location_xpath = "//*[@class='text-body-small inline t-black--light break-words']"

    top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
    self.name = top_panel.find_element(By.TAG_NAME, "h1").text

    # An XPath that starts with "//" is evaluated from the document root even
    # when find_element is called on an element, so it can match a node
    # outside the top card. Prefix it with "." to scope the search to
    # top_panel's descendants.
    scoped_matches = top_panel.find_elements(By.XPATH, "." + location_xpath)
    if scoped_matches:
        self.location = scoped_matches[0].text
    else:
        # Fall back to the document-wide lookup so a layout where the
        # location sits outside the container still resolves (or raises
        # NoSuchElementException, exactly as before).
        self.location = self.driver.find_element(By.XPATH, location_xpath).text
247266
248267 def get_about (self ):
249268 try :
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
288307 self .get_educations ()
289308
290309 driver .get (self .linkedin_url )
291-
292- # get interest
293- try :
294-
295- _ = WebDriverWait (driver , self .__WAIT_FOR_ELEMENT_TIMEOUT ).until (
296- EC .presence_of_element_located (
297- (
298- By .XPATH ,
299- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']" ,
300- )
301- )
302- )
303- interestContainer = driver .find_element (By .XPATH ,
304- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
305- )
306- for interestElement in interestContainer .find_elements (By .XPATH ,
307- "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
308- ):
309- interest = Interest (
310- interestElement .find_element (By .TAG_NAME , "h3" ).text .strip ()
311- )
312- self .add_interest (interest )
313- except :
314- pass
315-
316- # get accomplishment
317- try :
318- _ = WebDriverWait (driver , self .__WAIT_FOR_ELEMENT_TIMEOUT ).until (
319- EC .presence_of_element_located (
320- (
321- By .XPATH ,
322- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']" ,
323- )
324- )
325- )
326- acc = driver .find_element (By .XPATH ,
327- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
328- )
329- for block in acc .find_elements (By .XPATH ,
330- "//div[@class='pv-accomplishments-block__content break-words']"
331- ):
332- category = block .find_element (By .TAG_NAME , "h3" )
333- for title in block .find_element (By .TAG_NAME ,
334- "ul"
335- ).find_elements (By .TAG_NAME , "li" ):
336- accomplishment = Accomplishment (category .text , title .text )
337- self .add_accomplishment (accomplishment )
338- except :
339- pass
340-
341- # get connections
342- try :
343- driver .get ("https://www.linkedin.com/mynetwork/invite-connect/connections/" )
344- _ = WebDriverWait (driver , self .__WAIT_FOR_ELEMENT_TIMEOUT ).until (
345- EC .presence_of_element_located ((By .CLASS_NAME , "mn-connections" ))
346- )
347- connections = driver .find_element (By .CLASS_NAME , "mn-connections" )
348- if connections is not None :
349- for conn in connections .find_elements (By .CLASS_NAME , "mn-connection-card" ):
350- anchor = conn .find_element (By .CLASS_NAME , "mn-connection-card__link" )
351- url = anchor .get_attribute ("href" )
352- name = conn .find_element (By .CLASS_NAME , "mn-connection-card__details" ).find_element (By .CLASS_NAME , "mn-connection-card__name" ).text .strip ()
353- occupation = conn .find_element (By .CLASS_NAME , "mn-connection-card__details" ).find_element (By .CLASS_NAME , "mn-connection-card__occupation" ).text .strip ()
354-
355- contact = Contact (name = name , occupation = occupation , url = url )
356- self .add_contact (contact )
357- except :
358- connections = None
359-
360310 if close_on_complete :
361311 driver .quit ()
362312
0 commit comments