81 changes: 31 additions & 50 deletions linkedin_scraper/person.py
@@ -41,6 +41,7 @@ def __init__(
self.accomplishments = accomplishments or []
self.also_viewed_urls = []
self.contacts = contacts or []
self.scraped_education_keys = set()  # track seen education entries to avoid duplicates

if driver is None:
try:
@@ -116,31 +117,28 @@ def get_experiences(self):
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")

# An experience entry can expose extra child elements; only the first two
# (company logo and position details) are used.
elements = position.find_elements(By.XPATH, "*")
if len(elements) < 2:
continue

company_logo_elem = elements[0]
position_details = elements[1]

# company elem
try:
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
if not company_linkedin_url:
continue
except NoSuchElementException:
continue

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None

if not position_summary_details:
continue

outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

if len(outer_positions) == 4:
@@ -165,7 +163,6 @@ def get_experiences(self):
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else ""
location = ""

# Safely extract times and duration
if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
@@ -176,7 +173,7 @@ def get_experiences(self):

from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""

if position_summary_text and any(element.get_attribute("class") == "pvs-list__container" for element in position_summary_text.find_elements(By.XPATH, "*")):
try:
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
@@ -186,7 +183,7 @@ def get_experiences(self):
inner_positions = []
else:
inner_positions = []
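# A populated pvs-list__container under the summary means the company entry
# groups several roles; each inner position is scraped as its own Experience.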

if len(inner_positions) > 1:
descriptions = inner_positions
for description in descriptions:
@@ -199,8 +196,7 @@ def get_experiences(self):
location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""

# Safely extract times and duration

if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
@@ -224,7 +220,6 @@ def get_experiences(self):
)
self.add_experience(experience)
except (NoSuchElementException, IndexError):
# Skip this description if elements are missing
continue
else:
description = position_summary_text.text if position_summary_text else ""
@@ -253,24 +248,20 @@ def get_educations(self):
try:
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")

# An education entry can expose extra child elements; only the first two
# (institution logo and entry details) are used.
elements = position.find_elements(By.XPATH,"*")
if len(elements) < 2:
continue

institution_logo_elem = elements[0]
position_details = elements[1]

# institution elem
try:
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
except NoSuchElementException:
institution_linkedin_url = None

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None

if not position_summary_details:
continue
@@ -280,38 +271,35 @@ def get_educations(self):
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else None

from_date, to_date = None, None

if len(outer_positions) > 2:
try:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text

if times and "-" in times:
parts = [p.strip() for p in times.split("-")]
from_date = parts[0]
to_date = parts[1]
except (NoSuchElementException, ValueError):
pass
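# Illustrative walk-through (assumed LinkedIn text format):
#   times = "2018 - 2022"  ->  from_date = "2018", to_date = "2022"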

description = position_summary_text.text if position_summary_text else ""

education_key = (institution_name, degree, from_date, to_date)

if education_key not in self.scraped_education_keys:
education = Education(
from_date=from_date,
to_date=to_date,
description=description,
degree=degree,
institution_name=institution_name,
linkedin_url=institution_linkedin_url
)
self.add_education(education)
self.scraped_education_keys.add(education_key)
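# The (institution, degree, dates) tuple is a cheap natural key: when LinkedIn
# renders the same education entry twice, the set lookup keeps only the first.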
except (NoSuchElementException, IndexError):
# Skip this education entry if elements are missing
continue

def get_name_and_location(self):
Expand Down Expand Up @@ -341,12 +329,10 @@ def scrape_logged_in(self, close_on_complete=True):
self.focus()
self.wait(5)

# get name and location
self.get_name_and_location()

self.open_to_work = self.is_open_to_work()

# get about
self.get_about()
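# Scroll in stages so lazily loaded sections render before they are scraped.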
driver.execute_script(
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));"
@@ -355,15 +341,12 @@ def scrape_logged_in(self, close_on_complete=True):
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));"
)

# get experience
self.get_experiences()

# get education
self.get_educations()

driver.get(self.linkedin_url)

# get interest
try:

_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
@@ -387,7 +370,6 @@ def scrape_logged_in(self, close_on_complete=True):
except Exception:
pass

# get accomplishment
try:
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located(
@@ -412,7 +394,6 @@ def scrape_logged_in(self, close_on_complete=True):
except Exception:
pass

# get connections
try:
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
@@ -465,4 +446,4 @@ def __repr__(self):
int=self.interests,
acc=self.accomplishments,
conn=self.contacts,
)
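A minimal usage sketch for the patched class, assuming the repo's documented entry points (Person, actions.login) and a local Chrome driver; the URL and credentials below are placeholders:

    from linkedin_scraper import Person, actions
    from selenium import webdriver

    driver = webdriver.Chrome()
    actions.login(driver, "you@example.com", "your-password")  # placeholder credentials
    person = Person("https://www.linkedin.com/in/example-profile", driver=driver)  # scrapes on init
    print(person.educations)  # duplicates are now filtered via scraped_education_keys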