81 changes: 31 additions & 50 deletions linkedin_scraper/person.py
@@ -41,6 +41,7 @@ def __init__(
self.accomplishments = accomplishments or []
self.also_viewed_urls = []
self.contacts = contacts or []
self.scraped_education_keys = set()  # track seen education entries to avoid duplicates

if driver is None:
try:
@@ -116,31 +117,28 @@ def get_experiences(self):
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")

# An experience entry can expose extra child elements; only the first two
# (company logo and position details) are used.
elements = position.find_elements(By.XPATH, "*")
if len(elements) < 2:
continue

company_logo_elem = elements[0]
position_details = elements[1]

# company elem
try:
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
if not company_linkedin_url:
continue
except NoSuchElementException:
continue

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None

if not position_summary_details:
continue

outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

if len(outer_positions) == 4:
@@ -165,7 +163,6 @@ def get_experiences(self):
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else ""
location = ""

# Safely extract times and duration
if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
@@ -176,7 +173,7 @@ def get_experiences(self):

from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times and len(times.split(" ")) > 3 else ""

if position_summary_text and any(element.get_attribute("class") == "pvs-list__container" for element in position_summary_text.find_elements(By.XPATH, "*")):
try:
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
@@ -186,7 +183,7 @@ def get_experiences(self):
inner_positions = []
else:
inner_positions = []
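# A populated pvs-list__container under the summary means the company entry
# groups several roles; each inner position is scraped as its own Experience.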

if len(inner_positions) > 1:
descriptions = inner_positions
for description in descriptions:
@@ -199,8 +196,7 @@ def get_experiences(self):
location = location_elem.find_element(By.XPATH,"*").text if location_elem else None
position_title = position_title_elem.find_element(By.XPATH,"*").find_element(By.TAG_NAME,"*").text if position_title_elem else ""
work_times = work_times_elem.find_element(By.XPATH,"*").text if work_times_elem else ""

# Safely extract times and duration

if work_times:
parts = work_times.split("·")
times = parts[0].strip() if parts else ""
@@ -224,7 +220,6 @@ def get_experiences(self):
)
self.add_experience(experience)
except (NoSuchElementException, IndexError):
# Skip this description if elements are missing
continue
else:
description = position_summary_text.text if position_summary_text else ""
@@ -253,24 +248,20 @@ def get_educations(self):
try:
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")

# An education entry can expose extra child elements; only the first two
# (institution logo and entry details) are used.
elements = position.find_elements(By.XPATH,"*")
if len(elements) < 2:
continue

institution_logo_elem = elements[0]
position_details = elements[1]

# institution elem
try:
institution_linkedin_url = institution_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
except NoSuchElementException:
institution_linkedin_url = None

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None

if not position_summary_details:
continue
@@ -280,38 +271,35 @@ def get_educations(self):
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text if outer_positions else ""
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text if len(outer_positions) > 1 else None

from_date, to_date = None, None

if len(outer_positions) > 2:
try:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text

if times and "-" in times:
parts = [p.strip() for p in times.split("-")]
from_date = parts[0]
to_date = parts[1]
except (NoSuchElementException, ValueError):
pass
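# Illustrative walk-through (assumed LinkedIn text format):
#   times = "2018 - 2022"  ->  from_date = "2018", to_date = "2022"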

description = position_summary_text.text if position_summary_text else ""

education_key = (institution_name, degree, from_date, to_date)

if education_key not in self.scraped_education_keys:
education = Education(
from_date=from_date,
to_date=to_date,
description=description,
degree=degree,
institution_name=institution_name,
linkedin_url=institution_linkedin_url
)
self.add_education(education)
self.scraped_education_keys.add(education_key)
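# The (institution, degree, dates) tuple is a cheap natural key: when LinkedIn
# renders the same education entry twice, the set lookup keeps only the first.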
except (NoSuchElementException, IndexError):
# Skip this education entry if elements are missing
continue

def get_name_and_location(self):
Expand Down Expand Up @@ -341,12 +329,10 @@ def scrape_logged_in(self, close_on_complete=True):
self.focus()
self.wait(5)

# get name and location
self.get_name_and_location()

self.open_to_work = self.is_open_to_work()

# get about
self.get_about()
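# Scroll in stages so lazily loaded sections render before they are scraped.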
driver.execute_script(
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));"
@@ -355,15 +341,12 @@ def scrape_logged_in(self, close_on_complete=True):
"window.scrollTo(0, Math.ceil(document.body.scrollHeight/1.5));"
)

# get experience
self.get_experiences()

# get education
self.get_educations()

driver.get(self.linkedin_url)

# get interest
try:

_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
@@ -387,7 +370,6 @@ def scrape_logged_in(self, close_on_complete=True):
except Exception:
pass

# get accomplishment
try:
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located(
@@ -412,7 +394,6 @@ def scrape_logged_in(self, close_on_complete=True):
except Exception:
pass

# get connections
try:
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
@@ -465,4 +446,4 @@ def __repr__(self):
int=self.interests,
acc=self.accomplishments,
conn=self.contacts,
)
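A minimal usage sketch for the patched class, assuming the repo's documented entry points (Person, actions.login) and a local Chrome driver; the URL and credentials below are placeholders:

    from linkedin_scraper import Person, actions
    from selenium import webdriver

    driver = webdriver.Chrome()
    actions.login(driver, "you@example.com", "your-password")  # placeholder credentials
    person = Person("https://www.linkedin.com/in/example-profile", driver=driver)  # scrapes on init
    print(person.educations)  # duplicates are now filtered via scraped_education_keys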