
Commit c4f5aa9

Update version
1 parent 35f96a7 commit c4f5aa9

File tree: 2 files changed, 75 additions and 27 deletions


ScienceDirect-Scraper- v1.0.0/Respaperlinkscrap.py

Lines changed: 51 additions & 27 deletions
@@ -2,20 +2,24 @@
 # selenium 4
 from asyncio import Handle
 from selenium import webdriver
+
 # importing necessary firefox webdriver modules
 from selenium.webdriver.firefox.service import Service as FirefoxService
 from webdriver_manager.firefox import GeckoDriverManager
 from selenium.webdriver.common.by import By
-import sys # For Retrirving the address from the commandline
+import sys  # For Retrirving the address from the commandline
+
 # import os
 from time import sleep
 from random import randrange
 import pandas as pd

+
 def to_raw(string):
     return fr"{string}"

-# REVIEW: Build a Web Scraper to fetch Research Paper links and short summary on the papers and their
+
+# REVIEW: Build a Web Scraper to fetch Research Paper links and short summary on the papers and their
 # REVIEW: authors that matches the topics given in the command line input.
 # REVIEW: from the Science Direct website using python and selenium
 # REVIEW: save the links to a .txt file named with tags (ex:ResearchPapers-<tag1>_<tag2>)
@@ -24,42 +28,42 @@ def to_raw(string):

 # get the address from either the command line or clipboard.
 # if len(sys.argv) > 2:
-topic = ' '.join(sys.argv[1:])
+topic = " ".join(sys.argv[1:])
 # else:
 #     # TODO: Implement a function to read file named 'topics.txt' from the current directory.
 #     topic = input("Enter the topics for which you want to search:\t")

-topic_search = topic.replace(',', '')
+topic_search = topic.replace(",", "")
 driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
-driver.get('https://www.sciencedirect.com/search')
+driver.get("https://www.sciencedirect.com/search")

 # TODO: Enter the tags/topics into the 'Keywords' field
 # Handling exceptions
-try:
+try:
     driver.find_element(By.ID, "qs").send_keys(str(topic_search))
 except:
-    print('Text SearchBox not found!')
-
+    print("Text SearchBox not found!")
+
 # TODO: Make a search by clicking the search button
-#FIXME: Make the XPaths concise.
+# FIXME: Make the XPaths concise.

 try:
-    sleep(2) # seconds
+    sleep(2)  # seconds
     # bttn_Xpath = "/html/body/div/div/div/div/div/div/div/section/div/div[1]/div/div/div/div[2]/div/form/div/div/div[4]/div/div[2]/button"
     bttn_Xpath = "//button[@class = 'button button-primary move-right']"
     driver.find_element(By.XPATH, value=bttn_Xpath).click()
 except:
-    print('Text Search Button not found!')
+    print("Text Search Button not found!")

 # Wait for the page to load
-sleep(randrange(1,3))# seconds
+sleep(randrange(1, 3))  # seconds

 # bttn_Xpath = "/html/body/div[1]/div/div/div/div/div/div/section/div/div[2]/div[2]/div[1]/div[2]/div[3]/div[1]/ol/li[3]/a"
 bttn_Xpath = "//a[@data-aa-name='srp-100-results-per-page']"
 driver.find_element(By.XPATH, value=bttn_Xpath).click()

 # Wait for the page to load
-sleep(randrange(3,6))# seconds
+sleep(randrange(3, 6))  # seconds

 ## Intitialize empty lists to store data
 articles_type = []
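
The search flow above still relies on fixed sleep() calls, which makes it timing-dependent. A minimal sketch of the same two clicks using Selenium 4's explicit waits (WebDriverWait and expected_conditions ship with selenium.webdriver.support; the 10-second timeout is an assumption):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, timeout=10)

# Wait until the search button is actually clickable instead of sleeping a fixed 2 s.
wait.until(
    EC.element_to_be_clickable((By.XPATH, "//button[@class = 'button button-primary move-right']"))
).click()

# Likewise for the 100-results-per-page link, replacing sleep(randrange(1, 3)).
wait.until(
    EC.element_to_be_clickable((By.XPATH, "//a[@data-aa-name='srp-100-results-per-page']"))
).click()
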
@@ -73,31 +77,38 @@ def to_raw(string):
 doi_links = []

 # Collect all the info in articles:
-article_containers = driver.find_elements(By.CSS_SELECTOR, '.ResultItem col-xs-24 push-m'.replace(' ', '.'))
+article_containers = driver.find_elements(
+    By.CSS_SELECTOR, ".ResultItem col-xs-24 push-m".replace(" ", ".")
+)

 for article in article_containers:
     try:
-        article_type = article.find_element(By.CSS_SELECTOR, '.article-type u-clr-grey8'.replace(' ', '.')).text
-        title = article.find_element(By.CSS_SELECTOR, '.anchor result-list-title-link u-font-serif text-s anchor-default'.replace(' ', '.')).text
-        journal = article.find_element(By.CSS_SELECTOR, '.anchor subtype-srctitle-link anchor-default anchor-has-inherit-color'.replace(' ', '.')).text
-        date = article.find_element(By.XPATH, "//span[@class='srctitle-date-fields']/span[2]").text
-        # FIXME: Get the Journal info
-        article.find_element(By.XPATH, "//button [@data-aa-button='srp-show-hidden-volume-issue-info']").click()
+        article_type = article.find_element(By.CSS_SELECTOR, ".article-type u-clr-grey8".replace(" ", ".")).text
+        title = article.find_element(By.CSS_SELECTOR, ".anchor result-list-title-link u-font-serif text-s anchor-default".replace(" ", ".")).text
+        journal = article.find_element(By.CSS_SELECTOR, ".anchor subtype-srctitle-link anchor-default anchor-has-inherit-color".replace(" ", ".")).text
+        date = article.find_element(
+            By.XPATH, "//span[@class='srctitle-date-fields']/span[2]"
+        ).text
+        # FIXME: Get the Journal info *FIXED
+        article.find_element(
+            By.XPATH, "//button [@data-aa-button='srp-show-hidden-volume-issue-info']"
+        ).click()
         journal_info = article.find_elements(By.XPATH, "//span[@class='hidden-fields']")
         for j in journal_info:
             journo = j.text
         # FIXME: Get the Author List displayed on the website ***FIXED
         # Working on the authors list.
         article_auth = []
-        author = article.find_elements(By.TAG_NAME, 'li')
+        author = article.find_elements(By.TAG_NAME, "li")
         for auth in author:
-            if auth != 'Download PDF': # Removing some unnecessary stuffs by Hardcoding
+            if auth != "Download PDF":  # Removing some unnecessary stuffs by Hardcoding
                 article_auth.append(auth.text)
         art_auth = ", ".join(article_auth)

-        url = to_raw(article.find_element(By.CSS_SELECTOR, '.anchor result-list-title-link u-font-serif text-s anchor-default'.replace(' ', '.')).get_attribute('href'))
+        url = to_raw(article.find_element(By.CSS_SELECTOR,".anchor result-list-title-link u-font-serif text-s anchor-default".replace(" ", ".")).get_attribute("href"))
         # FIXME: doi_link is not getting scraped ***FIXED
-        doi_link = article.find_element(By.XPATH, "//li[@class= 'ResultItem col-xs-24 push-m']").get_attribute('data-doi')
+        doi_link = article.find_element(By.XPATH, "//li[@class= 'ResultItem col-xs-24 push-m']").get_attribute("data-doi")
+
         articles_type.append(article_type)
         titles.append(title)
         journals.append(journal)
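
All of the field lookups above run inside one broad try/except, so a single missing element discards the whole article; note too that auth is a WebElement, so auth != "Download PDF" is always true and the comparison presumably wants auth.text. A sketch of a per-field fallback helper (the helper name and empty-string default are assumptions, not part of this commit):

from selenium.common.exceptions import NoSuchElementException

def text_or_default(element, by, selector, default=""):
    # Return the text of a sub-element, or `default` when it is absent.
    try:
        return element.find_element(by, selector).text
    except NoSuchElementException:
        return default

# e.g. article_type = text_or_default(article, By.CSS_SELECTOR, ".article-type.u-clr-grey8")
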
@@ -107,14 +118,27 @@ def to_raw(string):
         urls.append(url)
         doi_links.append(doi_link)
     except Exception as err:
-        print(f'Error while scraping data: {err}')
+        print(f"Error while scraping data: {err}")
         continue

-article_container_df = pd.DataFrame({'article-type': articles_type, 'title': titles, 'journal': journals, 'published-on':dates, 'journal-info': journal_infos, 'authors':authors, 'url': urls, 'doi-link': doi_links})
+article_container_df = pd.DataFrame(
+    {
+        "article-type": articles_type,
+        "title": titles,
+        "journal": journals,
+        "published-on": dates,
+        "journal-info": journal_infos,
+        "authors": authors,
+        "url": urls,
+        "doi-link": doi_links,
+    }
+)
 # article_container_df = pd.DataFrame({'article_type': articles_type, 'title': titles, 'journal': journals, 'doi-link': doi_links})

 # OPTIMIZE: Task:-1 Optimize and clean the entire code using classes and functions.
 # TODO: Add paginations for extracting articles from other pages as well.

 # TODO: Exporting the results to a .json file.
-article_container_df.to_json(r'Analysis-ScienceDirectPapers\Researchpaper_list.json', orient = 'records', compression = 'infer')
+article_container_df.to_json( r"Analysis-ScienceDirectPapers\Researchpaper_list.json",
+    orient="records",
+    compression="infer",)
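
The pagination TODO above is still open. One rough way to walk the remaining result pages, assuming the next-page link follows the same data-aa-name pattern as the other locators in this file (the 'srp-next-page' value is an unverified assumption):

from selenium.common.exceptions import NoSuchElementException

while True:
    # ... scrape the current result page into the lists as above ...
    try:
        # Hypothetical locator, modelled on the srp-100-results-per-page link.
        next_link = driver.find_element(By.XPATH, "//a[@data-aa-name='srp-next-page']")
    except NoSuchElementException:
        break  # no further pages
    next_link.click()
    sleep(randrange(3, 6))  # keep the same polite delay used elsewhere in the script
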
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Import Selenium 4: firefox webdriver modules:
+from selenium import webdriver
+from selenium.webdriver.firefox.service import Service as FirefoxService
+from webdriver_manager.firefox import GeckoDriverManager
+from selenium.webdriver.common.by import By
+# Importing time functions:
+from time import sleep
+from random import randrange
+
+class ScienceDirectScraper():
+    """
+    Science-Direct Articles Scraper - v1.5.0
+    """
+    def __init__(self):
+        self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
+
+    def get_articles(self, *args):
+        """
+        Function to generate a list of articles using a list of input keywords.
+
+
+        """
+        self.driver.get(f'https://www.sciencedirect.com/search')
+

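get_articles is still a stub that only opens the search page. A sketch of how the new class might be driven once the keyword handling lands (the call pattern is an assumption; the commit does not consume *args yet):

scraper = ScienceDirectScraper()
scraper.get_articles("machine learning", "bioinformatics")  # hypothetical keywords
scraper.driver.quit()  # close the Firefox session when done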