# selenium 4
from selenium import webdriver

# Import the necessary Firefox webdriver modules
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
import sys  # For retrieving the search topics from the command line

# import os
from time import sleep
from random import randrange
import pandas as pd


def to_raw(string):
    return fr"{string}"


# REVIEW: Build a web scraper that fetches research-paper links, a short summary of each paper, and their
# REVIEW: authors, matching the topics given as command-line input,
# REVIEW: from the ScienceDirect website using Python and Selenium.
# REVIEW: Save the links to a .txt file named with the tags (ex: ResearchPapers-<tag1>_<tag2>);
# REVIEW: see the sketch at the end of the file.

# Get the search topics from either the command line or an interactive prompt.
# if len(sys.argv) > 2:
topic = " ".join(sys.argv[1:])
# else:
#     # TODO: Implement a function to read the file 'topics.txt' from the current directory,
#     #       as in the sketch below.
#     topic = input("Enter the topics for which you want to search:\t")
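# A minimal sketch for the TODO above: read the topics from a 'topics.txt' file, one topic
# per line. The helper name and file format are assumptions, not part of the original
# script, and the function is defined here without being wired into the flow below.
def read_topics_from_file(path="topics.txt"):
    """Return the topics listed in `path` (one per line) joined into a single string."""
    with open(path, encoding="utf-8") as topics_file:
        lines = [line.strip() for line in topics_file if line.strip()]
    return " ".join(lines)
# Possible usage: topic = read_topics_from_file() if len(sys.argv) < 2 else topic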
# Build the search query and open the ScienceDirect search page
topic_search = topic.replace(",", "")
driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
driver.get("https://www.sciencedirect.com/search")

# TODO: Enter the tags/topics into the 'Keywords' field
# Handling exceptions
try:
    driver.find_element(By.ID, "qs").send_keys(str(topic_search))
except Exception:
    print("Text SearchBox not found!")

# TODO: Make a search by clicking the search button
# FIXME: Make the XPaths concise.

try:
    sleep(2)  # seconds
    # bttn_Xpath = "/html/body/div/div/div/div/div/div/div/section/div/div[1]/div/div/div/div[2]/div/form/div/div/div[4]/div/div[2]/button"
    bttn_Xpath = "//button[@class = 'button button-primary move-right']"
    driver.find_element(By.XPATH, value=bttn_Xpath).click()
except Exception:
    print("Text Search Button not found!")

# Wait for the page to load
sleep(randrange(1, 3))  # seconds
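# A possible alternative to the fixed sleep() calls in this script: Selenium's explicit
# waits, which poll the DOM until a condition holds instead of pausing for a set time.
# This helper is only a sketch (its name and the 10-second default timeout are assumptions)
# and is not called anywhere below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_clickable(drv, xpath, timeout=10):
    """Block until the element located by `xpath` is clickable, then return it."""
    return WebDriverWait(drv, timeout).until(EC.element_to_be_clickable((By.XPATH, xpath)))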
# Switch to showing 100 results per page
# bttn_Xpath = "/html/body/div[1]/div/div/div/div/div/div/section/div/div[2]/div[2]/div[1]/div[2]/div[3]/div[1]/ol/li[3]/a"
bttn_Xpath = "//a[@data-aa-name='srp-100-results-per-page']"
driver.find_element(By.XPATH, value=bttn_Xpath).click()

# Wait for the page to load
sleep(randrange(3, 6))  # seconds

## Initialize empty lists to store data
articles_type = []
titles = []
journals = []
dates = []
journal_infos = []
authors = []
urls = []
doi_links = []

# Collect all the info in articles
# (the space-separated class list becomes a CSS selector by replacing spaces with dots)
article_containers = driver.find_elements(
    By.CSS_SELECTOR, ".ResultItem col-xs-24 push-m".replace(" ", ".")
)
# Scrape the fields of interest from each result container
for article in article_containers:
    try:
        article_type = article.find_element(By.CSS_SELECTOR, ".article-type u-clr-grey8".replace(" ", ".")).text
        title = article.find_element(By.CSS_SELECTOR, ".anchor result-list-title-link u-font-serif text-s anchor-default".replace(" ", ".")).text
        journal = article.find_element(By.CSS_SELECTOR, ".anchor subtype-srctitle-link anchor-default anchor-has-inherit-color".replace(" ", ".")).text
        # Relative XPaths (".//") keep the lookups inside this result container
        date = article.find_element(
            By.XPATH, ".//span[@class='srctitle-date-fields']/span[2]"
        ).text
        # FIXME: Get the Journal info *FIXED
        # Expand the hidden volume/issue info, then read it
        article.find_element(
            By.XPATH, ".//button[@data-aa-button='srp-show-hidden-volume-issue-info']"
        ).click()
        journo = ""  # Default in case no hidden fields are present for this result
        journal_info = article.find_elements(By.XPATH, ".//span[@class='hidden-fields']")
        for j in journal_info:
            journo = j.text
        # FIXME: Get the Author List displayed on the website ***FIXED
        # Build the authors list from the <li> items inside the result
        article_auth = []
        author = article.find_elements(By.TAG_NAME, "li")
        for auth in author:
            if auth.text != "Download PDF":  # Skip the hard-coded 'Download PDF' item
                article_auth.append(auth.text)
        art_auth = ", ".join(article_auth)

        url = to_raw(article.find_element(By.CSS_SELECTOR, ".anchor result-list-title-link u-font-serif text-s anchor-default".replace(" ", ".")).get_attribute("href"))
        # FIXME: doi_link is not getting scraped ***FIXED
        # The result container itself carries the DOI in its 'data-doi' attribute
        doi_link = article.get_attribute("data-doi")

        articles_type.append(article_type)
        titles.append(title)
        journals.append(journal)
        dates.append(date)
        journal_infos.append(journo)
        authors.append(art_auth)
        urls.append(url)
        doi_links.append(doi_link)
    except Exception as err:
        print(f"Error while scraping data: {err}")
        continue
# Assemble the scraped fields into a DataFrame
article_container_df = pd.DataFrame(
    {
        "article-type": articles_type,
        "title": titles,
        "journal": journals,
        "published-on": dates,
        "journal-info": journal_infos,
        "authors": authors,
        "url": urls,
        "doi-link": doi_links,
    }
)
# article_container_df = pd.DataFrame({'article_type': articles_type, 'title': titles, 'journal': journals, 'doi-link': doi_links})

# OPTIMIZE: Task 1: Optimize and clean the entire code using classes and functions.
# TODO: Add pagination to extract articles from the other result pages as well (a rough sketch follows).
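# A rough sketch for the pagination TODO above. It assumes the results page exposes a
# "next page" link (the selector below is a guess, not verified against the live site) and
# that a scrape_current_page() helper would wrap the per-result loop above; neither exists
# in the original script, and this function is not called anywhere.
def scrape_all_pages(drv, max_pages=5):
    """Scrape the current results page, then follow the 'next' link up to `max_pages` times."""
    for _ in range(max_pages):
        # scrape_current_page(drv)  # hypothetical helper wrapping the loop above
        next_links = drv.find_elements(By.XPATH, "//a[@data-aa-name='srp-next-page']")
        if not next_links:
            break  # no further result pages
        next_links[0].click()
        sleep(randrange(3, 6))  # wait for the next page to load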
# TODO: Exporting the results to a .json file.
article_container_df.to_json(
    r"Analysis-ScienceDirectPapers\Researchpaper_list.json",
    orient="records",
    compression="infer",
)
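# A minimal sketch for the REVIEW item at the top of the file: save the collected paper
# links to a .txt file named from the tags (e.g. ResearchPapers-<tag1>_<tag2>.txt). The
# exact naming scheme is an interpretation of that example, not confirmed behaviour.
tag_part = "_".join(topic_search.split())
with open(f"ResearchPapers-{tag_part}.txt", "w", encoding="utf-8") as links_file:
    links_file.write("\n".join(urls))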