@@ -1538,3 +1538,256 @@ def execute(self, exec_context: knext.ExecutionContext):
15381538 raise RuntimeError (
15391539 f"Failed to download or process Natural Earth data: { str (e )} { url } "
15401540 )
1541+
1542+
1543+ ############################################
1544+ # Dataverse File Downloader
1545+ ############################################
1546+
1547+
def validate_path(path: str) -> None:
    """Accept any save path without inspecting it.

    Passed as the ``validator`` of the downloader node's save-path parameter.
    It deliberately performs no check at all and always returns ``None``.
    """
    return None
1551+
1552+
class ExistingFile(knext.EnumParameterOptions):
    """Choices for what a node does when its output file already exists.

    Each member is a ``(label, description)`` pair.
    """

    # Abort execution instead of touching the existing file.
    FAIL = "Fail", "Will issue an error during the node's execution (to prevent unintentional overwrite)."

    # Replace the existing file with the new one.
    OVERWRITE = "Overwrite", "Will replace any existing file."
1562+
1563+
@knext.node(
    name="Dataverse File Downloader",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "dataverse.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Downloader File Path",
    description="Retrieved data from Dataverse",
)
class DataverseFileDownloaderNode:
    """Downloads a file from a Dataverse repository.

    This node downloads a file from a Dataverse repository based on the provided File ID.
    The default Dataverse repository that is used is [Harvard Dataverse](https://dataverse.harvard.edu/) and can
    be changed in the advanced settings.

    The save path may either be the full path of the file to write, or an existing
    directory; in the latter case the file name reported by the server is used.
    """

    server_url = knext.StringParameter(
        label="Dataverse server URL",
        description="Base URL of the Dataverse server (e.g., https://dataverse.harvard.edu).",
        default_value="https://dataverse.harvard.edu",
        is_advanced=True,
    )

    file_id = knext.StringParameter(
        label="File ID",
        description="The unique file identifier in Dataverse.",
        default_value="",
    )

    save_path = knext.LocalPathParameter(
        label="Save path",
        description="Select the directory to save the downloaded file.",
        placeholder_text="Select output directory...",
        validator=validate_path,
    )

    timeout = knext.IntParameter(
        label="Request timeout (seconds)",
        description="Maximum time to wait for the server response.",
        default_value=120,
        min_value=1,
        is_advanced=True,
    )

    # The default is version dependent: OVERWRITE for versions below 1.2.0,
    # FAIL for 1.2.0 and later.
    existing_file = knext.EnumParameter(
        "If exists:",
        "Specify the behavior of the node in case the output file already exists.",
        lambda v: (
            ExistingFile.OVERWRITE.name
            if v < knext.Version(1, 2, 0)
            else ExistingFile.FAIL.name
        ),
        enum=ExistingFile,
    )

    def configure(self, configure_context):
        """Declare the output schema: a single string column ``File Path``."""
        return knext.Schema.from_columns([knext.Column(knext.string(), "File Path")])

    def execute(self, exec_context: knext.ExecutionContext):
        """Download the configured file and return a one-row table with its path.

        Raises:
            knext.InvalidParametersError: if the File ID or save path is empty, or
                the target file exists while "If exists" is set to "Fail".
            ValueError: ("Download Error: ...") for any network or file-system
                failure; the original exception is chained as the cause.
        """
        import os

        import pandas as pd
        import requests

        file_id = str(self.file_id or "").strip()
        if not file_id:
            raise knext.InvalidParametersError("File ID must be provided.")

        save_path = self.save_path
        if not save_path:
            raise knext.InvalidParametersError("Save path must be provided.")

        base_url = self.server_url.rstrip("/")
        download_url = f"{base_url}/api/access/datafile/{file_id}"

        # The dialog asks for a directory, so accept one: an existing directory
        # (or a path ending in a separator) means "store the file inside it".
        target_is_dir = os.path.isdir(save_path) or save_path.endswith(("/", os.sep))
        if not target_is_dir:
            # The final file name is already known: fail before any network traffic.
            self.__check_overwrite(save_path)

        partial_path = None
        try:
            # stream=True avoids holding the whole payload in memory.
            with requests.get(
                download_url, timeout=self.timeout, stream=True
            ) as response:
                response.raise_for_status()

                if target_is_dir:
                    os.makedirs(save_path, exist_ok=True)
                    target_path = os.path.join(
                        save_path, self._remote_file_name(response, file_id)
                    )
                    # The name only became known from the response headers.
                    self.__check_overwrite(target_path)
                else:
                    target_path = save_path
                    save_dir = os.path.dirname(target_path)
                    if save_dir:
                        os.makedirs(save_dir, exist_ok=True)

                # Write to a side file first so that a failed download never
                # truncates or replaces an already existing target file.
                partial_path = target_path + ".part"
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            file.write(chunk)

            os.replace(partial_path, target_path)
            partial_path = None

            output_table = pd.DataFrame({"File Path": [target_path]})

            return knext.Table.from_pandas(output_table)

        except knext.InvalidParametersError:
            # Configuration problems are reported unchanged.
            raise
        except Exception as e:
            raise ValueError(f"Download Error: {str(e)}") from e
        finally:
            if partial_path is not None:
                # Best-effort cleanup of an incomplete download.
                try:
                    os.remove(partial_path)
                except OSError:
                    pass

    @staticmethod
    def _remote_file_name(response, file_id):
        """Return a safe file name for the download.

        Uses the ``filename`` of the response's ``Content-Disposition`` header
        when present, otherwise falls back to a name derived from the File ID.
        """
        import os
        from email.message import Message

        name = ""
        header = response.headers.get("Content-Disposition")
        if header:
            message = Message()
            message["Content-Disposition"] = header
            name = message.get_filename() or ""

        # Drop any directory components the server may have sent.
        name = os.path.basename(name.replace("\\", "/")).strip()
        if name in ("", ".", ".."):
            safe_id = "".join(
                ch if ch.isalnum() or ch in "-_." else "_" for ch in file_id
            )
            name = f"dataverse_file_{safe_id}"
        return name

    def __check_overwrite(self, fileurl):
        """Raise if ``fileurl`` exists and the node is configured not to overwrite."""
        if self.existing_file == ExistingFile.FAIL.name:
            import os.path

            if os.path.exists(fileurl):
                raise knext.InvalidParametersError(
                    "File already exists and should not be overwritten."
                )
1659+
1660+
@knext.node(
    name="Dataverse Search",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "dataverse.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Search Results",
    description="Retrieved data from Dataverse",
)
class DataverseSearchNode:
    """Search for datasets and files in Dataverse repositories.

    This node allows you to search [Dataverse](https://dataverse.org/) using various parameters.
    The default search uses [Harvard Dataverse](https://dataverse.harvard.edu/) as the server.

    Query Syntax Examples:

    1. Simple keyword search: "climate change"
        - Searches for items containing these terms anywhere

    2. Field-specific search: "title:climate" or "title:climate+temperature"
        - Searches only in the title field
        - Other fields: author, description, keywords

    3. Boolean operators: "climate AND temperature"
        - AND: Both terms must be present
        - OR: Either term must be present
        - NOT: Exclude items with the term

    4. Combining operators: "climate AND (temperature OR rainfall)"
        - Use parentheses for complex queries

    Search results are returned as a table with all available metadata from the API.
    """

    server_url = knext.StringParameter(
        label="Dataverse server URL",
        description="Base URL of the Dataverse server.",
        default_value="https://dataverse.harvard.edu",
        is_advanced=True,
    )

    query = knext.StringParameter(
        label="Search query",
        description="Search keywords. Examples: climate, title:climate, climate AND temperature",
        default_value="",
    )

    search_type = knext.StringParameter(
        label="Search type",
        description="Limit the type of objects to search for.",
        default_value="file",
        enum=["dataverse", "dataset", "file", "all"],
        is_advanced=True,
    )

    max_results = knext.IntParameter(
        label="Maximum results",
        description="Maximum number of results to return",
        default_value=100,
        min_value=1,
        max_value=1000,
        is_advanced=True,
    )

    timeout = knext.IntParameter(
        label="Timeout (seconds)",
        description="Request timeout in seconds.",
        default_value=30,
        min_value=1,
        is_advanced=True,
    )

    def configure(self, configure_context):
        """Return no schema: the result columns depend on the API response."""
        return None

    def execute(self, exec_context: knext.ExecutionContext):
        """Run the search page by page and return the collected items as a table.

        Pages of 20 items are requested from ``<server>/api/search`` until
        ``max_results`` items were collected or the server has no more results.
        When nothing is found, a table with the single empty column
        ``no_results_found`` is returned.

        Raises:
            ValueError: if the query is empty, if the first request fails
                ("Search error: ..."), or if the response cannot be processed
                ("Processing error: ..."). The original exception is chained.
        """
        from urllib.parse import quote, urlencode

        import pandas as pd
        import requests

        query = (self.query or "").strip()
        if not query:
            raise ValueError("Search query must be provided")

        base_url = self.server_url.rstrip("/")
        search_url = f"{base_url}/api/search"
        page_size = 20

        params = {"q": query, "per_page": page_size}
        if self.search_type != "all":
            params["type"] = self.search_type

        all_results = []
        start = 0

        try:
            while len(all_results) < self.max_results:
                params["start"] = start

                # Percent-encode the values so characters such as '&', '#' or '='
                # inside the query cannot corrupt the request. '+' is left as is
                # so that it keeps acting as a term separator, as in the
                # documented "title:climate+temperature" example.
                query_params = urlencode(params, quote_via=quote, safe="+")
                full_url = f"{search_url}?{query_params}"

                response = requests.get(full_url, timeout=self.timeout)

                if response.status_code != 200:
                    message = (
                        "Dataverse search request failed with HTTP status "
                        f"{response.status_code}"
                    )
                    if not all_results:
                        # Nothing retrieved at all: an empty table would hide the failure.
                        raise requests.exceptions.HTTPError(message, response=response)
                    # Keep what was already retrieved, but tell the user it is incomplete.
                    exec_context.set_warning(
                        f"{message}; returning {len(all_results)} partial results."
                    )
                    break

                data = response.json().get("data") or {}
                items = data.get("items", [])

                if not items:
                    break

                all_results.extend(items)

                start += page_size

                # Skip the extra round trip once every available hit was fetched.
                total_count = data.get("total_count")
                if isinstance(total_count, int) and start >= total_count:
                    break

            all_results = all_results[: self.max_results]

            results_df = pd.DataFrame(all_results)

            if results_df.empty:
                results_df = pd.DataFrame({"no_results_found": []})

            return knext.Table.from_pandas(results_df)

        except requests.exceptions.RequestException as e:
            raise ValueError(f"Search error: {str(e)}") from e
        except Exception as e:
            raise ValueError(f"Processing error: {str(e)}") from e
0 commit comments