Skip to content

Commit 0e29b72

Browse files
authored
Add Dataverse downloader and search node
1 parent 160b40b commit 0e29b72

File tree

3 files changed

+254
-0
lines changed

3 files changed

+254
-0
lines changed
647 Bytes
Loading
Lines changed: 1 addition & 0 deletions
Loading

knime_extension/src/nodes/opendata.py

Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,3 +1538,256 @@ def execute(self, exec_context: knext.ExecutionContext):
15381538
raise RuntimeError(
15391539
f"Failed to download or process Natural Earth data: {str(e)}{url}"
15401540
)
1541+
1542+
1543+
############################################
1544+
# Dataverse File Downloader
1545+
############################################
1546+
1547+
1548+
def validate_path(path: str) -> None:
    """Accept any path value without checking it.

    Used as the ``validator`` callback of a path parameter; it never raises,
    so every value passes.
    """
    return None
1551+
1552+
1553+
class ExistingFile(knext.EnumParameterOptions):
    # Options offered when the output file is already present on disk.
    # Each member is a (label, description) pair.

    # Abort execution instead of touching the existing file.
    FAIL = "Fail", (
        "Will issue an error during the node's execution (to prevent unintentional overwrite)."
    )
    # Replace the existing file with the new content.
    OVERWRITE = "Overwrite", "Will replace any existing file."
1562+
1563+
1564+
@knext.node(
    name="Dataverse File Downloader",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "dataverse.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Downloader File Path",
    description="Retrieved data from Dataverse",
)
class DataverseFileDownloaderNode:
    """Downloads a file from a Dataverse repository.

    This node downloads a file from a Dataverse repository based on the provided File ID.
    The default Dataverse repository that is used is [Harvard Dataverse](https://dataverse.harvard.edu/) and can
    be changed in the advanced settings.
    """

    # NOTE: the class docstring above is written as user-facing Markdown, so
    # developer notes are kept in comments and method docstrings instead.

    # Base URL of the Dataverse installation; a trailing slash is tolerated.
    server_url = knext.StringParameter(
        label="Dataverse server URL",
        description="Base URL of the Dataverse server (e.g., https://dataverse.harvard.edu).",
        default_value="https://dataverse.harvard.edu",
        is_advanced=True,
    )

    # Identifier appended to the ``/api/access/datafile/`` endpoint.
    file_id = knext.StringParameter(
        label="File ID",
        description="The unique file identifier in Dataverse.",
        default_value="",
    )

    # Target *file* path: ``execute`` opens this value for writing, so the
    # description and placeholder talk about a file rather than a directory.
    save_path = knext.LocalPathParameter(
        label="Save path",
        description="Select the file path to save the downloaded file.",
        placeholder_text="Select output file path...",
        validator=validate_path,
    )

    # Passed straight to ``requests.get(timeout=...)``.
    timeout = knext.IntParameter(
        label="Request timeout (seconds)",
        description="Maximum time to wait for the server response.",
        default_value=120,
        min_value=1,
        is_advanced=True,
    )

    # Default depends on the extension version: overwrite before 1.2.0,
    # fail from 1.2.0 on.
    existing_file = knext.EnumParameter(
        "If exists:",
        "Specify the behavior of the node in case the output file already exists.",
        lambda v: (
            ExistingFile.OVERWRITE.name
            if v < knext.Version(1, 2, 0)
            else ExistingFile.FAIL.name
        ),
        enum=ExistingFile,
    )

    def configure(self, configure_context):
        """Declare the output schema: one string column named ``File Path``."""
        return knext.Schema.from_columns([knext.Column(knext.string(), "File Path")])

    def execute(self, exec_context: knext.ExecutionContext):
        """Download the configured file and return a one-row table with its path.

        Raises:
            knext.InvalidParametersError: If the save path is empty, is an
                existing directory, or already exists while the node is
                configured to fail on existing files.
            ValueError: If the File ID is empty, or if the download or the
                write to disk fails (the original error is chained).
        """
        import contextlib
        import os

        import pandas as pd
        import requests

        file_id = (self.file_id or "").strip()
        if not file_id:
            # Without an ID the request would hit the bare endpoint URL.
            raise ValueError("File ID must be provided")
        if not self.save_path:
            raise knext.InvalidParametersError("Save path must be provided.")
        if os.path.isdir(self.save_path):
            # Opening a directory for writing can only fail; say so clearly.
            raise knext.InvalidParametersError(
                "Save path must point to a file, not a directory."
            )

        base_url = self.server_url.rstrip("/")
        download_url = f"{base_url}/api/access/datafile/{file_id}"
        self.__check_overwrite(self.save_path)

        # Stream into a sibling temp file so a failed download neither leaves
        # a truncated result nor destroys a file that was to be overwritten.
        partial_path = f"{self.save_path}.part"
        try:
            save_dir = os.path.dirname(self.save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)

            with requests.get(
                download_url, timeout=self.timeout, stream=True
            ) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    # Chunked copy keeps memory use flat for large files.
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            file.write(chunk)

            os.replace(partial_path, self.save_path)
        except Exception as e:
            with contextlib.suppress(OSError):
                os.remove(partial_path)
            raise ValueError(f"Download Error: {str(e)}") from e

        output_table = pd.DataFrame({"File Path": [self.save_path]})
        return knext.Table.from_pandas(output_table)

    def __check_overwrite(self, fileurl):
        """Raise if ``fileurl`` exists and the node is set to fail on existing files."""
        if self.existing_file == ExistingFile.FAIL.name:
            import os.path

            if os.path.exists(fileurl):
                raise knext.InvalidParametersError(
                    "File already exists and should not be overwritten."
                )
1659+
1660+
1661+
@knext.node(
    name="Dataverse Search",
    node_type=knext.NodeType.SOURCE,
    icon_path=__NODE_ICON_PATH + "dataverse.png",
    category=__category,
    after="",
)
@knext.output_table(
    name="Search Results",
    description="Retrieved data from Dataverse",
)
class DataverseSearchNode:
    """Search for datasets and files in Dataverse repositories.

    This node allows you to search [Dataverse](https://dataverse.org/) using various parameters.
    The default search uses [Harvard Dataverse](https://dataverse.harvard.edu/) as the server.

    Query Syntax Examples:

    1. Simple keyword search: "climate change"
       - Searches for items containing these terms anywhere

    2. Field-specific search: "title:climate" or "title:climate+temperature"
       - Searches only in the title field
       - Other fields: author, description, keywords

    3. Boolean operators: "climate AND temperature"
       - AND: Both terms must be present
       - OR: Either term must be present
       - NOT: Exclude items with the term

    4. Combining operators: "climate AND (temperature OR rainfall)"
       - Use parentheses for complex queries

    Search results are returned as a table with all available metadata from the API.
    """

    # NOTE: the class docstring above is written as user-facing Markdown, so
    # developer notes are kept in comments and method docstrings instead.

    # Base URL of the Dataverse installation; a trailing slash is tolerated.
    server_url = knext.StringParameter(
        label="Dataverse server URL",
        description="Base URL of the Dataverse server.",
        default_value="https://dataverse.harvard.edu",
        is_advanced=True,
    )

    # Sent as the ``q`` parameter of the search endpoint.
    query = knext.StringParameter(
        label="Search query",
        description="Search keywords. Examples: climate, title:climate, climate AND temperature",
        default_value="",
    )

    # Sent as the ``type`` parameter unless set to "all".
    search_type = knext.StringParameter(
        label="Search type",
        description="Limit the type of objects to search for.",
        default_value="file",
        enum=["dataverse", "dataset", "file", "all"],
        is_advanced=True,
    )

    # Upper bound on the number of rows in the output table.
    max_results = knext.IntParameter(
        label="Maximum results",
        description="Maximum number of results to return",
        default_value=100,
        min_value=1,
        max_value=1000,
        is_advanced=True,
    )

    # Passed straight to ``requests.get(timeout=...)`` for every page request.
    timeout = knext.IntParameter(
        label="Timeout (seconds)",
        description="Request timeout in seconds.",
        default_value=30,
        min_value=1,
        is_advanced=True,
    )

    def configure(self, configure_context):
        """Return no schema; the output columns depend on the API response."""
        return None

    def execute(self, exec_context: knext.ExecutionContext):
        """Page through ``/api/search`` and return the collected items as a table.

        Pages of 20 items are requested until ``max_results`` items are
        collected or the server returns no more items. If nothing is found the
        table has a single empty ``no_results_found`` column.

        Raises:
            ValueError: If the query is empty, if a request fails (including a
                non-200 response on the first page), or if the response cannot
                be processed. The original error is chained.
        """
        from urllib.parse import urlencode

        import pandas as pd
        import requests

        query = (self.query or "").strip()
        if not query:
            raise ValueError("Search query must be provided")

        base_url = self.server_url.rstrip("/")
        search_url = f"{base_url}/api/search"

        page_size = 20
        params = {"q": query, "per_page": page_size}
        if self.search_type != "all":
            params["type"] = self.search_type

        all_results = []
        start = 0

        try:
            while len(all_results) < self.max_results:
                params["start"] = start

                # Percent-encode the values so characters such as "&", "#" or
                # "=" in the query cannot corrupt the URL. "+" and ":" stay
                # literal so the documented "title:climate+temperature" form
                # keeps its meaning.
                query_params = urlencode(params, safe="+:")
                full_url = f"{search_url}?{query_params}"

                response = requests.get(full_url, timeout=self.timeout)

                if response.status_code != 200:
                    message = (
                        "Dataverse search request failed with HTTP status "
                        f"{response.status_code}"
                    )
                    if not all_results:
                        # A failure on the first page must not look like an
                        # empty result set.
                        raise requests.exceptions.HTTPError(
                            message, response=response
                        )
                    # Later pages: keep what was already retrieved, but tell
                    # the user the result is incomplete.
                    exec_context.set_warning(
                        f"{message}; returning the {len(all_results)} "
                        "results retrieved so far."
                    )
                    break

                data = response.json()
                items = data.get("data", {}).get("items", [])

                if not items:
                    break

                all_results.extend(items)
                start += page_size

            # The last page may overshoot the requested maximum.
            all_results = all_results[: self.max_results]

            results_df = pd.DataFrame(all_results)

            if results_df.empty:
                results_df = pd.DataFrame({"no_results_found": []})

            return knext.Table.from_pandas(results_df)

        except requests.exceptions.RequestException as e:
            raise ValueError(f"Search error: {str(e)}") from e
        except Exception as e:
            raise ValueError(f"Processing error: {str(e)}") from e

0 commit comments

Comments
 (0)