From 738f7d152b897cd802e73e33bf1b6762ce19fdcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 31 Oct 2025 12:01:41 +0100 Subject: [PATCH 1/3] docs(bigquery): schema resolver strategy --- .../src/datahub/ingestion/source/bigquery_v2/bigquery.py | 7 +++++++ .../src/datahub/sql_parsing/schema_resolver.py | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index d3b94d3808240f..af586703a7c5c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -219,6 +219,13 @@ def test_connection(config_dict: dict) -> TestConnectionReport: return BigQueryTestConnection.test_connection(config_dict) def _init_schema_resolver(self) -> SchemaResolver: + """ + The initialization of SchemaResolver prefetches all existing urns and schemas in the env/platform/instance. + Because of that, it's important that all classes requiring a SchemaResolver use this instance, as it has an already pre-populated cache. + An alternative strategy would be to do an on-demand resolution of the urns/schemas. + + TODO: prove pre-fetch is a better strategy than on-demand resolution or make this behaviour configurable. 
+ """ schema_resolution_required = ( self.config.use_queries_v2 or self.config.lineage_use_sql_parser ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index eca043ac579222..d8e85f3e50d1dc 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -168,7 +168,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]: self._track_cache_hit() return urn_mixed, schema_info - # Track cache miss for the final attempt + logger.debug( + f"Schema resolution failed for table {table}. Tried URNs: " + f"primary={urn}, lower={urn_lower}, mixed={urn_mixed}" + ) self._track_cache_miss() if self._prefers_urn_lower(): From b169c5906c735e65e596eeb5cfa5dadffafbc7ec Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 24 Nov 2025 08:39:12 +0000 Subject: [PATCH 2/3] Add logging to schema resolver Co-authored-by: sergio.gomez --- metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index d8e85f3e50d1dc..0a2eaed670fab6 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -1,10 +1,13 @@ import contextlib +import logging import pathlib from dataclasses import dataclass from typing import Dict, List, Optional, Protocol, Set, Tuple from typing_extensions import TypedDict +logger = logging.getLogger(__name__) + from datahub.emitter.mce_builder import ( DEFAULT_ENV, make_dataset_urn_with_platform_instance, From f3a3c066a657b214e7e39a6a96077447829feb91 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 24 Nov 2025 08:47:47 +0000 Subject: [PATCH 3/3] Refactor: Move logger initialization in schema_resolver 
Co-authored-by: sergio.gomez --- metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py index 0a2eaed670fab6..2a776ce47e0859 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py +++ b/metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py @@ -6,8 +6,6 @@ from typing_extensions import TypedDict -logger = logging.getLogger(__name__) - from datahub.emitter.mce_builder import ( DEFAULT_ENV, make_dataset_urn_with_platform_instance, @@ -22,6 +20,8 @@ from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path +logger = logging.getLogger(__name__) + # A lightweight table schema: column -> type mapping. SchemaInfo = Dict[str, str]