Commit d37de68

fixing tests
1 parent 4e9cba5 commit d37de68

File tree

6 files changed: +104, -90 lines


dbldatagen/spec/column_spec.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from .compat import BaseModel, root_validator
+
+
+DbldatagenBasicType = Literal[
+    "string",
+    "int",
+    "long",
+    "float",
+    "double",
+    "decimal",
+    "boolean",
+    "date",
+    "timestamp",
+    "short",
+    "byte",
+    "binary",
+    "integer",
+    "bigint",
+    "tinyint",
+]
+class ColumnDefinition(BaseModel):
+    name: str
+    type: DbldatagenBasicType | None = None
+    primary: bool = False
+    options: dict[str, Any] | None = None
+    nullable: bool | None = False
+    omit: bool | None = False
+    baseColumn: str | None = "id"
+    baseColumnType: str | None = "auto"
+
+    @root_validator()
+    def check_model_constraints(cls, values: dict[str, Any]) -> dict[str, Any]:
+        """
+        Validates constraints across the entire model after individual fields are processed.
+        """
+        is_primary = values.get("primary")
+        options = values.get("options") or {}  # Handle None case
+        name = values.get("name")
+        is_nullable = values.get("nullable")
+        column_type = values.get("type")
+
+        if is_primary:
+            if "min" in options or "max" in options:
+                raise ValueError(f"Primary column '{name}' cannot have min/max options.")
+
+            if is_nullable:
+                raise ValueError(f"Primary column '{name}' cannot be nullable.")
+
+            if column_type is None:
+                raise ValueError(f"Primary column '{name}' must have a type defined.")
+        return values
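
As a quick illustration of the relocated model (import path from this diff; a sketch, not repo code): the root_validator runs after field parsing and rejects primary columns that are nullable, untyped, or constrained with min/max.

from dbldatagen.spec.column_spec import ColumnDefinition

# A typed, non-nullable primary key passes validation.
pk = ColumnDefinition(name="id", type="long", primary=True)

# Primary columns may not carry min/max options; under Pydantic V1 the
# ValueError surfaces as a pydantic ValidationError (a ValueError subclass).
try:
    ColumnDefinition(name="id", type="long", primary=True,
                     options={"min": 0, "max": 100})
except ValueError as exc:
    print(exc)  # message mentions: cannot have min/max options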

dbldatagen/spec/compat.py

Lines changed: 4 additions & 5 deletions
@@ -2,13 +2,12 @@

 try:
     # This will succeed on environments with Pydantic V2.x
-    # It imports the V1 API that is bundled within V2.
-    from pydantic.v1 import BaseModel, Field, validator, constr
-
+    from pydantic.v1 import BaseModel, Field, constr, root_validator, validator
 except ImportError:
     # This will be executed on environments with only Pydantic V1.x
-    from pydantic import BaseModel, Field, validator, constr, root_validator, field_validator
+    from pydantic import BaseModel, Field, constr, root_validator, validator # type: ignore[assignment,no-redef]

+__all__ = ["BaseModel", "Field", "constr", "root_validator", "validator"]
 # In your application code, do this:
 # from .compat import BaseModel
 # NOT this:
@@ -28,4 +27,4 @@

 Future-Ready: When you eventually decide to migrate fully to the Pydantic V2 API (to take advantage of its speed and features),
 you only need to change your application code and your compat.py import statements, making the transition much clearer.
-"""
+"""

dbldatagen/spec/generator_spec.py

Lines changed: 30 additions & 73 deletions
@@ -1,103 +1,62 @@
-from .compat import BaseModel, validator, root_validator, field_validator
-from typing import Dict, Optional, Union, Any, Literal, List
+from __future__ import annotations
+
+import logging
+from typing import Any, Literal, Union
+
 import pandas as pd
-from IPython.display import display, HTML
-
-DbldatagenBasicType = Literal[
-    "string",
-    "int",
-    "long",
-    "float",
-    "double",
-    "decimal",
-    "boolean",
-    "date",
-    "timestamp",
-    "short",
-    "byte",
-    "binary",
-    "integer",
-    "bigint",
-    "tinyint",
-]
-
-class ColumnDefinition(BaseModel):
-    name: str
-    type: Optional[DbldatagenBasicType] = None
-    primary: bool = False
-    options: Optional[Dict[str, Any]] = {}
-    nullable: Optional[bool] = False
-    omit: Optional[bool] = False
-    baseColumn: Optional[str] = "id"
-    baseColumnType: Optional[str] = "auto"
-
-    @root_validator(skip_on_failure=True)
-    def check_model_constraints(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Validates constraints across the entire model after individual fields are processed.
-        """
-        is_primary = values.get("primary")
-        options = values.get("options", {})
-        name = values.get("name")
-        is_nullable = values.get("nullable")
-        column_type = values.get("type")
+from IPython.display import HTML, display

-        if is_primary:
-            if "min" in options or "max" in options:
-                raise ValueError(f"Primary column '{name}' cannot have min/max options.")
+from dbldatagen.spec.column_spec import ColumnDefinition

-            if is_nullable:
-                raise ValueError(f"Primary column '{name}' cannot be nullable.")
+from .compat import BaseModel, validator

-            if column_type is None:
-                raise ValueError(f"Primary column '{name}' must have a type defined.")
-        return values

+logger = logging.getLogger(__name__)

 class UCSchemaTarget(BaseModel):
     catalog: str
     schema_: str
     output_format: str = "delta" # Default to delta for UC Schema

-    @field_validator("catalog", "schema_", mode="after")
-    def validate_identifiers(cls, v): # noqa: N805, pylint: disable=no-self-argument
+    @validator("catalog", "schema_")
+    def validate_identifiers(cls, v: str) -> str:
         if not v.strip():
             raise ValueError("Identifier must be non-empty.")
         if not v.isidentifier():
             logger.warning(
                 f"'{v}' is not a basic Python identifier. Ensure validity for Unity Catalog.")
         return v.strip()

-    def __str__(self):
+    def __str__(self) -> str:
         return f"{self.catalog}.{self.schema_} (Format: {self.output_format}, Type: UC Table)"


 class FilePathTarget(BaseModel):
     base_path: str
     output_format: Literal["csv", "parquet"] # No default, must be specified

-    @field_validator("base_path", mode="after")
-    def validate_base_path(cls, v): # noqa: N805, pylint: disable=no-self-argument
+    @validator("base_path")
+    def validate_base_path(cls, v: str) -> str:
         if not v.strip():
             raise ValueError("base_path must be non-empty.")
         return v.strip()

-    def __str__(self):
+    def __str__(self) -> str:
         return f"{self.base_path} (Format: {self.output_format}, Type: File Path)"


 class TableDefinition(BaseModel):
     number_of_rows: int
-    partitions: Optional[int] = None
-    columns: List[ColumnDefinition]
+    partitions: int | None = None
+    columns: list[ColumnDefinition]


 class ValidationResult:
     """Container for validation results with errors and warnings."""

     def __init__(self) -> None:
-        self.errors: List[str] = []
-        self.warnings: List[str] = []
+        self.errors: list[str] = []
+        self.warnings: list[str] = []

     def add_error(self, message: str) -> None:
         """Add an error message."""
@@ -132,16 +91,16 @@ def __str__(self) -> str:
         return "\n".join(lines)

 class DatagenSpec(BaseModel):
-    tables: Dict[str, TableDefinition]
-    output_destination: Optional[Union[UCSchemaTarget, FilePathTarget]] = None # there is a abstraction, may be we can use that? talk to Greg
-    generator_options: Optional[Dict[str, Any]] = {}
-    intended_for_databricks: Optional[bool] = None # May be infered.
+    tables: dict[str, TableDefinition]
+    output_destination: Union[UCSchemaTarget, FilePathTarget] | None = None # there is a abstraction, may be we can use that? talk to Greg
+    generator_options: dict[str, Any] | None = None
+    intended_for_databricks: bool | None = None # May be infered.

     def _check_circular_dependencies(
         self,
         table_name: str,
-        columns: List[ColumnDefinition]
-    ) -> List[str]:
+        columns: list[ColumnDefinition]
+    ) -> list[str]:
         """
         Check for circular dependencies in baseColumn references.
         Returns a list of error messages if circular dependencies are found.
@@ -152,13 +111,13 @@ def _check_circular_dependencies(
         for col in columns:
             if col.baseColumn and col.baseColumn != "id":
                 # Track the dependency chain
-                visited = set()
+                visited: set[str] = set()
                 current = col.name

                 while current:
                     if current in visited:
                         # Found a cycle
-                        cycle_path = " -> ".join(list(visited) + [current])
+                        cycle_path = " -> ".join([*list(visited), current])
                         errors.append(
                             f"Table '{table_name}': Circular dependency detected in column '{col.name}': {cycle_path}"
                         )
@@ -182,7 +141,7 @@ def _check_circular_dependencies(

         return errors

-    def validate(self, strict: bool = True) -> ValidationResult:
+    def validate(self, strict: bool = True) -> ValidationResult: # type: ignore[override]
         """
         Validates the entire DatagenSpec configuration.
         Always runs all validation checks and collects all errors and warnings.
@@ -284,17 +243,15 @@ def validate(self, strict: bool = True) -> ValidationResult:
             "random", "randomSeed", "randomSeedMethod", "verbose",
             "debug", "seedColumnName"
         ]
-        for key in self.generator_options.keys():
+        for key in self.generator_options:
             if key not in known_options:
                 result.add_warning(
                     f"Unknown generator option: '{key}'. "
                     "This may be ignored during generation."
                 )

         # Now that all validations are complete, decide whether to raise
-        if strict and (result.errors or result.warnings):
-            raise ValueError(str(result))
-        elif not strict and result.errors:
+        if (strict and (result.errors or result.warnings)) or (not strict and result.errors):
            raise ValueError(str(result))

         return result
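
A sketch of the collect-then-raise contract consolidated here, assuming a minimal one-table spec that passes the checks not shown in this hunk: validate() gathers all errors and warnings first; strict mode then raises on either, while non-strict mode raises only on errors.

from dbldatagen.spec.column_spec import ColumnDefinition
from dbldatagen.spec.generator_spec import DatagenSpec, FilePathTarget, TableDefinition

spec = DatagenSpec(
    tables={
        "users": TableDefinition(
            number_of_rows=1000,
            columns=[ColumnDefinition(name="id", type="long", primary=True)],
        )
    },
    output_destination=FilePathTarget(base_path="/tmp/users_demo", output_format="parquet"),
    generator_options={"randomSeed": 42, "frobnicate": True},  # "frobnicate" assumed unknown -> warning
)

result = spec.validate(strict=False)  # collects everything; raises only if errors exist
print(result.warnings)                # e.g. the unknown-option warning

spec.validate(strict=True)            # strict mode: the same warning now raises ValueError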

dbldatagen/spec/generator_spec_impl.py

Lines changed: 12 additions & 10 deletions
@@ -1,11 +1,13 @@
 import logging
-from typing import Dict, Union
 import posixpath
+from typing import Any, Union

-from dbldatagen.spec.generator_spec import TableDefinition
 from pyspark.sql import SparkSession
+
 import dbldatagen as dg
-from .generator_spec import DatagenSpec, UCSchemaTarget, FilePathTarget, ColumnDefinition
+from dbldatagen.spec.generator_spec import TableDefinition
+
+from .generator_spec import ColumnDefinition, DatagenSpec, FilePathTarget, UCSchemaTarget


 logging.basicConfig(
@@ -41,7 +43,7 @@ def __init__(self, spark: SparkSession, app_name: str = "DataGen_ClassBased") ->
        self.app_name = app_name
        logger.info("Generator initialized with SparkSession")

-    def _columnspec_to_datagen_columnspec(self, col_def: ColumnDefinition) -> Dict[str, str]:
+    def _columnspec_to_datagen_columnspec(self, col_def: ColumnDefinition) -> dict[str, Any]:
         """
         Convert a ColumnDefinition to dbldatagen column specification.
         Args:
@@ -95,7 +97,7 @@ def _prepare_data_generators(
         self,
         config: DatagenSpec,
         config_source_name: str = "PydanticConfig"
-    ) -> Dict[str, dg.DataGenerator]:
+    ) -> dict[str, dg.DataGenerator]:
         """
         Prepare DataGenerator specifications for each table based on the configuration.
         Args:
@@ -117,10 +119,10 @@ def _prepare_data_generators(
             raise RuntimeError(
                 "SparkSession is not available. Cannot prepare data generators")

-        tables_config: Dict[str, TableDefinition] = config.tables
+        tables_config: dict[str, TableDefinition] = config.tables
         global_gen_options = config.generator_options if config.generator_options else {}

-        prepared_generators: Dict[str, dg.DataGenerator] = {}
+        prepared_generators: dict[str, dg.DataGenerator] = {}
         generation_order = list(tables_config.keys()) # This becomes impotant when we get into multitable

         for table_name in generation_order:
@@ -156,7 +158,7 @@ def _prepare_data_generators(

     def write_prepared_data(
         self,
-        prepared_generators: Dict[str, dg.DataGenerator],
+        prepared_generators: dict[str, dg.DataGenerator],
         output_destination: Union[UCSchemaTarget, FilePathTarget, None],
         config_source_name: str = "PydanticConfig",
     ) -> None:
@@ -188,7 +190,7 @@ def write_prepared_data(
             logger.info(
                 f"Built DataFrame for '{table_name}': {actual_row_count} rows (requested: {requested_rows})")

-            if actual_row_count == 0 and requested_rows > 0:
+            if actual_row_count == 0 and requested_rows is not None and requested_rows > 0:
                 logger.warning(f"Table '{table_name}': Requested {requested_rows} rows but built 0")

             # Write data based on destination type
@@ -251,4 +253,4 @@ def generate_and_write_data(
             logger.error(
                 f"Error during combined data generation and writing: {e}")
             raise RuntimeError(
-                f"Error during combined data generation and writing: {e}") from e
+                f"Error during combined data generation and writing: {e}") from e

makefile

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ clean:

 .venv/bin/python:
	pip install hatch
-	hatch env create test-pydantic.pydantic==1.10.6-v1
+	hatch env create

 dev: .venv/bin/python
	@hatch run which python

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -103,6 +103,7 @@ dependencies = [
     "jmespath>=0.10.0",
     "py4j>=0.10.9",
     "pickleshare>=0.7.5",
+    "ipython>=7.32.0",
 ]

 python="3.10"
@@ -431,7 +432,7 @@ check_untyped_defs = true
 disallow_untyped_decorators = false
 no_implicit_optional = true
 warn_redundant_casts = true
-warn_unused_ignores = true
+warn_unused_ignores = false
 warn_no_return = true
 warn_unreachable = true
 strict_equality = true

0 commit comments
