Skip to content

Commit 75aaba4

Browse files
authored
Support more dbt contract data types (#10) — thanks, @jmkacz!
* Support more dbt contract data types (Snowflake) - Move `column_to_dimension_types` to a constant, at the top of the file, TYPE_MAPPINGS. - Normalize data types coming from dbt contracts by: - Downcasing - Remove extra type detail, contained in parentheses (ex. "timestamp(3)" => "timestamp") - After looking up the type mapping, compare the target type against the short list of supported dimension types in cube. - Support all data types in Snowflake and make a best effort of mapping them. - I assume the bool => boolean mapping comes from a BigQuery data type, so moved that into its own constant. * Support more dbt contract data types (BigQuery) - Since we started BIGQUERY_TYPE_MAPPINGS, let's add the rest of the BigQuery types to the map. - Enhance normalizing data types coming from dbt contracts by: - Remove extra type detail, contained in angle brackets (ex. "array<struct<array<int64>>>" => "array") * Support more dbt contract data types (Redshift) - Since we've added support for Snowflake and BigQuery, let's wrap this up by adding support for data types from the other major data warehouse, Redshift
1 parent b12e727 commit 75aaba4

File tree

2 files changed

+254
-20
lines changed

2 files changed

+254
-20
lines changed

src/cube_dbt/column.py

Lines changed: 203 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,198 @@
1+
import re
2+
13
from cube_dbt.dump import dump
24

5+
# As of 2024-10-17, the valid "Dimension Types" listed on
# https://cube.dev/docs/reference/data-model/types-and-formats#dimension-types
# are: time, string, number, boolean, and geo
VALID_DIMENSION_TYPES = [
    "boolean",
    "geo",
    "number",
    "string",
    "time",
]

# Each mapping translates a warehouse's data type => a Cube dimension type.
# See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
BIGQUERY_TYPE_MAPPINGS = {
    "array": "string",
    "bool": "boolean",
    "bytes": "string",
    "date": "time",
    "datetime": "time",
    "geography": "geo",
    "interval": "string",
    "json": "string",

    # Numeric types
    "int64": "number",
    "int": "number",
    "smallint": "number",
    "integer": "number",
    "bigint": "number",
    "tinyint": "number",
    "byteint": "number",
    "numeric": "number",
    "decimal": "number",
    "bignumeric": "number",
    "bigdecimal": "number",
    "float64": "number",

    "range": "string",
    # "string" needs no mapping; it is already a valid dimension type.
    "struct": "string",
    # "time" needs no mapping; it is already a valid dimension type.
    "timestamp": "time",
}

# See https://docs.snowflake.com/en/sql-reference-data-types
SNOWFLAKE_TYPE_MAPPINGS = {
    # Numeric data types ("number" itself needs no mapping)
    "decimal": "number",
    "dec": "number",
    "numeric": "number",
    "int": "number",
    "integer": "number",
    "bigint": "number",
    "smallint": "number",
    "tinyint": "number",
    "byteint": "number",
    "float": "number",
    "float4": "number",
    "float8": "number",
    "double": "number",
    "double precision": "number",
    "real": "number",

    # String & binary data types ("string" itself needs no mapping)
    "varchar": "string",
    "char": "string",
    "character": "string",
    "nchar": "string",
    "text": "string",
    "nvarchar": "string",
    "nvarchar2": "string",
    "char varying": "string",
    "nchar varying": "string",
    "binary": "string",
    "varbinary": "string",

    # Logical data types: "boolean" needs no mapping.

    # Date & time data types ("time" itself needs no mapping)
    "date": "time",
    "datetime": "time",
    "timestamp": "time",
    "timestamp_ltz": "time",
    "timestamp_ntz": "time",
    "timestamp_tz": "time",

    # Semi-structured data types
    "variant": "string",
    "object": "string",
    "array": "string",

    # Geospatial data types
    "geography": "geo",
    "geometry": "string",

    # Vector data types
    "vector": "string",
}

# See https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html
REDSHIFT_TYPE_MAPPINGS = {
    # Signed two-byte integer
    "smallint": "number",
    "int2": "number",

    # Signed four-byte integer
    "integer": "number",
    "int": "number",
    "int4": "number",

    # Signed eight-byte integer
    "bigint": "number",
    "int8": "number",

    # Exact numeric of selectable precision
    "decimal": "number",
    "numeric": "number",

    # Single precision floating-point number
    "real": "number",
    "float4": "number",

    # Double precision floating-point number
    "double precision": "number",
    "float8": "number",
    "float": "number",

    # Fixed-length character string
    "char": "string",
    "character": "string",
    "nchar": "string",
    "bpchar": "string",

    # Variable-length character string with a user-defined limit
    "varchar": "string",
    "character varying": "string",
    "nvarchar": "string",
    "text": "string",

    # Calendar date (year, month, day)
    "date": "time",

    # Time of day, with and without time zone
    "time": "time",
    "time without time zone": "time",
    "timetz": "time",
    "time with time zone": "time",

    # Date and time, with and without time zone
    "timestamp": "time",
    "timestamp without time zone": "time",
    "timestamptz": "time",
    "timestamp with time zone": "time",

    # Time durations (year-to-month and day-to-second order)
    "interval year to month": "string",
    "interval day to second": "string",

    # Logical Boolean ("boolean" itself needs no mapping)
    "bool": "boolean",

    # Type used with HyperLogLog sketches
    "hllsketch": "string",

    # SUPER: superset type covering all scalar types plus complex
    # types such as ARRAY and STRUCT
    "super": "string",

    # Variable-length binary value
    "varbyte": "string",
    "varbinary": "string",
    "binary varying": "string",

    # Spatial data
    "geometry": "geo",
    "geography": "string",
}

# Combined lookup table used by Column.type.
# NOTE(review): merge order matters — later dicts win on duplicate keys.
# In particular "geometry" resolves to Snowflake's "string" (Redshift's
# "geo" is overridden) and "geography" resolves to Snowflake's "geo"
# (Redshift's "string" is overridden).
TYPE_MAPPINGS = {
    **BIGQUERY_TYPE_MAPPINGS,
    **REDSHIFT_TYPE_MAPPINGS,
    **SNOWFLAKE_TYPE_MAPPINGS,
}
194+
195+
3196
class Column:
4197
def __init__(self, model_name: str, column_dict: dict) -> None:
5198
self._model_name = model_name
@@ -25,29 +218,20 @@ def sql(self) -> str:
25218
def type(self) -> str:
    """
    Map this column's dbt contract `data_type` to a Cube dimension type.

    Columns without a declared `data_type` default to 'string'.  The raw
    type is normalized — downcased, with parenthesized detail such as
    "numeric(38,0)" and angle-bracketed detail such as
    "array<struct<array<int64>>>" stripped — before being looked up in
    TYPE_MAPPINGS.  Types absent from the mapping pass through unchanged.

    Raises:
        RuntimeError: if the resulting type is not one of the valid Cube
            dimension types listed in VALID_DIMENSION_TYPES.
    """
    # Treat both a missing key and an explicit None as "no declared type".
    raw_data_type = self._column_dict.get("data_type")
    if raw_data_type is None:
        return 'string'

    # Normalize the data_type value, downcasing it and removing extra
    # information: "timestamp(3)" => "timestamp", "array<int64>" => "array".
    source_data_type = re.sub(r"<.*>", "", re.sub(r"\([^\)]*\)", "", raw_data_type.lower()))

    # Unmapped types fall through unchanged and are validated below.
    cube_data_type = TYPE_MAPPINGS.get(source_data_type, source_data_type)

    if cube_data_type not in VALID_DIMENSION_TYPES:
        raise RuntimeError(f"Unknown column type of {self._model_name}.{self.name}: {self._column_dict['data_type']}")

    return cube_data_type
234+
51235
@property
def meta(self) -> dict:
    """Return the raw `meta` dict from this column's dbt definition."""
    # NOTE(review): raises KeyError if the column dict has no 'meta' key —
    # presumably dbt always supplies one; confirm against the manifest shape.
    return self._column_dict['meta']
@@ -78,4 +262,4 @@ def as_dimension(self) -> str:
78262
For use in Jinja:
79263
{{ dbt.model('name').column('name').as_dimension() }}
80264
"""
81-
return dump(self._as_dimension(), indent=8)
265+
return dump(self._as_dimension(), indent=8)

tests/test_column.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,56 @@ def test_known_type(self):
4242
column = Column('model', column_dict)
4343
assert column.type == 'number'
4444

45+
def test_known_type_but_uppercase(self):
    """Uppercase dbt data types are downcased before mapping."""
    column = Column('model', {'data_type': 'STRING'})
    assert column.type == 'string'
54+
55+
def test_known_type_but_with_one_extra_info(self):
    """Parenthesized precision like "timestamp(3)" is stripped before mapping."""
    column = Column('model', {'data_type': 'timestamp(3)'})
    assert column.type == 'time'
64+
65+
def test_known_type_but_with_two_extra_info(self):
    """Two-argument precision like "numeric(38,0)" is stripped before mapping."""
    column = Column('model', {'data_type': 'numeric(38,0)'})
    assert column.type == 'number'
74+
75+
def test_known_type_but_with_two_extra_info_of_different_types(self):
    """Mixed parenthesized detail like "VECTOR(FLOAT, 256)" is stripped before mapping."""
    column = Column('model', {'data_type': 'VECTOR(FLOAT, 256)'})
    assert column.type == 'string'
84+
85+
def test_known_bigquery_type_but_with_extra_info(self):
    """Angle-bracketed detail like "ARRAY<STRUCT<ARRAY<INT64>>>" is stripped before mapping."""
    column = Column('model', {'data_type': 'ARRAY<STRUCT<ARRAY<INT64>>>'})
    assert column.type == 'string'
94+
4595
def test_as_dimension(self):
4696
column_dict = {
4797
'name': 'column',
@@ -69,4 +119,4 @@ def test_as_dimension_render(self):
69119
assert column.as_dimension() == """name: column
70120
sql: column
71121
type: number
72-
"""
122+
"""

0 commit comments

Comments
 (0)