Skip to content

Commit 5aaa3f3

Browse files
committed
Handle missing values in metadata creation for categorical variables.
Add the key 'accepts_missing' to every variable in the metadata. By default, BaseModel._validate accepts missing values when 'accepts_missing' is not defined in a variable's metadata.
1 parent 8d9bde5 commit 5aaa3f3

File tree

2 files changed

+12
-10
lines changed

2 files changed

+12
-10
lines changed

src/model/base.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def __init__(self, name):
5656
self._id = int(self)
5757

5858
def __repr__(self):
59-
return "Task('{}')".format(self._name)
59+
return "Task('{}')".format(self.name)
6060

6161

6262
class BaseModel(object):
@@ -157,31 +157,32 @@ def _validate(self, input):
157157
name, var_type = feature['name'], feature['type']
158158
default = feature.get('default', None)
159159
categories = feature.get('categories', None)
160+
accepts_missing = feature.get('accepts_missing', True)
160161
if name not in df.columns:
161162
df[name] = default or np.nan
162163
else:
164+
has_missing = df[name].isnull().any()
165+
if has_missing and not accepts_missing:
166+
raise ValueError(f'Feature {name} has unexpected missing values')
163167
if var_type == 'numeric':
164168
var_type = float
165169
elif var_type == 'string':
166170
var_type = str
167171
elif var_type == 'category':
168172
if categories is not None:
169173
var_type = CategoricalDtype(categories=categories, ordered=True)
170-
new_cat = set(df[name].unique()).difference(categories)
174+
new_cat = set(df[name].dropna().unique()).difference(categories)
171175
if len(new_cat):
172-
msg = 'Unexpected categorical value for {}: {}'.format(name, new_cat)
173-
raise ValueError(msg)
176+
raise ValueError(f'Unexpected categorical value for {name}: {new_cat}')
174177
else:
175-
msg = 'Missing "categories" for "{}" in metadata'.format(name)
176-
raise ValueError(msg)
178+
raise ValueError(f'Missing "categories" for "{name}" in metadata')
177179
else:
178-
msg = 'Unknown variable type: {}'.format(var_type)
179-
raise ValueError(msg)
180+
raise ValueError(f'Unknown variable type: {var_type}')
180181

181182
if default is None:
182-
df[name] = df[name].astype(var_type)
183+
df[name] = df[name].astype(var_type)
183184
else:
184-
df[name] = df[name].fillna(default).astype(var_type)
185+
df[name] = df[name].fillna(default).astype(var_type)
185186
# TO DO: add more validation logic
186187
return df
187188

src/utils/helper_functions.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@ def metadata_from_dataframe(df):
2828
tmp = {'name': c, 'type': 'string'}
2929
else:
3030
raise ValueError('Unknown type for {}'.format(c))
31+
tmp['accepts_missing'] = df[c].isnull().any()
3132
metadata.append(tmp)
3233
return metadata

0 commit comments

Comments
 (0)