scikit-learn-contrib
diff --git a/‎doc/api.rst‎
Lines changed: 1 addition & 0 deletions b/‎doc/api.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/over_sampling.rst‎
Lines changed: 45 additions & 2 deletions b/‎doc/over_sampling.rst‎
Lines changed: 45 additions & 2 deletions
diff --git a/‎doc/whats_new/v0.0.4.rst‎
Lines changed: 6 additions & 0 deletions b/‎doc/whats_new/v0.0.4.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/over-sampling/plot_comparison_over_sampling.py‎
Lines changed: 26 additions & 1 deletion b/‎examples/over-sampling/plot_comparison_over_sampling.py‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎imblearn/over_sampling/__init__.py‎
Lines changed: 2 additions & 1 deletion b/‎imblearn/over_sampling/__init__.py‎
Lines changed: 2 additions & 1 deletion
@@ -72,6 +72,7 @@ Prototype selection
    over_sampling.ADASYN
    over_sampling.RandomOverSampler
    over_sampling.SMOTE
+   over_sampling.SMOTENC
 
 
 .. _combine_ref:
 
@@ -160,6 +160,44 @@ some variant of the SMOTE algorithm::
   >>> print(sorted(Counter(y_resampled).items()))
   [(0, 4674), (1, 4674), (2, 4674)]
 
+When dealing with mixed data types such as continuous and categorical features,
+none of the presented methods (apart from the class :class:`RandomOverSampler`)
+can deal with the categorical features. The :class:`SMOTENC` [CBHK2002]_ is an
+extension of the :class:`SMOTE` algorithm for which categorical data are
+treated differently::
+
+  >>> # create a synthetic data set with continuous and categorical features
+  >>> rng = np.random.RandomState(42)
+  >>> n_samples = 50
+  >>> X = np.empty((n_samples, 3), dtype=object)
+  >>> X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
+  >>> X[:, 1] = rng.randn(n_samples)
+  >>> X[:, 2] = rng.randint(3, size=n_samples)
+  >>> y = np.array([0] * 20 + [1] * 30)
+  >>> print(sorted(Counter(y).items()))
+  [(0, 20), (1, 30)]
+
+In this data set, the first and last features are considered as categorical
+features. One needs to provide this information to :class:`SMOTENC` via the
+parameter ``categorical_features`` either by passing the indices of these
+features or a boolean mask marking these features::
+
+  >>> from imblearn.over_sampling import SMOTENC
+  >>> smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
+  >>> X_resampled, y_resampled = smote_nc.fit_resample(X, y)
+  >>> print(sorted(Counter(y_resampled).items()))
+  [(0, 30), (1, 30)]
+  >>> print(X_resampled[-5:])
+  [['B' 0.1989993778979113 0]
+   ['A' -0.3657680728116921 1]
+   ['B' 0.8790828729585258 0]
+   ['A' 0.3710891618824609 0]
+   ['A' 0.3327240726719727 0]]
+
+Therefore, it can be seen that the samples generated in the first and last
+columns belong to the same categories originally presented, without any
+extra interpolation.
+
 .. topic:: References
 
   .. [HWB2005] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new
@@ -198,8 +236,13 @@ interpolation will create a sample on the line between :math:`x_{i}` and
    :scale: 60
    :align: center
 
-Each SMOTE variant and ADASYN differ from each other by selecting the samples
-:math:`x_i` ahead of generating the new samples.
+SMOTE-NC slightly changes the way a new sample is generated by performing
+something specific for the categorical features. In fact, the categories of a
+newly generated sample are decided by picking the most frequent category of the
+nearest neighbors present during the generation.
+
+The other SMOTE variants and ADASYN differ from each other by selecting the
+samples :math:`x_i` ahead of generating the new samples.
 
 The **regular** SMOTE algorithm --- cf. to the :class:`SMOTE` object --- does not
 impose any rule and will randomly pick-up all possible :math:`x_i` available.
 
@@ -41,6 +41,12 @@ New features
   under-sampling stage before each boosting iteration of AdaBoost.
   :issue:`469` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- Add :class:`imblearn.over_sampling.SMOTENC` which generates synthetic samples
+  on data sets with heterogeneous data types (continuous and categorical
+  features).
+  :issue:`412` by :user:`Denis Dudnik <ddudnik>` and
+  :user:`Guillaume Lemaitre <glemaitre>`.
+
 Enhancement
 ...........
 
 
@@ -21,7 +21,7 @@
 
 from imblearn.pipeline import make_pipeline
 from imblearn.over_sampling import ADASYN
-from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE
+from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC
 from imblearn.over_sampling import RandomOverSampler
 from imblearn.base import BaseSampler
 
@@ -226,4 +226,29 @@ def _fit_resample(self, X, y):
     ax[1].set_title('Resampling using {}'.format(sampler.__class__.__name__))
 fig.tight_layout()
 
+###############################################################################
+# When dealing with a mix of continuous and categorical features, SMOTE-NC
+# is the only method which can handle this case.
+
+# create a synthetic data set with continuous and categorical features
+rng = np.random.RandomState(42)
+n_samples = 50
+X = np.empty((n_samples, 3), dtype=object)
+X[:, 0] = rng.choice(['A', 'B', 'C'], size=n_samples).astype(object)
+X[:, 1] = rng.randn(n_samples)
+X[:, 2] = rng.randint(3, size=n_samples)
+y = np.array([0] * 20 + [1] * 30)
+
+print('The original imbalanced dataset')
+print(sorted(Counter(y).items()))
+print('The first and last columns are containing categorical features:')
+print(X[:5])
+
+smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
+X_resampled, y_resampled = smote_nc.fit_resample(X, y)
+print('Dataset after resampling:')
+print(sorted(Counter(y_resampled).items()))
+print('SMOTE-NC will generate categories for the categorical features:')
+print(X_resampled[-5:])
+
 plt.show()
@@ -8,6 +8,7 @@
 from ._smote import SMOTE
 from ._smote import BorderlineSMOTE
 from ._smote import SVMSMOTE
+from ._smote import SMOTENC
 
 __all__ = ['ADASYN', 'RandomOverSampler',
-           'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE']
+           'SMOTE', 'BorderlineSMOTE', 'SVMSMOTE', 'SMOTENC']