 from sklearn.utils import _approximate_mode, check_random_state
 from sklearn.utils.validation import _num_samples, check_array

+from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX
 from autoPyTorch.data.base_target_validator import SupportedTargetTypes
 from autoPyTorch.utils.common import ispandas

@@ -459,8 +460,8 @@ def _subsample_by_indices(
     return X, y


-def megabytes(arr: DatasetCompressionInputType) -> float:
-
+def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float:
+    memory_in_bytes: float
     if isinstance(arr, np.ndarray):
         memory_in_bytes = arr.nbytes
     elif issparse(arr):
@@ -470,19 +471,57 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
     else:
         raise ValueError(f"Unrecognised data type of X, expected data type to "
                          f"be in (np.ndarray, spmatrix, pd.DataFrame) but got: {type(arr)}")
+    return memory_in_bytes
+
+
+def get_approximate_mem_usage_in_mb(
+    arr: DatasetCompressionInputType,
+    categorical_columns: List,
+    n_categories_per_cat_column: Optional[List[int]] = None
+) -> float:
+
+    err_msg = "The number of categories per categorical column is required when the data has categorical columns"
+    if ispandas(arr):
+        arr_dtypes = arr.dtypes.to_dict()
+        multipliers = [dtype.itemsize for col, dtype in arr_dtypes.items() if col not in categorical_columns]
+        if len(categorical_columns) > 0:
+            if n_categories_per_cat_column is None:
+                raise ValueError(err_msg)
+            for col, num_cat in zip(categorical_columns, n_categories_per_cat_column):
+                if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX:
+                    multipliers.append(num_cat * arr_dtypes[col].itemsize)
+                else:
+                    multipliers.append(arr_dtypes[col].itemsize)
+        size_one_row = sum(multipliers)
+
+    elif isinstance(arr, (np.ndarray, spmatrix)):
+        n_cols = arr.shape[-1] - len(categorical_columns)
+        multiplier = arr.dtype.itemsize
+        if len(categorical_columns) > 0:
+            if n_categories_per_cat_column is None:
+                raise ValueError(err_msg)
+            # Multiply the number of categories by the column's item size to capture the memory used after one-hot encoding
+            n_cols += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column)
+        size_one_row = n_cols * multiplier
+    else:
+        raise ValueError(f"Unrecognised data type of X, expected data type to "
+                         f"be in (np.ndarray, spmatrix, pd.DataFrame), but got: {type(arr)}")

-    return float(memory_in_bytes / (2 ** 20))
+    return float(arr.shape[0] * size_one_row / (2 ** 20))


 def reduce_dataset_size_if_too_large(
     X: DatasetCompressionInputType,
     memory_allocation: Union[int, float],
     is_classification: bool,
     random_state: Union[int, np.random.RandomState],
+    categorical_columns: List,
+    n_categories_per_cat_column: Optional[List[int]] = None,
     y: Optional[SupportedTargetTypes] = None,
     methods: List[str] = ['precision', 'subsample'],
 ) -> DatasetCompressionInputType:
-    f""" Reduces the size of the dataset if it's too close to the memory limit.
+    f"""
+    Reduces the size of the dataset if it's too close to the memory limit.

     Follows the order of the operations passed in and retains the type of its
     input.
@@ -513,7 +552,6 @@ def reduce_dataset_size_if_too_large(
                 Reduce the amount of samples of the dataset such that it fits into the allocated
                 memory. Ensures stratification and that unique labels are present

-
         memory_allocation (Union[int, float]):
             The amount of memory to allocate to the dataset. It should specify an
             absolute amount.
@@ -524,7 +562,7 @@ def reduce_dataset_size_if_too_large(
     """

     for method in methods:
-        if megabytes(X) <= memory_allocation:
+        if get_approximate_mem_usage_in_mb(X, categorical_columns, n_categories_per_cat_column) <= memory_allocation:
             break

         if method == 'precision':
@@ -540,7 +578,8 @@ def reduce_dataset_size_if_too_large(
             # into the allocated memory, we subsample it so that it does

             n_samples_before = X.shape[0]
-            sample_percentage = memory_allocation / megabytes(X)
+            sample_percentage = memory_allocation / get_approximate_mem_usage_in_mb(
+                X, categorical_columns, n_categories_per_cat_column)

             # NOTE: type ignore
             #
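
For illustration, the sketch below works through the new per-row estimate on a toy frame with one float64 column and one integer-coded categorical column of three categories. The module path `autoPyTorch.data.utils` and the assumption that `MIN_CATEGORIES_FOR_EMBEDDING_MAX` is larger than 3 are not stated in this diff; they are assumptions made only for the example.

# Illustrative sketch only: the import path and toy values are assumptions,
# not part of this commit.
import numpy as np
import pandas as pd

from autoPyTorch.data.utils import (  # assumed location of the helpers in this diff
    get_approximate_mem_usage_in_mb,
    get_raw_memory_usage,
)

X = pd.DataFrame({
    "price": np.random.rand(1000),                                    # float64 -> 8 bytes per row
    "colour": np.random.randint(0, 3, size=1000).astype(np.int64),    # int64-coded categorical, 3 categories
})

# Raw footprint: get_raw_memory_usage returns bytes, converted to MB here.
raw_mb = get_raw_memory_usage(X) / (2 ** 20)

# Approximate footprint: assuming 3 < MIN_CATEGORIES_FOR_EMBEDDING_MAX, the "colour"
# column is counted as if one-hot encoded (3 * 8 bytes per row), so one row is
# estimated at 8 + 24 = 32 bytes and the frame at about 1000 * 32 / 2**20 ≈ 0.03 MB.
approx_mb = get_approximate_mem_usage_in_mb(
    X,
    categorical_columns=["colour"],
    n_categories_per_cat_column=[3],
)

Per the change above, reduce_dataset_size_if_too_large now compares this approximate figure, rather than the raw in-memory size, against memory_allocation before deciding whether to reduce precision or subsample.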