@@ -21,7 +21,14 @@ class ProblemType(Enum):
2121 FeatureMultilabel = 4
2222
2323class DataManager (object ):
24+ """ Load data from multiple sources and formats"""
25+
2426 def __init__ (self , verbose = 0 ):
27+ """Construct the DataManager
28+
29+ Keyword Arguments:
30+ verbose {bool} -- Whether to print additional output. (default: {0})
31+ """
2532 self .verbose = verbose
2633 self .X_train , self .Y_train = None , None
2734 self .X_test , self .Y_test = None , None
@@ -33,6 +40,16 @@ def __init__(self, verbose=0):
3340 self .categorical_features = None
3441
3542 def read_data (self , file_name , test_split = 0.0 , is_classification = None , random_seed = 0 , ** kwargs ):
43+ """Read the data.
44+
45+ Arguments:
46+ file_name {str} -- The name of the file to load. The reader is chosen from the name (e.g. a ".csv" suffix or an "openml:" prefix).
47+
48+ Keyword Arguments:
49+ test_split {float} -- Amount of data to use as test split (default: {0.0})
50+ is_classification {bool} -- Whether the data is a classification task (default: {None})
51+ random_seed {int} -- A random seed used when splitting the data (default: {0})
52+ """
3653 print ("Read:" + file_name )
3754 reader = self ._get_reader (file_name , is_classification )
3855 reader .read ()
@@ -53,6 +70,18 @@ def read_data(self, file_name, test_split=0.0, is_classification=None, random_se
5370 self ._split_data (test_split , random_seed )
5471
5572 def _get_reader (self , file_name , is_classification ):
73+ """Get the reader associated with the filename.
74+
75+ Arguments:
76+ file_name {str} -- The file to load
77+ is_classification {bool} -- Whether the data is a classification task or not
78+
79+ Raises:
80+ ValueError: The given file type is not supported
81+
82+ Returns:
83+ DataReader -- A reader that is able to read the data type
84+ """
5685 if file_name .endswith (".csv" ):
5786 reader = CSVReader (file_name , is_classification = is_classification )
5887 elif file_name .startswith ("openml:" ):
@@ -65,6 +94,17 @@ def _get_reader(self, file_name, is_classification):
6594 return reader
6695
6796 def generate_classification (self , num_classes , num_features , num_samples , test_split = 0.1 , seed = 0 ):
97+ """Generate a classification task
98+
99+ Arguments:
100+ num_classes {int} -- Number of classes
101+ num_features {int} -- Number of features
102+ num_samples {int} -- Number of samples
103+
104+ Keyword Arguments:
105+ test_split {float} -- Size of test split (default: {0.1})
106+ seed {int} -- A random seed (default: {0})
107+ """
68108 #X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
69109 X , y = make_multilabel_classification (
70110 n_samples = num_samples , n_features = num_features , n_classes = num_classes , n_labels = 0.01 ,
@@ -78,13 +118,29 @@ def generate_classification(self, num_classes, num_features, num_samples, test_s
78118 self ._split_data (test_split , seed )
79119
80120 def generate_regression (self , num_features , num_samples , test_split = 0.1 , seed = 0 ):
121+ """Generate a regression task
122+
123+ Arguments:
124+ num_features {int} -- Number of features
125+ num_samples {int} -- Number of samples
126+
127+ Keyword Arguments:
128+ test_split {float} -- Size of test split (default: {0.1})
129+ seed {int} -- a random seed (default: {0})
130+ """
81131 X , Y = make_regression (n_samples = num_samples , n_features = num_features , random_state = seed )
82132 self .categorical_features = [False ] * num_features
83133 self .problem_type = ProblemType .FeatureRegression
84134 self .X , self .Y = X , Y
85135 self ._split_data (test_split , seed )
86136
87137 def _split_data (self , test_split , seed ):
138+ """Split the data into training and test (and, optionally, validation) sets.
139+
140+ Arguments:
141+ test_split {float} -- Amount of data to use as test split
142+ seed {int} -- A random seed
143+ """
88144 valid_specified = self .X_valid is not None and self .Y_valid is not None
89145 test_specified = self .X_test is not None and self .Y_test is not None
90146
@@ -101,6 +157,17 @@ def _split_data(self, test_split, seed):
101157 self .Y_train = self .Y
102158
103159def deterministic_shuffle_and_split (X , Y , split , seed ):
160+ """Split the data deterministically given the seed
161+
162+ Arguments:
163+ X {array} -- The feature data
164+ Y {array} -- The targets
165+ split {float} -- Fraction of the samples to place in the second split
166+ seed {int} -- A random seed
167+
168+ Returns:
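169+ tuple -- (X, Y, X_first, Y_first, X_second, Y_second): the full data followed by the two splits; the second split is (None, None) when no split is made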
170+ """
104171 rng = np .random .RandomState (seed )
105172 p = rng .permutation (X .shape [0 ])
106173
@@ -110,4 +177,4 @@ def deterministic_shuffle_and_split(X, Y, split, seed):
110177 split = int (split * X .shape [0 ])
111178 return X , Y , X [0 :- split ], Y [0 :- split ], X [- split :], Y [- split :]
112179 else :
113- return X , Y , X , Y , None , None
180+ return X , Y , X , Y , None , None
0 commit comments