2 changes: 1 addition & 1 deletion src/pytorch_tabular/__init__.py
@@ -2,7 +2,7 @@

__author__ = """Manu Joseph"""
__email__ = "manujosephv@gmail.com"
__version__ = "1.1.0"
__version__ = "1.1.1"

from . import models, ssl_models
from .categorical_encoders import CategoricalEmbeddingTransformer
2 changes: 1 addition & 1 deletion src/pytorch_tabular/ssl_models/common/noise_generators.py
@@ -18,7 +18,7 @@ class SwapNoiseCorrupter(nn.Module):

def __init__(self, probas):
super().__init__()
self.probas = torch.from_numpy(np.array(probas))
self.probas = torch.from_numpy(np.array(probas, dtype=np.float32))

def forward(self, x):
should_swap = torch.bernoulli(self.probas.to(x.device) * torch.ones(x.shape).to(x.device))
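
The explicit `dtype=np.float32` matters because `torch.from_numpy` preserves the NumPy dtype: a plain Python list of probabilities becomes a float64 array and therefore a double-precision tensor, while PyTorch defaults to float32 elsewhere. A minimal standalone sketch of the difference (not part of this diff):

```python
import numpy as np
import torch

probas = [0.1, 0.5, 0.9]

# Without an explicit dtype, np.array promotes Python floats to float64,
# and torch.from_numpy keeps that dtype -> a torch.float64 buffer.
print(torch.from_numpy(np.array(probas)).dtype)                    # torch.float64

# With the explicit cast, the buffer matches PyTorch's default float32.
print(torch.from_numpy(np.array(probas, dtype=np.float32)).dtype)  # torch.float32
```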
12 changes: 5 additions & 7 deletions src/pytorch_tabular/tabular_datamodule.py
@@ -67,7 +67,7 @@ def __init__(
if isinstance(target, str):
self.y = self.y.reshape(-1, 1) # .astype(np.int64)
else:
self.y = np.zeros((self.n, 1)) # .astype(np.int64)
self.y = np.zeros((self.n, 1), dtype=np.float32) # .astype(np.int64)

if task == "classification":
self.y = self.y.astype(np.int64)
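
Same idea for the placeholder target: `np.zeros` defaults to float64, so the added `dtype=np.float32` keeps the dummy `y` single precision, and it is still cast to int64 just below for classification. A quick illustration, independent of the library code:

```python
import numpy as np

n = 4
y_default = np.zeros((n, 1))                   # dtype float64 (NumPy default)
y_single = np.zeros((n, 1), dtype=np.float32)  # dtype float32, matching torch defaults
print(y_default.dtype, y_single.dtype)         # float64 float32
print(y_single.astype(np.int64).dtype)         # int64, as done for classification
```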
@@ -502,7 +502,7 @@ def _cache_dataset(self):

def split_train_val(self, train):
logger.debug(
"No validation data provided." f" Using {self.config.validation_split*100}% of train data as validation"
f"No validation data provided. Using {self.config.validation_split * 100}% of train data as validation"
)
val_idx = train.sample(
int(self.config.validation_split * len(train)),
@@ -753,18 +753,16 @@ def _load_dataset_from_cache(self, tag: str = "train"):
try:
dataset = getattr(self, f"_{tag}_dataset")
except AttributeError:
raise AttributeError(
f"{tag}_dataset not found in memory. Please provide the data for" f" {tag} dataloader"
)
raise AttributeError(f"{tag}_dataset not found in memory. Please provide the data for {tag} dataloader")
elif self.cache_mode is self.CACHE_MODES.DISK:
try:
dataset = torch.load(self.cache_dir / f"{tag}_dataset")
except FileNotFoundError:
raise FileNotFoundError(
f"{tag}_dataset not found in {self.cache_dir}. Please provide the" f" data for {tag} dataloader"
f"{tag}_dataset not found in {self.cache_dir}. Please provide the data for {tag} dataloader"
)
elif self.cache_mode is self.CACHE_MODES.INFERENCE:
raise RuntimeError("Cannot load dataset in inference mode. Use" " `prepare_inference_dataloader` instead")
raise RuntimeError("Cannot load dataset in inference mode. Use `prepare_inference_dataloader` instead")
else:
raise ValueError(f"{self.cache_mode} is not a valid cache mode")
return dataset
90 changes: 40 additions & 50 deletions src/pytorch_tabular/tabular_model.py
@@ -26,9 +26,7 @@
from pandas import DataFrame
from pytorch_lightning import seed_everything
from pytorch_lightning.callbacks import RichProgressBar
from pytorch_lightning.callbacks.gradient_accumulation_scheduler import (
GradientAccumulationScheduler,
)
from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler
from pytorch_lightning.tuner.tuning import Tuner
from pytorch_lightning.utilities.model_summary import summarize
from pytorch_lightning.utilities.rank_zero import rank_zero_only
@@ -48,11 +46,7 @@
)
from pytorch_tabular.config.config import InferredConfig
from pytorch_tabular.models.base_model import BaseModel, _CaptumModel, _GenericModel
from pytorch_tabular.models.common.layers.embeddings import (
Embedding1dLayer,
Embedding2dLayer,
PreEncoded1dLayer,
)
from pytorch_tabular.models.common.layers.embeddings import Embedding1dLayer, Embedding2dLayer, PreEncoded1dLayer
from pytorch_tabular.tabular_datamodule import TabularDatamodule
from pytorch_tabular.utils import (
OOMException,
@@ -139,9 +133,9 @@ def __init__(
trainer_config = self._read_parse_config(trainer_config, TrainerConfig)
optimizer_config = self._read_parse_config(optimizer_config, OptimizerConfig)
if model_config.task != "ssl":
assert data_config.target is not None, (
"`target` in data_config should not be None for" f" {model_config.task} task"
)
assert (
data_config.target is not None
), f"`target` in data_config should not be None for {model_config.task} task"
if experiment_config is None:
if self.verbose:
logger.info("Experiment Tracking is turned off")
@@ -284,9 +278,7 @@ def _setup_experiment_tracking(self):
offline=False,
)
else:
raise NotImplementedError(
f"{self.config.log_target} is not implemented. Try one of [wandb," " tensorboard]"
)
raise NotImplementedError(f"{self.config.log_target} is not implemented. Try one of [wandb, tensorboard]")

def _prepare_callbacks(self, callbacks=None) -> List:
"""Prepares the necesary callbacks to the Trainer based on the configuration.
@@ -374,11 +366,9 @@ def _check_and_set_target_transform(self, target_transform):
elif isinstance(target_transform, TransformerMixin):
pass
else:
raise ValueError(
"`target_transform` should either be an sklearn Transformer or a" " tuple of callables."
)
raise ValueError("`target_transform` should either be an sklearn Transformer or a tuple of callables.")
if self.config.task == "classification" and target_transform is not None:
logger.warning("For classification task, target transform is not used. Ignoring the" " parameter")
logger.warning("For classification task, target transform is not used. Ignoring the parameter")
target_transform = None
return target_transform

@@ -674,6 +664,8 @@ def train(
self.model.reset_weights()
# Parameters in models need to be initialized again after LR find
self.model.data_aware_initialization(self.datamodule)
# Update the Trainer to use the suggested LR
self._prepare_for_training(self.model, self.datamodule, callbacks, max_epochs, min_epochs)
self.model.train()
if self.verbose:
logger.info("Training Started")
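
The two added lines re-run trainer preparation after the learning-rate search so that the suggested LR is actually picked up by a fresh Trainer before fitting; `_prepare_for_training` is internal to `TabularModel`. A rough, self-contained sketch of the same idea in plain PyTorch Lightning (the tiny model and data here are placeholders, not part of this diff):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from pytorch_lightning.tuner.tuning import Tuner

class TinyRegressor(pl.LightningModule):
    def __init__(self, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters()
        self.net = torch.nn.Linear(8, 1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        return torch.nn.functional.mse_loss(self.net(x), y)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

x, y = torch.randn(256, 8), torch.randn(256, 1)
loader = DataLoader(TensorDataset(x, y), batch_size=32)

model = TinyRegressor()
trainer = pl.Trainer(max_epochs=2, logger=False, enable_checkpointing=False)
lr_finder = Tuner(trainer).lr_find(model, train_dataloaders=loader)

# Re-apply the suggestion and build a fresh Trainer before fitting,
# mirroring the extra _prepare_for_training call added above.
model.hparams.learning_rate = lr_finder.suggestion() or model.hparams.learning_rate
trainer = pl.Trainer(max_epochs=2, logger=False, enable_checkpointing=False)
trainer.fit(model, train_dataloaders=loader)
```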
@@ -771,9 +763,9 @@ def fit(
pl.Trainer: The PyTorch Lightning Trainer instance

"""
assert self.config.task != "ssl", (
"`fit` is not valid for SSL task. Please use `pretrain` for" " semi-supervised learning"
)
assert (
self.config.task != "ssl"
), "`fit` is not valid for SSL task. Please use `pretrain` for semi-supervised learning"
if metrics is not None:
assert len(metrics) == len(
metrics_prob_inputs or []
@@ -854,9 +846,9 @@ def pretrain(
pl.Trainer: The PyTorch Lightning Trainer instance

"""
assert self.config.task == "ssl", (
f"`pretrain` is not valid for {self.config.task} task. Please use `fit`" " instead."
)
assert (
self.config.task == "ssl"
), f"`pretrain` is not valid for {self.config.task} task. Please use `fit` instead."
seed = seed or self.config.seed
if seed:
seed_everything(seed)
@@ -1001,7 +993,7 @@ def create_finetune_model(
if self.track_experiment:
# Renaming the experiment run so that a different log is created for finetuning
if self.verbose:
logger.info("Renaming the experiment run for finetuning as" f" {config['run_name'] + '_finetuned'}")
logger.info(f"Renaming the experiment run for finetuning as {config['run_name'] + '_finetuned'}")
config["run_name"] = config["run_name"] + "_finetuned"

config_override = {"target": target} if target is not None else {}
@@ -1105,9 +1097,9 @@ def finetune(
pl.Trainer: The trainer object

"""
assert self._is_finetune_model, (
"finetune() can only be called on a finetune model created using" " `TabularModel.create_finetune_model()`"
)
assert (
self._is_finetune_model
), "finetune() can only be called on a finetune model created using `TabularModel.create_finetune_model()`"
seed_everything(self.config.seed)
if freeze_backbone:
for param in self.model.backbone.parameters():
@@ -1294,15 +1286,15 @@ def _format_predicitons(
)
if is_probabilistic:
for j, q in enumerate(quantiles):
col_ = f"{target_col}_q{int(q*100)}"
col_ = f"{target_col}_q{int(q * 100)}"
pred_df[col_] = self.datamodule.target_transforms[i].inverse_transform(
quantile_predictions[:, j, i].reshape(-1, 1)
)
else:
pred_df[f"{target_col}_prediction"] = point_predictions[:, i]
if is_probabilistic:
for j, q in enumerate(quantiles):
pred_df[f"{target_col}_q{int(q*100)}"] = quantile_predictions[:, j, i].reshape(-1, 1)
pred_df[f"{target_col}_q{int(q * 100)}"] = quantile_predictions[:, j, i].reshape(-1, 1)

elif self.config.task == "classification":
start_index = 0
@@ -1483,7 +1475,7 @@ def predict(
"min",
"max",
"hard_voting",
], "aggregate should be one of 'mean', 'median', 'min', 'max', or" " 'hard_voting'"
], "aggregate should be one of 'mean', 'median', 'min', 'max', or 'hard_voting'"
if self.config.task == "regression":
assert aggregate_tta != "hard_voting", "hard_voting is only available for classification"

@@ -1538,11 +1530,9 @@ def load_best_model(self) -> None:
ckpt = pl_load(ckpt_path, map_location=lambda storage, loc: storage)
self.model.load_state_dict(ckpt["state_dict"])
else:
logger.warning("No best model available to load. Did you run it more than 1" " epoch?...")
logger.warning("No best model available to load. Did you run it more than 1 epoch?...")
else:
logger.warning(
"No best model available to load. Checkpoint Callback needs to be" " enabled for this to work"
)
logger.warning("No best model available to load. Checkpoint Callback needs to be enabled for this to work")

def save_datamodule(self, dir: str, inference_only: bool = False) -> None:
"""Saves the datamodule in the specified directory.
@@ -1707,7 +1697,7 @@ def ret_summary(self, model=None, max_depth: int = -1) -> str:
summary_str += "Config\n"
summary_str += "-" * 100 + "\n"
summary_str += pformat(self.config.__dict__["_content"], indent=4, width=80, compact=True)
summary_str += "\nFull Model Summary once model has been " "initialized or passed in as an argument"
summary_str += "\nFull Model Summary once model has been initialized or passed in as an argument"
return summary_str

def __str__(self) -> str:
@@ -1936,9 +1926,7 @@ def _prepare_baselines_captum(
else:
baselines = baselines.mean(dim=0, keepdim=True)
else:
raise ValueError(
"Invalid value for `baselines`. Please refer to the documentation" " for more details."
)
raise ValueError("Invalid value for `baselines`. Please refer to the documentation for more details.")
return baselines

def _handle_categorical_embeddings_attributions(
Expand Down Expand Up @@ -2061,9 +2049,7 @@ def explain(
hasattr(self.model.hparams, "embedding_dims") and self.model.hparams.embedding_dims is not None
)
if (not is_embedding1d) and (not is_embedding2d):
raise NotImplementedError(
"Attributions are not implemented for models with this type of" " embedding layer"
)
raise NotImplementedError("Attributions are not implemented for models with this type of embedding layer")
test_dl = self.datamodule.prepare_inference_dataloader(data)
self.model.eval()
# prepare import for Captum
@@ -2095,7 +2081,7 @@ def explain(
"Something went wrong. The number of features in the attributions"
f" ({attributions.shape[1]}) does not match the number of features in"
" the model"
f" ({self.model.hparams.continuous_dim+self.model.hparams.categorical_dim})"
f" ({self.model.hparams.continuous_dim + self.model.hparams.categorical_dim})"
)
return pd.DataFrame(
attributions.detach().cpu().numpy(),
@@ -2215,7 +2201,7 @@ def cross_validate(
oof_preds = []
for fold, (train_idx, val_idx) in it:
if verbose:
logger.info(f"Running Fold {fold+1}/{cv.get_n_splits()}")
logger.info(f"Running Fold {fold + 1}/{cv.get_n_splits()}")
# train_fold = train.iloc[train_idx]
# val_fold = train.iloc[val_idx]
if reset_datamodule:
@@ -2247,7 +2233,7 @@ def cross_validate(
result = self.evaluate(train.iloc[val_idx], verbose=False)
cv_metrics.append(result[0][metric])
if verbose:
logger.info(f"Fold {fold+1}/{cv.get_n_splits()} score: {cv_metrics[-1]}")
logger.info(f"Fold {fold + 1}/{cv.get_n_splits()} score: {cv_metrics[-1]}")
self.model.reset_weights()
return cv_metrics, oof_preds

@@ -2375,9 +2361,13 @@ def bagging_predict(
"regression",
], "Bagging is only available for classification and regression"
if not callable(aggregate):
assert aggregate in ["mean", "median", "min", "max", "hard_voting"], (
"aggregate should be one of 'mean', 'median', 'min', 'max', or" " 'hard_voting'"
)
assert aggregate in [
"mean",
"median",
"min",
"max",
"hard_voting",
], "aggregate should be one of 'mean', 'median', 'min', 'max', or 'hard_voting'"
if self.config.task == "regression":
assert aggregate != "hard_voting", "hard_voting is only available for classification"
cv = self._check_cv(cv)
@@ -2387,7 +2377,7 @@
model = None
for fold, (train_idx, val_idx) in enumerate(cv.split(train, y=train[self.config.target], groups=groups)):
if verbose:
logger.info(f"Running Fold {fold+1}/{cv.get_n_splits()}")
logger.info(f"Running Fold {fold + 1}/{cv.get_n_splits()}")
train_fold = train.iloc[train_idx]
val_fold = train.iloc[val_idx]
if reset_datamodule:
@@ -2412,7 +2402,7 @@
elif self.config.task == "regression":
pred_prob_l.append(fold_preds.values)
if verbose:
logger.info(f"Fold {fold+1}/{cv.get_n_splits()} prediction done")
logger.info(f"Fold {fold + 1}/{cv.get_n_splits()} prediction done")
self.model.reset_weights()
pred_df = self._combine_predictions(pred_prob_l, pred_idx, aggregate, weights)
if return_raw_predictions: