Skip to content

Commit 0e0df37

Browse files
Merge pull request #7 from frederikhoengaard/ingestion-improvement
Improve ingestion process
2 parents c0e7ed5 + 35acb80 commit 0e0df37

File tree

24 files changed

+399
-24
lines changed

24 files changed

+399
-24
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "lazylearn"
7-
version = "0.0.1"
7+
version = "0.0.2"
88
authors = [
99
{ name="Frederik P. Høngaard", email="mail@frederikhoengaard.com" },
1010
]

python/src/lazylearn/ingestion/ingestion_pipeline_steps/data_parser_step.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import pandas as pd
12
from errors.errors import DataSourceError
3+
from ingestion.utils.csv import csv_check
24
from pandas import DataFrame
35
from pipeline.pipeline import IngestionPipeline, PipelineStep
46

@@ -17,5 +19,8 @@ def apply(self, pipeline: IngestionPipeline):
1719

1820
if isinstance(pipeline.raw_data, DataFrame):
1921
pipeline.df = pipeline.raw_data
22+
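# check if raw data is a path to a csv file and read it into a DataFrame
# NOTE(review): the check below is passed pipeline.df, while read_csv uses pipeline.raw_data - confirm which one csv_check should receive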
23+
elif csv_check(pipeline.df):
24+
pipeline.df = pd.read_csv(pipeline.raw_data)
2025
else:
2126
raise DataSourceError

python/src/lazylearn/ingestion/ingestion_pipeline_steps/interpreter_step.py

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1+
import numpy as np
12
import pandas as pd
2-
from pandas import Series
3+
from pandas import DataFrame, Series
34
from pipeline.pipeline import IngestionPipeline
45
from tqdm import tqdm
56

67

78
class ColumnTypeInterpreter:
9+
def __init__(self):
    """
    Create an interpreter with no frame attached yet.
    """
    # datetime_check and id_check read the frame through self.df, so the
    # attribute has to exist on every instance
    self.df: DataFrame = None
11+
812
def apply(self, pipeline: IngestionPipeline):
913
"""
1014
This method is responsible for inferring the
@@ -23,6 +27,10 @@ def apply(self, pipeline: IngestionPipeline):
2327
) # noqa
2428

2529
pipeline.column_type_map = column_types
30+
if "unknown" in pipeline.column_type_map.values():
31+
pipeline.needs_type_map = True
32+
33+
pipeline.type_collections = self.build_type_collections(column_types)
2634

2735
def analyze_column(self, column: Series):
2836
"""
@@ -33,16 +41,22 @@ def analyze_column(self, column: Series):
3341
values = column.tolist()
3442
types = [type(value) for value in values]
3543

36-
if self.categorical_test(values):
37-
return "categorical"
44+
column_type = None
3845

46+
if self.categorical_test(values):
47+
column_type = "categorical"
48+
elif self.numeric_test(types) and self.id_check(types, values):
49+
column_type = "id"
3950
elif self.numeric_test(types):
40-
return "numeric"
51+
column_type = "numeric"
4152

42-
elif self.datetime_check(column):
43-
return "datetime"
44-
else:
45-
return "object"
53+
if self.datetime_check(column) and not self.numeric_test(types):
54+
column_type = "datetime"
55+
56+
if column_type is None:
57+
column_type = "unknown"
58+
59+
return column_type
4660

4761
@staticmethod
4862
def categorical_test(values: list):
@@ -72,15 +86,66 @@ def numeric_test(types: list):
7286
:param types: list of type objects
7387
:return: True if column is numeric, False otherwise
7488
"""
75-
return all([item == float or item == int for item in set(types)])
89+
return all(
90+
[
91+
item == float or item == int
92+
for item in set(types)
93+
if item is not None # noqa
94+
]
95+
)
7696

7797
@staticmethod
7898
def string_test(types: set):
7999
raise NotImplementedError
80100

81101
def datetime_check(self, column: Series):
    """
    Decide whether a column of ``self.df`` holds datetime values.

    A column qualifies when its dtype is already a datetime dtype
    (timezone-naive or timezone-aware), or when its name contains
    "date" or "time" and pandas can parse its values. In the second
    case the parsed values replace the column in ``self.df``.

    :param column: column to inspect; only its name is used, the
        values are read from ``self.df``
    :return: True if the column is a datetime column, False otherwise
    """
    col_name = str(column.name)
    stored = self.df[col_name]

    # dtype-level test: also accepts tz-aware columns, whose dtype.type
    # is Timestamp rather than np.datetime64
    if pd.api.types.is_datetime64_any_dtype(stored):
        return True

    # if date or time is in column name and can be cast as date
    lowered = col_name.lower()
    if "date" in lowered or "time" in lowered:
        try:
            parsed = pd.to_datetime(stored)
        except (ValueError, TypeError, OverflowError):
            # values are not parseable as dates: leave the column as is
            return False
        self.df[col_name] = parsed
        return True

    # TODO: detect columns whose values merely look like dates
    return False
124+
125+
def id_check(self, types, values):
    """
    Test whether a column looks like a row identifier.

    :param types: type objects observed in the column; None entries
        are ignored
    :param values: the column's values
    :return: True if every observed type is int and the number of
        distinct values equals ``len(self.df)``, False otherwise
    """
    observed = {kind for kind in types if kind is not None}
    if any(kind != int for kind in observed):
        return False

    # one distinct value per row of the frame
    return len(set(values)) == len(self.df)
139+
140+
@staticmethod
141+
def build_type_collections(column_type_map):
142+
collections = {}
143+
144+
for data_type in ["datetime", "numeric", "categorical"]:
145+
collections[data_type] = [
146+
col
147+
for col in column_type_map
148+
if column_type_map[col] == data_type # noqa
149+
]
150+
151+
return collections

python/src/lazylearn/ingestion/utils/__init__.py

Whitespace-only changes.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
def csv_check(path):
    """
    Tell whether ``path`` looks like a path to a csv file.

    Only the shape of the argument is examined: it must be a string or
    an ``os.PathLike`` object whose text ends in ``.csv`` (any case).
    The file is not opened and its existence is not checked.

    :param path: candidate data source, of any type
    :return: True if ``path`` is a csv path, False otherwise
    """
    import os  # local import: this module has no import block of its own

    if not isinstance(path, (str, os.PathLike)):
        return False

    as_text = os.fspath(path)
    if isinstance(as_text, bytes):
        # a PathLike object may hand back bytes
        as_text = os.fsdecode(as_text)
    return as_text.lower().endswith(".csv")

python/src/lazylearn/lazylearn.py

Lines changed: 54 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,67 @@
11
from ingestion.ingestion_pipeline import Ingestion
2+
from model_selection.splitters import test_train_splitter
3+
from preprocessing.time.date_processor import date_processor
4+
from preprocessing.time.duration import duration_builder
5+
from regression.models.randomforest.randomforest import ( # noqa
6+
RandomForestRegressionRunner,
7+
)
8+
from sklearn.metrics import mean_absolute_error
29

310

411
class LazyLearner:
    """
    Entry point that ingests a dataset, prepares it and fits a model.

    Typical use is ``create_project`` followed by ``run_autopilot``.
    """

    def __init__(self, random_state=None):
        """
        :param random_state: seed forwarded to the train/test split and
            to the model runner
        """
        self.dataset = None
        self.task = None
        self.models = None
        self.leaderboard = None
        self.random_state = random_state
        self.target = None

    def create_project(self, data, target, task="infer"):
        """
        Ingest ``data`` and prepare it for modelling.

        :param data: raw data handed to the ingestion pipeline
        :param target: name of the target column
        :param task: "infer" to derive the task from the target's
            inferred column type; any other value is stored as the task
        """
        # ingest data
        self.target = target
        self.dataset = Ingestion().run(data)

        if task == "infer":
            # if target is numeric then regression, else classification
            if self.dataset.column_type_map[target] == "numeric":
                self.task = "regression"
            else:
                self.task = "classification"
        else:
            # an explicitly requested task must not be dropped
            self.task = task

        # process dates
        self.dataset = date_processor(self.dataset)
        self.dataset = duration_builder(self.dataset)

        # split partitions
        self.dataset = test_train_splitter(
            self.dataset, random_state=self.random_state
        )

        # set modelling configurations

    def run_autopilot(self):
        """
        Fit a random forest regressor and score it on the test partition.

        TODO: Everything here must be abstracted away into strategies
        TODO: such that several models are run and their scores are added to
        TODO: the leaderboard

        :return: the fitted runner; its pipeline's ``holdout_score``
            holds the mean absolute error on the test partition
        """
        simple_random_forest = RandomForestRegressionRunner(
            target=self.target,
            dataset=self.dataset,
            random_state=self.random_state,
        )
        simple_random_forest.fit()

        # get holdout scores
        holdout = self.dataset.partitions["test"]
        simple_random_forest.predict(holdout)
        simple_random_forest.pipeline.holdout_score = mean_absolute_error(
            holdout[self.target],
            simple_random_forest.pipeline.tmp_pred,
        )
        return simple_random_forest

python/src/lazylearn/model_selection/__init__.py

Whitespace-only changes.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from models.models import Dataset
2+
from sklearn.model_selection import train_test_split
3+
4+
5+
def test_train_splitter(
    dataset: Dataset, random_state=None, test_size=0.2
) -> Dataset:
    """
    Split ``dataset.df`` into a train and a test partition.

    The two parts are stored under ``dataset.partitions["train"]`` and
    ``dataset.partitions["test"]``; the dataset is modified in place.

    :param dataset: dataset whose frame is split
    :param random_state: seed passed to ``train_test_split``
    :param test_size: size of the test partition, passed to
        ``train_test_split``; defaults to the previously fixed 0.2
    :return: the same dataset object, with both partitions set
    """
    train_partition, test_partition = train_test_split(
        dataset.df, test_size=test_size, random_state=random_state
    )

    dataset.partitions["train"] = train_partition
    dataset.partitions["test"] = test_partition

    return dataset
14+
15+
16+
def cv_splitter(dataset: Dataset) -> Dataset:
    """
    Placeholder for a cross-validation split.

    :param dataset: dataset to partition
    :return: ``dataset`` itself, unchanged
    """
    return dataset

python/src/lazylearn/models/models.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,20 @@
22

33

44
class Dataset:
5-
def __init__(self, df: DataFrame, column_type_map: dict):
5+
def __init__(
6+
self,
7+
df: DataFrame,
8+
column_type_map: dict,
9+
summary_stats: dict,
10+
type_collections: dict,
11+
):
612
self.name = None
713
self.description = None
814
self.df = df
915
self.column_type_map = column_type_map
16+
self.summary_stats = summary_stats
17+
self.type_collections = type_collections
18+
self.partitions: dict = {}
1019

1120
def save(self):
1221
raise NotImplementedError

python/src/lazylearn/pipeline/pipeline.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from typing import List
22

33
from models.models import Dataset
4-
from pandas import DataFrame
4+
from pandas import DataFrame, Series
55

66

77
class Pipeline:
@@ -21,6 +21,12 @@ class PipelineStep:
2121
def apply(self, pipeline: Pipeline):
    """
    Run this step against ``pipeline``; does nothing in this base class.

    :param pipeline: pipeline the step operates on
    """
    return None
2323

24+
def fit(self, pipeline: Pipeline):
    """
    Fit this step on ``pipeline``; does nothing in this base class.

    :param pipeline: pipeline the step operates on
    """
    return None
26+
27+
def predict(self, pipeline: Pipeline):
    """
    Run this step's prediction on ``pipeline``; does nothing in this
    base class.

    :param pipeline: pipeline the step operates on
    """
    return None
29+
2430

2531
class IngestionPipeline(Pipeline):
2632
def __init__(self):
@@ -29,6 +35,42 @@ def __init__(self):
2935
self.df: DataFrame = None
3036
self.column_type_map: dict = None
3137
self.summary_stats: dict = {}
38+
self.needs_type_map: bool = False
39+
self.type_collections: dict = None
3240

3341
def response(self):
    """
    Package the pipeline's results as a Dataset.

    :return: Dataset built from this pipeline's frame, column type
        map, summary statistics and type collections
    """
    results = {
        "df": self.df,
        "column_type_map": self.column_type_map,
        "summary_stats": self.summary_stats,
        "type_collections": self.type_collections,
    }
    return Dataset(**results)
48+
49+
50+
class ModelPipeline(Pipeline):
    """
    Pipeline whose steps are fitted first and then used to predict.
    """

    def __init__(self):
        super().__init__()
        self._is_fitted = False  # set by fit(), checked by predict()
        self.feature_list: list = []
        self.tmp_test = None
        self.tmp_pred = None  # returned by predict()
        self.target = None

    def fit(self):
        """
        Call ``fit`` on every step in order and mark the pipeline fitted.
        """
        # self._steps is not set in this class; presumably provided by
        # Pipeline - TODO confirm
        for step in self._steps:
            step.fit(self)
        self._is_fitted = True

    def predict(self):
        """
        Call ``predict`` on every step in order.

        :return: the value of ``self.tmp_pred`` after all steps ran
        :raises AssertionError: if the pipeline has not been fitted
        """
        # explicit raise keeps the guard active when Python runs with -O
        if not self._is_fitted:
            raise AssertionError("predict() called before fit()")
        for step in self._steps:
            step.predict(self)
        return self.tmp_pred
67+
68+
69+
class RegressionPipeline(ModelPipeline):
    """
    Model pipeline carrying the state of a regression run.
    """

    def __init__(self):
        super().__init__()

        # training partition
        self.train_features_df: DataFrame = None
        self.train_targets: Series = None

        # holdout partition and the score obtained on it
        self.holdout_features_df: DataFrame = None
        self.holdout_targets: Series = None
        self.holdout_score: float = None

0 commit comments

Comments
 (0)