This repository was archived by the owner on Jun 22, 2022. It is now read-only.

Commit de31258

Dev (#7)
* init codebase from home-credit with small changes
* initial
* fixed validation score error
* lgbm monitoring added
* fixed callback_config parsing
* updated config
* fixed loss
1 parent 195ee58 commit de31258

File tree

11 files changed: +1055 −0 lines


main.py

Lines changed: 63 additions & 0 deletions
import click
from src.pipeline_manager import PipelineManager

pipeline_manager = PipelineManager()


@click.group()
def main():
    pass


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train(pipeline_name, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to be evaluated', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def evaluate(pipeline_name, dev_mode):
    pipeline_manager.evaluate(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to predict with', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
def predict(pipeline_name, dev_mode, submit_predictions):
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to train, evaluate and predict with', required=True)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)
    pipeline_manager.evaluate(pipeline_name, dev_mode)
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to train and evaluate', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train_evaluate(pipeline_name, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)
    pipeline_manager.evaluate(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to evaluate and predict with', required=True)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def evaluate_predict(pipeline_name, submit_predictions, dev_mode):
    pipeline_manager.evaluate(pipeline_name, dev_mode)
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


if __name__ == "__main__":
    main()
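
For orientation, the CLI is then driven like this (the pipeline name lgbm is a hypothetical example; the actual registered names live in src/pipeline_manager, which this commit does not include):

    python main.py train -p lgbm -d
    python main.py train_evaluate_predict -p lgbm -s

The -d flag switches to a small data sample for quick iteration; -s submits the produced predictions.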

neptune.yaml

Lines changed: 63 additions & 0 deletions
project: ORGANIZATION/Santander-Value-Prediction-Challenge

name: Santander-Value-Prediction-Challenge
tags: [solution-1]

metric:
  channel: 'RMSLE'
  goal: minimize

exclude:
  - output
  - imgs
  - neptune.log
  - offline_job.log
  - .git
  - .github
  - .idea
  - .ipynb_checkpoints
  - Untitled.ipynb

parameters:
  # Data
  train_filepath: YOUR/PATH/TO/train.csv
  test_filepath: YOUR/PATH/TO/test.csv
  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
  experiment_directory: YOUR/PATH/WORKDIR

  # Kaggle
  kaggle_api: 0
  kaggle_message: 'solution-1'

  # Data preparation
  validation_size: 0.1
  shuffle: 1

  # Execution
  clean_experiment_directory_before_training: 1
  num_workers: 16
  verbose: 1

  # Preprocessing
  fillna_value: -1

  # Light GBM
  lgbm_random_search_runs: 0
  lgbm__device: cpu  # gpu or cpu
  lgbm__boosting_type: gbdt
  lgbm__objective: rmse
  lgbm__metric: rmse
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 100
  lgbm__learning_rate: 0.01
  lgbm__num_leaves: 32
  lgbm__max_depth: 10
  lgbm__min_child_samples: 1
  lgbm__max_bin: 300  # at most 255 for device=gpu
  lgbm__subsample: 0.8
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.8
  lgbm__min_child_weight: 1
  lgbm__reg_lambda: 0
  lgbm__reg_alpha: 0
  lgbm__scale_pos_weight: 1
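
The lgbm__ prefix namespaces LightGBM parameters inside the flat parameters block, alongside pipeline-level settings such as num_workers. A minimal sketch of how such entries could be collected into a LightGBM parameter dict, shown purely to illustrate the naming convention (this is not the repo's actual config parser):

    params = {
        'lgbm__learning_rate': 0.01,
        'lgbm__num_leaves': 32,
        'lgbm__metric': 'rmse',
        'num_workers': 16,  # no lgbm__ prefix, so it stays a pipeline setting
    }
    # keep only the prefixed keys and strip the prefix
    lgbm_params = {key[len('lgbm__'):]: value
                   for key, value in params.items()
                   if key.startswith('lgbm__')}
    assert lgbm_params == {'learning_rate': 0.01, 'num_leaves': 32, 'metric': 'rmse'}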

neptune_random_search.yaml

Lines changed: 63 additions & 0 deletions
project: ORGANIZATION/Santander-Value-Prediction-Challenge

name: Santander-Value-Prediction-Challenge
tags: [solution-1]

metric:
  channel: 'RMSLE'
  goal: minimize

exclude:
  - output
  - imgs
  - neptune.log
  - offline_job.log
  - .git
  - .github
  - .idea
  - .ipynb_checkpoints
  - Untitled.ipynb

parameters:
  # Data
  train_filepath: YOUR/PATH/TO/train.csv
  test_filepath: YOUR/PATH/TO/test.csv
  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
  experiment_directory: YOUR/PATH/WORKDIR

  # Kaggle
  kaggle_api: 0
  kaggle_message: 'solution-1'

  # Data preparation
  validation_size: 0.2
  shuffle: 1

  # Execution
  clean_experiment_directory_before_training: 1
  num_workers: 16
  verbose: 1

  # Preprocessing
  fillna_value: -1

  # Light GBM
  lgbm_random_search_runs: 500
  lgbm__device: cpu  # gpu or cpu
  lgbm__boosting_type: gbdt
  lgbm__objective: rmse
  lgbm__metric: rmse
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 100
  lgbm__learning_rate: 0.01
  lgbm__num_leaves: '[10, 50]'
  lgbm__max_depth: '[1, 20]'
  lgbm__min_child_samples: '[1, 20]'
  lgbm__max_bin: '[180, 500]'  # at most 255 for device=gpu
  lgbm__subsample: '[0.8, 0.9, 0.99, 0.6, 0.7, "list"]'
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.8
  lgbm__min_child_weight: '[1, 20]'
  lgbm__reg_lambda: '[0.0, 0.1, "uniform"]'
  lgbm__reg_alpha: '[0.0, 0.1, "uniform"]'
  lgbm__scale_pos_weight: 1
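
The quoted lists encode random-search spaces rather than fixed values: a bare '[low, high]' reads as a numeric range, a trailing "uniform" as continuous uniform sampling, and a trailing "list" as a discrete choice among the listed values. A hedged sampler sketch for that convention follows; the semantics are inferred from the values above, since the actual parser is not part of this commit:

    import ast
    import random

    def sample_param(spec):
        # Plain values (e.g. lgbm__subsample_freq: 1) pass through unchanged.
        if not (isinstance(spec, str) and spec.startswith('[')):
            return spec
        values = ast.literal_eval(spec)
        if values[-1] == 'list':       # discrete choice among listed values
            return random.choice(values[:-1])
        if values[-1] == 'uniform':    # continuous uniform on [low, high]
            return random.uniform(values[0], values[1])
        low, high = values             # bare [low, high] range
        if isinstance(low, int) and isinstance(high, int):
            return random.randint(low, high)
        return random.uniform(low, high)

    sample_param('[10, 50]')               # e.g. 37
    sample_param('[0.0, 0.1, "uniform"]')  # e.g. 0.042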

src/feature_extraction.py

Lines changed: 57 additions & 0 deletions
import numpy as np
import pandas as pd
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class InferredTypeSplitter(BaseTransformer):
    """Splits a dataframe by inferred dtype: float columns are treated as
    numerical features, int columns as categorical features."""

    def transform(self, X, **kwargs):
        numerical_columns, categorical_columns = self._get_column_types(X)

        outputs = {'numerical_features': X[numerical_columns],
                   'categorical_features': X[categorical_columns]
                   }
        return outputs

    def _get_column_types(self, X):
        types = X.dtypes.to_frame().reset_index()
        types.columns = ['colname', 'type']
        types['filter'] = types['type'].apply(self._infer_type)

        categorical_columns = types[types['filter'] == 'categorical']['colname'].tolist()
        numerical_columns = types[types['filter'] == 'numerical']['colname'].tolist()
        return numerical_columns, categorical_columns

    def _infer_type(self, x):
        x_ = str(x)
        if 'float' in x_:
            return 'numerical'
        elif 'int' in x_:
            return 'categorical'
        else:
            # columns of any other dtype (e.g. object) are dropped downstream
            return 'other'


class FeatureJoiner(BaseTransformer):
    """Concatenates numerical and categorical feature frames into a single
    float32 matrix and reports the resulting feature names."""

    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
        features = numerical_feature_list + categorical_feature_list
        for feature in features:
            feature.reset_index(drop=True, inplace=True)
        outputs = dict()
        outputs['features'] = pd.concat(features, axis=1).astype(np.float32)
        outputs['feature_names'] = self._get_feature_names(features)
        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
        return outputs

    def _get_feature_names(self, dataframes):
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except AttributeError:
                # a pandas Series has no .columns attribute; fall back to its name
                feature_names.append(dataframe.name)
        return feature_names
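
A small usage sketch of the two transformers on a toy frame, assuming a steppy BaseTransformer subclass can be instantiated without arguments and its transform called directly:

    import pandas as pd
    from src.feature_extraction import InferredTypeSplitter, FeatureJoiner

    df = pd.DataFrame({'age_bucket': [1, 2, 3],      # int dtype -> categorical
                       'income': [0.5, 1.2, 0.7]})   # float dtype -> numerical

    split = InferredTypeSplitter().transform(df)
    joined = FeatureJoiner().transform(
        numerical_feature_list=[split['numerical_features']],
        categorical_feature_list=[split['categorical_features']])

    print(joined['feature_names'])         # ['income', 'age_bucket']
    print(joined['categorical_features'])  # ['age_bucket']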
