Skip to content
This repository was archived by the owner on Jun 22, 2022. It is now read-only.

Commit ce29b19

Browse files
jakubczakon and Kamil A. Kaczmarek
authored and committed
Dev (#34)
* added some stat features to row-wise aggregations (#30) * Dev bucket aggregations (#32) * added bucket aggregations, to pipelines and configs * added bucket aggs * Dev lgbm monitor (#33) * added lgb monitoring * fixed naming bug * hyperparam update * renamed bucket_aggregations * prepare for release 3
1 parent de0b7e1 commit ce29b19

File tree

10 files changed

+166
-117
lines changed

10 files changed

+166
-117
lines changed

main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import click
2+
23
from src.pipeline_manager import PipelineManager
34

45
pipeline_manager = PipelineManager()

neptune.yaml

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
project: ORGANIZATION/Santander-Value-Prediction-Challenge
22

33
name: Santander-Value-Prediction-Challenge
4-
tags: [solution-2]
4+
tags: [solution-3]
55

66
metric:
7-
channel: 'ROC_AUC'
8-
goal: maximize
7+
channel: 'RMSLE'
8+
goal: minimize
99

1010
exclude:
1111
- output
@@ -22,12 +22,12 @@ parameters:
2222
# Data
2323
train_filepath: YOUR/PATH/TO/train.csv
2424
test_filepath: YOUR/PATH/TO/test.csv
25-
sample_submission_filepath: YOUR/PATH/TO/test.csv/sample_submission.csv
25+
sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
2626
experiment_directory: YOUR/PATH/WORKDIR
2727

2828
# Kaggle
2929
kaggle_api: 0
30-
kaggle_message: 'solution-2'
30+
kaggle_message: 'solution-3'
3131

3232
# Data preparation
3333
n_cv_splits: 5
@@ -44,6 +44,7 @@ parameters:
4444
variance_threshold__threshold: 0.0
4545

4646
# Feature Extraction
47+
row_aggregations__bucket_nrs: "[1, 2]"
4748
truncated_svd__use: False
4849
truncated_svd__n_components: 50
4950
truncated_svd__n_iter: 10
@@ -66,15 +67,15 @@ parameters:
6667
lgbm__objective: rmse
6768
lgbm__metric: rmse
6869
lgbm__number_boosting_rounds: 10000
69-
lgbm__early_stopping_rounds: 100
70-
lgbm__learning_rate: 0.01
71-
lgbm__num_leaves: 180
70+
lgbm__early_stopping_rounds: 1000
71+
lgbm__learning_rate: 0.001
72+
lgbm__num_leaves: 16
7273
lgbm__max_depth: -1
7374
lgbm__min_child_samples: 1
74-
lgbm__max_bin: 255 # at most 255 for device=gpu
75-
lgbm__subsample: 0.5
76-
lgbm__subsample_freq: 4
77-
lgbm__colsample_bytree: 0.5
75+
lgbm__max_bin: 300
76+
lgbm__subsample: 1.0
77+
lgbm__subsample_freq: 1
78+
lgbm__colsample_bytree: 0.1
7879
lgbm__min_child_weight: 10
7980
lgbm__reg_lambda: 0.1
8081
lgbm__reg_alpha: 0.0

neptune_random_search.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
project: ORGANIZATION/Santander-Value-Prediction-Challenge
22

33
name: Santander-Value-Prediction-Challenge
4-
tags: [solution-2]
4+
tags: [solution-3]
55

66
metric:
77
channel: 'RMSLE'
@@ -22,12 +22,12 @@ parameters:
2222
# Data
2323
train_filepath: YOUR/PATH/TO/train.csv
2424
test_filepath: YOUR/PATH/TO/test.csv
25-
sample_submission_filepath: YOUR/PATH/TO/test.csv/sample_submission.csv
25+
sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
2626
experiment_directory: YOUR/PATH/WORKDIR
2727

2828
# Kaggle
2929
kaggle_api: 0
30-
kaggle_message: 'solution-2'
30+
kaggle_message: 'solution-3'
3131

3232
# Data preparation
3333
n_cv_splits: 5
@@ -44,6 +44,7 @@ parameters:
4444
variance_threshold__threshold: 0.0
4545

4646
# Feature Extraction
47+
row_aggregations__bucket_nrs: "[1, 2]"
4748
truncated_svd__use: False
4849
truncated_svd__n_components: 50
4950
truncated_svd__n_iter: 10
@@ -83,4 +84,3 @@ parameters:
8384

8485
# Postprocessing
8586
aggregation_method: mean
86-

src/feature_extraction.py

Lines changed: 71 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import numpy as np
22
import pandas as pd
3-
from scipy.stats import skew, kurtosis
43
import sklearn.decomposition as sk_d
54
import sklearn.random_projection as sk_rp
5+
from scipy.stats import skew, kurtosis
66
from sklearn.externals import joblib
77
from steppy.base import BaseTransformer
88
from steppy.utils import get_logger
@@ -84,34 +84,78 @@ def __init__(self, **kwargs):
8484

8585

8686
class RowAggregationFeatures(BaseTransformer):
87+
def __init__(self, bucket_nr, **kwargs):
88+
super().__init__()
89+
self.bucket_nr = bucket_nr
90+
8791
def transform(self, X, **kwargs):
88-
X_agg = X.apply(aggregate_row, axis=1)
89-
return {'numerical_features': X_agg}
92+
X_aggs = []
93+
for i, column_bucket in enumerate(self._column_bucket_gen(X.columns)):
94+
X_bucket_agg = X[column_bucket].apply(aggregate_row, axis=1)
95+
X_bucket_agg.columns = self._add_prefix(X_bucket_agg.columns, i)
96+
X_aggs.append(X_bucket_agg)
97+
X_aggs = pd.concat(X_aggs, axis=1)
98+
return {'numerical_features': X_aggs}
99+
100+
def _column_bucket_gen(self, cols):
101+
chunk_size = len(cols) // self.bucket_nr + 1
102+
for i in range(0, len(cols), chunk_size):
103+
yield cols[i:i + chunk_size]
104+
105+
def _add_prefix(self, columns, bucket_id):
106+
columns = ['{}_of_{}_{}'.format(self.bucket_nr, bucket_id, col)
107+
for col in columns]
108+
return columns
90109

91110

92111
def aggregate_row(row):
93112
non_zero_values = row.iloc[row.nonzero()]
94-
aggs = {'non_zero_mean': non_zero_values.mean(),
95-
'non_zero_std': non_zero_values.std(),
96-
'non_zero_max': non_zero_values.max(),
97-
'non_zero_min': non_zero_values.min(),
98-
'non_zero_sum': non_zero_values.sum(),
99-
'non_zero_skewness': skew(non_zero_values),
100-
'non_zero_kurtosis': kurtosis(non_zero_values),
101-
'non_zero_median': non_zero_values.median(),
102-
'non_zero_q1': np.percentile(non_zero_values, q=25),
103-
'non_zero_q3': np.percentile(non_zero_values, q=75),
104-
'non_zero_log_mean': np.log1p(non_zero_values).mean(),
105-
'non_zero_log_std': np.log1p(non_zero_values).std(),
106-
'non_zero_log_max': np.log1p(non_zero_values).max(),
107-
'non_zero_log_min': np.log1p(non_zero_values).min(),
108-
'non_zero_log_sum': np.log1p(non_zero_values).sum(),
109-
'non_zero_log_skewness': skew(np.log1p(non_zero_values)),
110-
'non_zero_log_kurtosis': kurtosis(np.log1p(non_zero_values)),
111-
'non_zero_log_median': np.log1p(non_zero_values).median(),
112-
'non_zero_log_q1': np.percentile(np.log1p(non_zero_values), q=25),
113-
'non_zero_log_q3': np.percentile(np.log1p(non_zero_values), q=75),
114-
'non_zero_count': non_zero_values.count(),
115-
'non_zero_fraction': non_zero_values.count() / row.count()
116-
}
117-
return pd.Series(aggs)
113+
if non_zero_values.empty:
114+
aggregations = {'non_zero_mean': np.nan,
115+
'non_zero_std': np.nan,
116+
'non_zero_max': np.nan,
117+
'non_zero_min': np.nan,
118+
'non_zero_sum': np.nan,
119+
'non_zero_skewness': np.nan,
120+
'non_zero_kurtosis': np.nan,
121+
'non_zero_median': np.nan,
122+
'non_zero_q1': np.nan,
123+
'non_zero_q3': np.nan,
124+
'non_zero_log_mean': np.nan,
125+
'non_zero_log_std': np.nan,
126+
'non_zero_log_max': np.nan,
127+
'non_zero_log_min': np.nan,
128+
'non_zero_log_sum': np.nan,
129+
'non_zero_log_skewness': np.nan,
130+
'non_zero_log_kurtosis': np.nan,
131+
'non_zero_log_median': np.nan,
132+
'non_zero_log_q1': np.nan,
133+
'non_zero_log_q3': np.nan,
134+
'non_zero_count': np.nan,
135+
'non_zero_fraction': np.nan
136+
}
137+
else:
138+
aggregations = {'non_zero_mean': non_zero_values.mean(),
139+
'non_zero_std': non_zero_values.std(),
140+
'non_zero_max': non_zero_values.max(),
141+
'non_zero_min': non_zero_values.min(),
142+
'non_zero_sum': non_zero_values.sum(),
143+
'non_zero_skewness': skew(non_zero_values),
144+
'non_zero_kurtosis': kurtosis(non_zero_values),
145+
'non_zero_median': non_zero_values.median(),
146+
'non_zero_q1': np.percentile(non_zero_values, q=25),
147+
'non_zero_q3': np.percentile(non_zero_values, q=75),
148+
'non_zero_log_mean': np.log1p(non_zero_values).mean(),
149+
'non_zero_log_std': np.log1p(non_zero_values).std(),
150+
'non_zero_log_max': np.log1p(non_zero_values).max(),
151+
'non_zero_log_min': np.log1p(non_zero_values).min(),
152+
'non_zero_log_sum': np.log1p(non_zero_values).sum(),
153+
'non_zero_log_skewness': skew(np.log1p(non_zero_values)),
154+
'non_zero_log_kurtosis': kurtosis(np.log1p(non_zero_values)),
155+
'non_zero_log_median': np.log1p(non_zero_values).median(),
156+
'non_zero_log_q1': np.percentile(np.log1p(non_zero_values), q=25),
157+
'non_zero_log_q3': np.percentile(np.log1p(non_zero_values), q=75),
158+
'non_zero_count': non_zero_values.count(),
159+
'non_zero_fraction': non_zero_values.count() / row.count()
160+
}
161+
return pd.Series(aggregations)

src/models.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1+
import lightgbm as lgb
12
import numpy as np
23
import pandas as pd
3-
import lightgbm as lgb
44
from attrdict import AttrDict
5-
from sklearn.externals import joblib
65
from deepsense import neptune
6+
from sklearn.externals import joblib
77
from steppy.base import BaseTransformer
88

99
from .utils import get_logger
@@ -13,13 +13,14 @@
1313

1414

1515
class LightGBM(BaseTransformer):
16-
def __init__(self, **params):
16+
def __init__(self, name=None, **params):
1717
super().__init__()
18-
logger.info('initializing LightGBM...')
18+
self.msg_prefix = 'LightGBM transformer'
19+
logger.info('initializing {}.'.format(self.msg_prefix))
1920
self.params = params
2021
self.training_params = ['number_boosting_rounds', 'early_stopping_rounds']
2122
self.evaluation_function = None
22-
self.callbacks = None
23+
self.callbacks = callbacks(channel_prefix=name)
2324

2425
@property
2526
def model_config(self):
@@ -32,24 +33,22 @@ def training_config(self):
3233
if param in self.training_params})
3334

3435
def fit(self,
35-
X,
36-
y,
37-
X_valid,
38-
y_valid,
36+
X, y,
37+
X_valid, y_valid,
3938
feature_names='auto',
4039
categorical_features='auto',
4140
**kwargs):
4241
evaluation_results = {}
4342

4443
self._check_target_shape_and_type(y, 'y')
4544
self._check_target_shape_and_type(y_valid, 'y_valid')
46-
y = self._format_target(y)
47-
y_valid = self._format_target(y_valid)
45+
y = self._format_target(y, 'y')
46+
y_valid = self._format_target(y_valid, 'y_valid')
4847

49-
logger.info('LightGBM, train data shape {}'.format(X.shape))
50-
logger.info('LightGBM, validation data shape {}'.format(X_valid.shape))
51-
logger.info('LightGBM, train labels shape {}'.format(y.shape))
52-
logger.info('LightGBM, validation labels shape {}'.format(y_valid.shape))
48+
logger.info('{}, train data shape {}'.format(self.msg_prefix, X.shape))
49+
logger.info('{}, validation data shape {}'.format(self.msg_prefix, X_valid.shape))
50+
logger.info('{}, train labels shape {}'.format(self.msg_prefix, y.shape))
51+
logger.info('{}, validation labels shape {}'.format(self.msg_prefix, y_valid.shape))
5352

5453
data_train = lgb.Dataset(data=X,
5554
label=y,
@@ -91,30 +90,36 @@ def persist(self, filepath):
9190
def _check_target_shape_and_type(self, target, name):
9291
if not any([isinstance(target, obj_type) for obj_type in [pd.Series, np.ndarray, list]]):
9392
raise TypeError(
94-
'"{}" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target)))
93+
'{}: "{}" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(
94+
self.msg_prefix,
95+
name,
96+
type(target)))
9597
try:
96-
assert len(target.shape) == 1, '"{}" must be 1-D. It is {}-D instead.'.format(name,
97-
len(target.shape))
98+
assert len(target.shape) == 1, '{}: "{}" must be 1-D. It is {}-D instead.'.format(self.msg_prefix,
99+
name,
100+
len(target.shape))
98101
except AttributeError:
99-
print('Cannot determine shape of the {}. '
100-
'Type must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead'.format(name,
102+
print('{}: cannot determine shape of the {}.'
103+
'Type must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead'.format(self.msg_prefix,
104+
name,
101105
type(target)))
102106

103-
def _format_target(self, target):
104-
107+
def _format_target(self, target, name):
105108
if isinstance(target, pd.Series):
106109
return target.values
107110
elif isinstance(target, np.ndarray):
108111
return target
109112
elif isinstance(target, list):
110113
return np.array(target)
111114
else:
112-
raise TypeError(
113-
'"{}" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(type(target)))
115+
raise TypeError('{}: "{}" must be "numpy.ndarray" or "Pandas.Series" or "list", got {} instead.'.format(
116+
self.msg_prefix,
117+
name,
118+
type(target)))
114119

115120

116-
def callbacks(callback_config):
117-
neptune_monitor = neptune_monitor_lgbm(**callback_config['neptune_monitor'])
121+
def callbacks(channel_prefix):
122+
neptune_monitor = neptune_monitor_lgbm(channel_prefix)
118123
return [neptune_monitor]
119124

120125

0 commit comments

Comments (0)