This repository was archived by the owner on Jun 22, 2022. It is now read-only.

Commit de31258

Dev (#7)
* init codebase from home-credit with small changes
* initial
* fixed validation score error
* lgbm monitoring added
* fixed callback_config parsing
* updated config
* fixed loss
1 parent 195ee58 commit de31258

File tree

11 files changed: +1055 −0 lines


main.py

Lines changed: 63 additions & 0 deletions
import click
from src.pipeline_manager import PipelineManager

pipeline_manager = PipelineManager()


@click.group()
def main():
    pass


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to be trained', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train(pipeline_name, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to be evaluated', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def evaluate(pipeline_name, dev_mode):
    pipeline_manager.evaluate(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to predict with', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
def predict(pipeline_name, dev_mode, submit_predictions):
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to train, evaluate and predict with', required=True)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train_evaluate_predict(pipeline_name, submit_predictions, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)
    pipeline_manager.evaluate(pipeline_name, dev_mode)
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to train and evaluate', required=True)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def train_evaluate(pipeline_name, dev_mode):
    pipeline_manager.train(pipeline_name, dev_mode)
    pipeline_manager.evaluate(pipeline_name, dev_mode)


@main.command()
@click.option('-p', '--pipeline_name', help='pipeline to evaluate and predict with', required=True)
@click.option('-s', '--submit_predictions', help='submit predictions if true', is_flag=True, required=False)
@click.option('-d', '--dev_mode', help='if true only a small sample of data will be used', is_flag=True, required=False)
def evaluate_predict(pipeline_name, submit_predictions, dev_mode):
    pipeline_manager.evaluate(pipeline_name, dev_mode)
    pipeline_manager.predict(pipeline_name, dev_mode, submit_predictions)


if __name__ == "__main__":
    main()
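
For orientation, the CLI is then driven like this (the pipeline name lgbm is a hypothetical example; the actual registered names live in src/pipeline_manager, which this commit does not include):

    python main.py train -p lgbm -d
    python main.py train_evaluate_predict -p lgbm -s

The -d flag switches to a small data sample for quick iteration; -s submits the produced predictions.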

neptune.yaml

Lines changed: 63 additions & 0 deletions
project: ORGANIZATION/Santander-Value-Prediction-Challenge

name: Santander-Value-Prediction-Challenge
tags: [solution-1]

metric:
  channel: 'RMSLE'
  goal: minimize

exclude:
  - output
  - imgs
  - neptune.log
  - offline_job.log
  - .git
  - .github
  - .idea
  - .ipynb_checkpoints
  - Untitled.ipynb

parameters:
  # Data
  train_filepath: YOUR/PATH/TO/train.csv
  test_filepath: YOUR/PATH/TO/test.csv
  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
  experiment_directory: YOUR/PATH/WORKDIR

  # Kaggle
  kaggle_api: 0
  kaggle_message: 'solution-1'

  # Data preparation
  validation_size: 0.1
  shuffle: 1

  # Execution
  clean_experiment_directory_before_training: 1
  num_workers: 16
  verbose: 1

  # Preprocessing
  fillna_value: -1

  # Light GBM
  lgbm_random_search_runs: 0
  lgbm__device: cpu  # gpu or cpu
  lgbm__boosting_type: gbdt
  lgbm__objective: rmse
  lgbm__metric: rmse
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 100
  lgbm__learning_rate: 0.01
  lgbm__num_leaves: 32
  lgbm__max_depth: 10
  lgbm__min_child_samples: 1
  lgbm__max_bin: 300  # at most 255 for device=gpu
  lgbm__subsample: 0.8
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.8
  lgbm__min_child_weight: 1
  lgbm__reg_lambda: 0
  lgbm__reg_alpha: 0
  lgbm__scale_pos_weight: 1
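
The lgbm__ prefix namespaces LightGBM parameters inside the flat parameters block, alongside pipeline-level settings such as num_workers. A minimal sketch of how such entries could be collected into a LightGBM parameter dict, shown purely to illustrate the naming convention (this is not the repo's actual config parser):

    params = {
        'lgbm__learning_rate': 0.01,
        'lgbm__num_leaves': 32,
        'lgbm__metric': 'rmse',
        'num_workers': 16,  # no lgbm__ prefix, so it stays a pipeline setting
    }
    # keep only the prefixed keys and strip the prefix
    lgbm_params = {key[len('lgbm__'):]: value
                   for key, value in params.items()
                   if key.startswith('lgbm__')}
    assert lgbm_params == {'learning_rate': 0.01, 'num_leaves': 32, 'metric': 'rmse'}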

neptune_random_search.yaml

Lines changed: 63 additions & 0 deletions
project: ORGANIZATION/Santander-Value-Prediction-Challenge

name: Santander-Value-Prediction-Challenge
tags: [solution-1]

metric:
  channel: 'RMSLE'
  goal: minimize

exclude:
  - output
  - imgs
  - neptune.log
  - offline_job.log
  - .git
  - .github
  - .idea
  - .ipynb_checkpoints
  - Untitled.ipynb

parameters:
  # Data
  train_filepath: YOUR/PATH/TO/train.csv
  test_filepath: YOUR/PATH/TO/test.csv
  sample_submission_filepath: YOUR/PATH/TO/sample_submission.csv
  experiment_directory: YOUR/PATH/WORKDIR

  # Kaggle
  kaggle_api: 0
  kaggle_message: 'solution-1'

  # Data preparation
  validation_size: 0.2
  shuffle: 1

  # Execution
  clean_experiment_directory_before_training: 1
  num_workers: 16
  verbose: 1

  # Preprocessing
  fillna_value: -1

  # Light GBM
  lgbm_random_search_runs: 500
  lgbm__device: cpu  # gpu or cpu
  lgbm__boosting_type: gbdt
  lgbm__objective: rmse
  lgbm__metric: rmse
  lgbm__number_boosting_rounds: 10000
  lgbm__early_stopping_rounds: 100
  lgbm__learning_rate: 0.01
  lgbm__num_leaves: '[10, 50]'
  lgbm__max_depth: '[1, 20]'
  lgbm__min_child_samples: '[1, 20]'
  lgbm__max_bin: '[180, 500]'  # at most 255 for device=gpu
  lgbm__subsample: '[0.8, 0.9, 0.99, 0.6, 0.7, "list"]'
  lgbm__subsample_freq: 1
  lgbm__colsample_bytree: 0.8
  lgbm__min_child_weight: '[1, 20]'
  lgbm__reg_lambda: '[0.0, 0.1, "uniform"]'
  lgbm__reg_alpha: '[0.0, 0.1, "uniform"]'
  lgbm__scale_pos_weight: 1
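
The quoted lists encode random-search spaces rather than fixed values: a bare '[low, high]' reads as a numeric range, a trailing "uniform" as continuous uniform sampling, and a trailing "list" as a discrete choice among the listed values. A hedged sampler sketch for that convention follows; the semantics are inferred from the values above, since the actual parser is not part of this commit:

    import ast
    import random

    def sample_param(spec):
        # Plain values (e.g. lgbm__subsample_freq: 1) pass through unchanged.
        if not (isinstance(spec, str) and spec.startswith('[')):
            return spec
        values = ast.literal_eval(spec)
        if values[-1] == 'list':       # discrete choice among listed values
            return random.choice(values[:-1])
        if values[-1] == 'uniform':    # continuous uniform on [low, high]
            return random.uniform(values[0], values[1])
        low, high = values             # bare [low, high] range
        if isinstance(low, int) and isinstance(high, int):
            return random.randint(low, high)
        return random.uniform(low, high)

    sample_param('[10, 50]')               # e.g. 37
    sample_param('[0.0, 0.1, "uniform"]')  # e.g. 0.042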

src/feature_extraction.py

Lines changed: 57 additions & 0 deletions
import numpy as np
import pandas as pd
from steppy.base import BaseTransformer
from steppy.utils import get_logger

logger = get_logger()


class InferredTypeSplitter(BaseTransformer):
    """Splits a dataframe by inferred dtype: float columns are treated as
    numerical features, int columns as categorical features."""

    def transform(self, X, **kwargs):
        numerical_columns, categorical_columns = self._get_column_types(X)

        outputs = {'numerical_features': X[numerical_columns],
                   'categorical_features': X[categorical_columns]
                   }
        return outputs

    def _get_column_types(self, X):
        types = X.dtypes.to_frame().reset_index()
        types.columns = ['colname', 'type']
        types['filter'] = types['type'].apply(self._infer_type)

        categorical_columns = types[types['filter'] == 'categorical']['colname'].tolist()
        numerical_columns = types[types['filter'] == 'numerical']['colname'].tolist()
        return numerical_columns, categorical_columns

    def _infer_type(self, x):
        x_ = str(x)
        if 'float' in x_:
            return 'numerical'
        elif 'int' in x_:
            return 'categorical'
        else:
            # columns of any other dtype (e.g. object) are dropped downstream
            return 'other'


class FeatureJoiner(BaseTransformer):
    """Concatenates numerical and categorical feature frames into a single
    float32 matrix and reports the resulting feature names."""

    def transform(self, numerical_feature_list, categorical_feature_list, **kwargs):
        features = numerical_feature_list + categorical_feature_list
        for feature in features:
            feature.reset_index(drop=True, inplace=True)
        outputs = dict()
        outputs['features'] = pd.concat(features, axis=1).astype(np.float32)
        outputs['feature_names'] = self._get_feature_names(features)
        outputs['categorical_features'] = self._get_feature_names(categorical_feature_list)
        return outputs

    def _get_feature_names(self, dataframes):
        feature_names = []
        for dataframe in dataframes:
            try:
                feature_names.extend(list(dataframe.columns))
            except AttributeError:
                # a pandas Series has no .columns attribute; fall back to its name
                feature_names.append(dataframe.name)
        return feature_names
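
A small usage sketch of the two transformers on a toy frame, assuming a steppy BaseTransformer subclass can be instantiated without arguments and its transform called directly:

    import pandas as pd
    from src.feature_extraction import InferredTypeSplitter, FeatureJoiner

    df = pd.DataFrame({'age_bucket': [1, 2, 3],      # int dtype -> categorical
                       'income': [0.5, 1.2, 0.7]})   # float dtype -> numerical

    split = InferredTypeSplitter().transform(df)
    joined = FeatureJoiner().transform(
        numerical_feature_list=[split['numerical_features']],
        categorical_feature_list=[split['categorical_features']])

    print(joined['feature_names'])         # ['income', 'age_bucket']
    print(joined['categorical_features'])  # ['age_bucket']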
