From cd8c91715e0d5c5a933e0eec9965536759d8ecde Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 23 Oct 2019 00:37:34 +0200 Subject: [PATCH 001/124] added eager executing tf2 training with nb noise --- batchglm/train/tf2/__init__.py | 3 + batchglm/train/tf2/base/__init__.py | 3 + batchglm/train/tf2/base/estimator.py | 32 ++ batchglm/train/tf2/base/external.py | 5 + batchglm/train/tf2/base/model.py | 57 ++ batchglm/train/tf2/base/optim.py | 52 ++ batchglm/train/tf2/base_glm/README.md | 2 + batchglm/train/tf2/base_glm/__init__.py | 10 + batchglm/train/tf2/base_glm/estimator.py | 485 ++++++++++++++++ batchglm/train/tf2/base_glm/external.py | 9 + batchglm/train/tf2/base_glm/layers.py | 268 +++++++++ .../train/tf2/base_glm/layers_gradients.py | 450 +++++++++++++++ batchglm/train/tf2/base_glm/model.py | 226 ++++++++ batchglm/train/tf2/base_glm/optim.py | 535 ++++++++++++++++++ batchglm/train/tf2/base_glm/processModel.py | 9 + .../train/tf2/base_glm/training_strategies.py | 111 ++++ batchglm/train/tf2/base_glm/vars.py | 86 +++ batchglm/train/tf2/glm_beta/__init__.py | 5 + batchglm/train/tf2/glm_beta/estimator.py | 239 ++++++++ batchglm/train/tf2/glm_beta/external.py | 12 + batchglm/train/tf2/glm_beta/layers.py | 53 ++ .../train/tf2/glm_beta/layers_gradients.py | 144 +++++ batchglm/train/tf2/glm_beta/model.py | 44 ++ batchglm/train/tf2/glm_beta/processModel.py | 45 ++ batchglm/train/tf2/glm_beta/vars.py | 8 + batchglm/train/tf2/glm_nb/__init__.py | 5 + batchglm/train/tf2/glm_nb/estimator.py | 266 +++++++++ batchglm/train/tf2/glm_nb/external.py | 18 + batchglm/train/tf2/glm_nb/layers.py | 59 ++ batchglm/train/tf2/glm_nb/layers_gradients.py | 144 +++++ batchglm/train/tf2/glm_nb/model.py | 43 ++ batchglm/train/tf2/glm_nb/processModel.py | 42 ++ batchglm/train/tf2/glm_nb/vars.py | 8 + batchglm/train/tf2/glm_norm/__init__.py | 5 + batchglm/train/tf2/glm_norm/estimator.py | 284 ++++++++++ batchglm/train/tf2/glm_norm/external.py | 12 + batchglm/train/tf2/glm_norm/layers.py | 49 ++ .../train/tf2/glm_norm/layers_gradients.py | 116 ++++ batchglm/train/tf2/glm_norm/model.py | 55 ++ batchglm/train/tf2/glm_norm/processModel.py | 42 ++ batchglm/train/tf2/glm_norm/vars.py | 8 + batchglm/train/tf2/ops.py | 59 ++ 42 files changed, 4108 insertions(+) create mode 100644 batchglm/train/tf2/__init__.py create mode 100644 batchglm/train/tf2/base/__init__.py create mode 100644 batchglm/train/tf2/base/estimator.py create mode 100644 batchglm/train/tf2/base/external.py create mode 100644 batchglm/train/tf2/base/model.py create mode 100644 batchglm/train/tf2/base/optim.py create mode 100644 batchglm/train/tf2/base_glm/README.md create mode 100644 batchglm/train/tf2/base_glm/__init__.py create mode 100644 batchglm/train/tf2/base_glm/estimator.py create mode 100644 batchglm/train/tf2/base_glm/external.py create mode 100644 batchglm/train/tf2/base_glm/layers.py create mode 100644 batchglm/train/tf2/base_glm/layers_gradients.py create mode 100644 batchglm/train/tf2/base_glm/model.py create mode 100644 batchglm/train/tf2/base_glm/optim.py create mode 100644 batchglm/train/tf2/base_glm/processModel.py create mode 100644 batchglm/train/tf2/base_glm/training_strategies.py create mode 100644 batchglm/train/tf2/base_glm/vars.py create mode 100644 batchglm/train/tf2/glm_beta/__init__.py create mode 100644 batchglm/train/tf2/glm_beta/estimator.py create mode 100644 batchglm/train/tf2/glm_beta/external.py create mode 100644 batchglm/train/tf2/glm_beta/layers.py create mode 100644 batchglm/train/tf2/glm_beta/layers_gradients.py create 
mode 100644 batchglm/train/tf2/glm_beta/model.py create mode 100644 batchglm/train/tf2/glm_beta/processModel.py create mode 100644 batchglm/train/tf2/glm_beta/vars.py create mode 100644 batchglm/train/tf2/glm_nb/__init__.py create mode 100644 batchglm/train/tf2/glm_nb/estimator.py create mode 100644 batchglm/train/tf2/glm_nb/external.py create mode 100644 batchglm/train/tf2/glm_nb/layers.py create mode 100644 batchglm/train/tf2/glm_nb/layers_gradients.py create mode 100644 batchglm/train/tf2/glm_nb/model.py create mode 100644 batchglm/train/tf2/glm_nb/processModel.py create mode 100644 batchglm/train/tf2/glm_nb/vars.py create mode 100644 batchglm/train/tf2/glm_norm/__init__.py create mode 100644 batchglm/train/tf2/glm_norm/estimator.py create mode 100644 batchglm/train/tf2/glm_norm/external.py create mode 100644 batchglm/train/tf2/glm_norm/layers.py create mode 100644 batchglm/train/tf2/glm_norm/layers_gradients.py create mode 100644 batchglm/train/tf2/glm_norm/model.py create mode 100644 batchglm/train/tf2/glm_norm/processModel.py create mode 100644 batchglm/train/tf2/glm_norm/vars.py create mode 100644 batchglm/train/tf2/ops.py diff --git a/batchglm/train/tf2/__init__.py b/batchglm/train/tf2/__init__.py new file mode 100644 index 00000000..9170f2ff --- /dev/null +++ b/batchglm/train/tf2/__init__.py @@ -0,0 +1,3 @@ +from . import glm_nb as nb +from . import glm_norm as norm +from . import glm_beta as beta \ No newline at end of file diff --git a/batchglm/train/tf2/base/__init__.py b/batchglm/train/tf2/base/__init__.py new file mode 100644 index 00000000..9b75ab32 --- /dev/null +++ b/batchglm/train/tf2/base/__init__.py @@ -0,0 +1,3 @@ +from .estimator import TFEstimator +from .model import ProcessModelBase, ModelBase, LossBase +from .optim import OptimizerBase diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py new file mode 100644 index 00000000..15fc0906 --- /dev/null +++ b/batchglm/train/tf2/base/estimator.py @@ -0,0 +1,32 @@ +from .external import pkg_constants, TrainingStrategies +from .model import ModelBase, LossBase + +import numpy as np +import tensorflow as tf + + +class TFEstimator: + model: ModelBase + loss: LossBase + + def __init__(self, input_data, dtype): + + self._input_data = input_data + self.dtype = dtype + + def _train( + self, + batched_model: bool, + batch_size: int, + optimizer_object: tf.keras.optimizers.Optimizer, + optimizer_enum: TrainingStrategies, + convergence_criteria: str, + stopping_criteria: int, + autograd: bool, + featurewise: bool, + benchmark: bool + ): + pass + + def fetch_fn(self, idx): + pass diff --git a/batchglm/train/tf2/base/external.py b/batchglm/train/tf2/base/external.py new file mode 100644 index 00000000..08784cca --- /dev/null +++ b/batchglm/train/tf2/base/external.py @@ -0,0 +1,5 @@ +#from batchglm.models.base import _Estimator_Base +#from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet +from batchglm.train.tf2.base_glm.training_strategies import TrainingStrategies +#import batchglm.utils.stats as stat_utils +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base/model.py b/batchglm/train/tf2/base/model.py new file mode 100644 index 00000000..acce4dee --- /dev/null +++ b/batchglm/train/tf2/base/model.py @@ -0,0 +1,57 @@ +import abc +import logging +import tensorflow as tf +import numpy as np + +logger = logging.getLogger(__name__) + + +class ModelBase(tf.keras.Model, metaclass=abc.ABCMeta): + + def __init__(self): + super(ModelBase, self).__init__() + + 
@abc.abstractmethod + def call(self, inputs, training=False, mask=None): + pass + + +class LossBase(tf.keras.losses.Loss, metaclass=abc.ABCMeta): + + def __init__(self): + super(LossBase, self).__init__() + + @abc.abstractmethod + def call(self, y_true, y_pred): + pass + + +class ProcessModelBase: + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def tf_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return tf.clip_by_value( + param, + bounds_min[name], + bounds_max[name] + ) + + def np_clip_param( + self, + param, + name + ): + bounds_min, bounds_max = self.param_bounds(param.dtype) + return np.clip( + param, + bounds_min[name], + bounds_max[name] + ) diff --git a/batchglm/train/tf2/base/optim.py b/batchglm/train/tf2/base/optim.py new file mode 100644 index 00000000..5fc8d13b --- /dev/null +++ b/batchglm/train/tf2/base/optim.py @@ -0,0 +1,52 @@ +import abc +import logging +import tensorflow as tf + +logger = logging.getLogger("batchglm") + + +class OptimizerBase(tf.keras.optimizers.Optimizer, metaclass=abc.ABCMeta): + + def __init__(self, name): + super(OptimizerBase, self).__init__(name=name) + + @abc.abstractmethod + def _resource_apply_dense(self, grad, handle): + pass + + @abc.abstractmethod + def _resource_apply_sparse(self, grad, handle, apply_state): + pass + + @abc.abstractmethod + def _create_slots(self): + pass + + """ + @property + @abc.abstractmethod + def vars(self): + pass + + @property + @abc.abstractmethod + def gradients(self): + return None + + @property + @abc.abstractmethod + def hessians(self): + pass + + @property + @abc.abstractmethod + def fims(self): + pass + + @abc.abstractmethod + def step(self, learning_rate): + pass + """ + @abc.abstractmethod + def get_config(self): + pass diff --git a/batchglm/train/tf2/base_glm/README.md b/batchglm/train/tf2/base_glm/README.md new file mode 100644 index 00000000..eea79ccc --- /dev/null +++ b/batchglm/train/tf2/base_glm/README.md @@ -0,0 +1,2 @@ +# Classes with GLM specific code. +All noise models that are in the GLM category inherit all of these classes. \ No newline at end of file diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py new file mode 100644 index 00000000..a662e17d --- /dev/null +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -0,0 +1,10 @@ +from .processModel import ProcessModelGLM +from .model import GLM, LossGLM + +from .estimator import Estimator +from .vars import ModelVarsGLM +from .layers import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM +from .layers import LikelihoodGLM, UnpackParamsGLM +from .layers_gradients import JacobianGLM, HessianGLM, FIMGLM +from .optim import NR, IRLS +from .training_strategies import TrainingStrategies diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py new file mode 100644 index 00000000..11cddd75 --- /dev/null +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -0,0 +1,485 @@ +import abc +import logging +import numpy as np +import scipy +import tensorflow as tf +from .model import GLM +from .training_strategies import TrainingStrategies +from .external import TFEstimator, _EstimatorGLM +from .optim import NR, IRLS +from .external import pkg_constants +import time + +logger = logging.getLogger("batchglm") + +class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): + """ + Estimator for Generalized Linear Models (GLMs). 
+ """ + model: GLM + _train_loc: bool + _train_scale: bool + _initialized: bool = False + noise_model: str + + def initialize(self, **kwargs): + self.values = [] + self.times = [] + self.converged = [] + self._initialized = True + + def finalize(self, **kwargs): + """ + Evaluate all tensors that need to be exported from session and save these as class attributes + and close session. + + Changes .model entry from tf-based EstimatorGraph to numpy based Model instance and + transfers relevant attributes. + """ + a_var, b_var = self.model.unpack_params([self.model.params, self.model.model_vars.a_var.get_shape()[0]]) + self.model = self.get_model_container(self._input_data) + self.model._a_var = a_var + self.model._b_var = b_var + self._loss = tf.reduce_sum(-self._log_likelihood / self.input_data.num_observations) + + def __init__( + self, + input_data, + dtype, + ): + + self._input_data = input_data + + TFEstimator.__init__( + self=self, + input_data=input_data, + dtype=dtype, + ) + _EstimatorGLM.__init__( + self=self, + model=None, + input_data=input_data + ) + + def train_sequence(self, training_strategy: []): + for strategy in training_strategy: + self.train( + batched_model=strategy['use_batching'], + optimizer=strategy['optim_algo'], + convergence_criteria=strategy['convergence_criteria'], + stopping_criteria=strategy['stopping_criteria'], + batch_size=strategy['batch_size'] if 'batch_size' in strategy else 500, + learning_rate=strategy['learning_rate'] if 'learning_rate' in strategy else 1e-2, + autograd=strategy['autograd'] if 'autograd' in strategy else False, + featurewise=strategy['featurewise'] if 'featurewise' in strategy else True + ) + + def _train( + self, + noise_model: str, + batched_model: bool = True, + batch_size: int = 500, + optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), + optimizer_enum: TrainingStrategies = TrainingStrategies.DEFAULT, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False, + ): + + if not self._initialized: + raise RuntimeError("Cannot train the model: \ + Estimator not initialized. Did you forget to call estimator.initialize() ?") + + if autograd and optimizer_enum.value['hessian']: + logger.warning("Automatic differentiation is currently not supported for hessians. \ + Falling back to closed form. Only Jacobians are calculated using autograd.") + + self.noise_model = noise_model + # Slice data and create batches + data_ids = tf.data.Dataset.from_tensor_slices( + (tf.range(self._input_data.num_observations, name="sample_index", dtype=tf.dtypes.int64)) + ) + if batched_model: + data = data_ids.shuffle(buffer_size=2 * batch_size).repeat().batch(batch_size) + else: + data = data_ids.shuffle(buffer_size=2 * batch_size).batch(batch_size, drop_remainder=True) + input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) + + # Iterate until conditions are fulfilled. 
+ train_step = 0 + + # Set all to convergence status = False, this is needed if multiple + # training strategies are run: + converged_current = np.repeat( + False, repeats=self.model.model_vars.n_features) + + def convergence_decision(convergence_status, train_step): + if convergence_criteria == "step": + return train_step < stopping_criteria + elif convergence_criteria == "all_converged": + return np.any(np.logical_not(convergence_status)) + elif convergence_criteria == "both": + return np.any(np.logical_not(convergence_status)) and train_step < stopping_criteria + else: + raise ValueError("convergence_criteria %s not recognized." % convergence_criteria) + + # fill with highest possible number: + ll_current = np.zeros([self._input_data.num_features], self.dtype) + np.nextafter(np.inf, 0, dtype=self.dtype) + + dataset_iterator = iter(input_list) + calc_separated = False + if optimizer_enum.value["hessian"] is True or optimizer_enum.value["fim"] is True: + second_order_optim = True + calc_separated = optimizer_enum.value['calc_separated'] + update_func = optimizer_object.perform_parameter_update + else: + update_func = optimizer_object.apply_gradients + second_order_optim = False + n_obs = self._input_data.num_observations + + curr_norm_loc = np.sqrt(np.sum(np.square( + np.abs(self.model.params.numpy()[self.model.model_vars.idx_train_loc, :])), axis=0)) + curr_norm_scale = np.sqrt(np.sum(np.square( + np.abs(self.model.params.numpy()[self.model.model_vars.idx_train_scale, :])), axis=0)) + + batch_features = False + while convergence_decision(converged_current, train_step): + # ### Iterate over the batches of the dataset. + # x_batch is a tuple (idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor)) + if benchmark: + t0_epoch = time.time() + + not_converged = np.logical_not(self.model.model_vars.converged) + ll_prev = ll_current.copy() + if train_step % 10 == 0: + logger.info('step %i', train_step) + + if not batched_model: + results = None + x_batch = None + first_batch = True + for x_batch_tuple in input_list: + x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) + + current_results = self.model(x_batch) + if first_batch: + results = list(current_results) + first_batch = False + else: + for i, x in enumerate(current_results): + results[i] += x + + else: + x_batch_tuple = next(dataset_iterator) + x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) + + results = self.model(x_batch) + if second_order_optim: + if calc_separated: + update_func([x_batch, *results, False, n_obs], True, False, batch_features, ll_prev) + if self._train_scale: + update_func([x_batch, *results, False, n_obs], False, True, batch_features, ll_prev) + else: + update_func([x_batch, *results, False, n_obs], True, True, batch_features, ll_prev) + features_updated = self.model.model_vars.updated + else: + if batch_features: + indices = tf.where(not_converged) + update_var = tf.transpose(tf.scatter_nd( + indices, + tf.transpose(results[1]), + shape=(self.model.model_vars.n_features, results[1].get_shape()[0]) + )) + else: + update_var = results[1] + update_func([(update_var, self.model.params)]) + features_updated = not_converged + + if benchmark: + self.values.append(self.model.trainable_variables[0].numpy().copy()) + + # Update converged status + prev_norm_loc = curr_norm_loc.copy() + prev_norm_scale = curr_norm_scale.copy() + converged_prev = converged_current.copy() + ll_current = self.loss.norm_neg_log_likelihood(results[0]).numpy() + + if 
batch_features: + indices = tf.where(not_converged) + updated_lls = tf.scatter_nd(indices, ll_current, shape=ll_prev.shape) + ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) + + if batched_model: + jac_normalization = batch_size + else: + jac_normalization = self._input_data.num_observations + if optimizer_enum.value["optim_algo"] in ['irls', 'irls_gd', 'irls_gd_tr', 'irls_tr']: + grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) + elif optimizer_enum.value["optim_algo"] in ['nr', 'nr_tr']: + grad_numpy = tf.abs(results[1]) + else: + grad_numpy = tf.abs(tf.transpose(results[1])) + if batch_features: + indices = tf.where(not_converged) + grad_numpy = tf.scatter_nd(indices, grad_numpy, shape=(self.model.model_vars.n_features, + self.model.params.get_shape()[0])) + grad_numpy = grad_numpy.numpy() + convergences = self.calculate_convergence(converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, + ll_current, jac_normalization, grad_numpy, features_updated) + converged_current, converged_f, converged_g, converged_x = convergences + + self.model.model_vars.convergence_update(converged_current, features_updated) + num_converged = np.sum(converged_current).astype("int32") + if np.sum(converged_current) != np.sum(converged_prev): + if featurewise and not batch_features: + batch_features = True + self.model.batch_features = batch_features + logger.info("Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)", + train_step, + np.sum(ll_current), + num_converged, + np.sum(features_updated).astype("int32"), + np.sum(converged_f), np.sum(converged_g), np.sum(converged_x)) + train_step += 1 + if benchmark: + t1_epoch = time.time() + self.times.append(t1_epoch-t0_epoch) + self.converged.append(num_converged) + + # Evaluate final params + self._log_likelihood = results[0].numpy() + self._fisher_inv = tf.zeros(shape=()).numpy() + self._hessian = tf.zeros(shape=()).numpy() + + if optimizer_enum.value["hessian"] is True: + self._hessian = results[2].numpy() + self._jacobian = results[1].numpy() + elif optimizer_enum.value["fim"] is True: + self._fisher_inv = tf.concat([results[3], results[4]], axis=0).numpy() + self._jacobian = tf.concat([results[1], results[2]], axis=0).numpy() + else: + self._jacobian = results[1].numpy() + + def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converged): + + if batch_features: + x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple + if isinstance(self._input_data.x, scipy.sparse.csr_matrix): + not_converged_idx = np.where(not_converged)[0] + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.model.model_vars.n_features, + axis=1) + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + if not isinstance(x_tensor, tf.sparse.SparseTensor): + raise RuntimeError("x_tensor now dense!!!") + else: + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + x_batch = (x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) + else: + x_batch = x_batch_tuple + + return x_batch + + def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, ll_current, + jac_normalization, grad_numpy, features_updated): + def get_convergence(converged_previous, condition1, condition2): + return np.logical_or(converged_previous, np.logical_and(condition1, condition2)) + + def get_convergence_by_method(converged_previous, condition1, 
condition2): + return np.logical_and(np.logical_not(converged_previous), np.logical_and(condition1, condition2)) + + def calc_x_step(idx_train, prev_norm): + if len(idx_train) > 0 and len(self.values) > 1: + curr_norm = np.sqrt(np.sum(np.square( + np.abs(self.model.params.numpy()[idx_train, :]) + ), axis=0)) + return np.abs(curr_norm - prev_norm) + else: + return np.zeros([self.model.model_vars.n_features]) + np.nextafter(np.inf, 0, dtype=self.dtype) + + x_norm_loc = calc_x_step(self.model.model_vars.idx_train_loc, prev_norm_loc) + x_norm_scale = calc_x_step(self.model.model_vars.idx_train_scale, prev_norm_scale) + + ll_converged = np.abs(ll_prev - ll_current) / ll_prev < pkg_constants.LLTOL_BY_FEATURE + + converged_current = get_convergence(converged_prev, ll_converged, features_updated) + + # those features which were not converged in the prev run, but converged now + converged_f = get_convergence_by_method(converged_prev, ll_converged, features_updated) + grad_loc = np.sum(grad_numpy[:, self.model.model_vars.idx_train_loc], axis=1) + grad_norm_loc = grad_loc / jac_normalization + grad_scale = np.sum(grad_numpy[:, self.model.model_vars.idx_train_scale], axis=1) + grad_norm_scale = grad_scale / jac_normalization + + converged_current = get_convergence(converged_current, + grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, + grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE) + # those features which were not converged in the prev run, but converged now + converged_g = get_convergence_by_method(converged_prev, + grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, + grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE) + + # Step length: + converged_current = get_convergence(converged_current, + x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, + x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE) + + # those features which were not converged in the prev run, but converged now + converged_x = get_convergence_by_method(converged_prev, + x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, + x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE) + return converged_current, converged_f, converged_g, converged_x + + def get_optimizer_object(self, optimizer, learning_rate): + + optimizer = optimizer.lower() + + if optimizer == "gd": + return tf.keras.optimizers.SGD(learning_rate=learning_rate), TrainingStrategies.GD + if optimizer == "adam": + return tf.keras.optimizers.Adam(learning_rate=learning_rate), TrainingStrategies.ADAM + if optimizer == "adagrad": + return tf.keras.optimizers.Adagrad(learning_rate=learning_rate), TrainingStrategies.ADAGRAD + if optimizer == "rmsprop": + return tf.keras.optimizers.RMSprop(learning_rate=learning_rate), TrainingStrategies.RMSPROP + if optimizer == "irls": + return IRLS(dtype=self.dtype, + trusted_region_mode=False, + model=self.model, + name="IRLS"), TrainingStrategies.IRLS + if optimizer == "irls_tr": + return IRLS(dtype=self.dtype, + trusted_region_mode=True, + model=self.model, + name="IRLS_TR"), TrainingStrategies.IRLS_TR + if optimizer == "irls_gd": + return IRLS(dtype=self.dtype, + trusted_region_mode=False, + model=self.model, + name="IRLS_GD"), TrainingStrategies.IRLS_GD + if optimizer == "irls_gd_tr": + return IRLS(dtype=self.dtype, + trusted_region_mode=True, + model=self.model, + name="IRLS_GD_TR"), TrainingStrategies.IRLS_GD_TR + if optimizer == "nr": + return NR(dtype=self.dtype, + trusted_region_mode=False, + model=self.model, + name="NR"), TrainingStrategies.NR + if optimizer == "nr_tr": + return NR(dtype=self.dtype, + trusted_region_mode=True, + 
model=self.model, + name="NR_TR"), TrainingStrategies.NR_TR + + logger.warning("No valid optimizer given. Default optimizer Adam chosen.") + return tf.keras.optimizers.Adam(learning_rate=learning_rate), TrainingStrategies.ADAM + + def fetch_fn(self, idx): + """ + Documentation of tensorflow coding style in this function: + tf.py_func defines a python function (the getters of the InputData object slots) + as a tensorflow operation. Here, the shape of the tensor is lost and + has to be set with set_shape. For size factors, we use explicit broadcasting + as explained below. + """ + # Catch dimension collapse error if idx is only one element long, ie. 0D: + if len(idx.shape) == 0: + idx = tf.expand_dims(idx, axis=0) + + if isinstance(self._input_data.x, scipy.sparse.csr_matrix): + + x_tensor_idx, x_tensor_val, x = tf.py_function( + func=self._input_data.fetch_x_sparse, + inp=[idx], + Tout=[np.int64, np.float64, np.int64], + ) + # Note on Tout: np.float64 for val seems to be required to avoid crashing v1.12. + x_tensor_idx = tf.cast(x_tensor_idx, dtype=tf.int64) + x = tf.cast(x, dtype=tf.int64) + x_tensor_val = tf.cast(x_tensor_val, dtype=self.dtype) + x_tensor = tf.SparseTensor(x_tensor_idx, x_tensor_val, x) + x_tensor = tf.cast(x_tensor, dtype=self.dtype) + + else: + + x_tensor = tf.py_function( + func=self._input_data.fetch_x_dense, + inp=[idx], + Tout=self._input_data.x.dtype, + ) + + x_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_features]) + x_tensor = tf.cast(x_tensor, dtype=self.dtype) + + design_loc_tensor = tf.py_function( + func=self._input_data.fetch_design_loc, + inp=[idx], + Tout=self._input_data.design_loc.dtype, + ) + design_loc_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_design_loc_params]) + design_loc_tensor = tf.cast(design_loc_tensor, dtype=self.dtype) + + design_scale_tensor = tf.py_function( + func=self._input_data.fetch_design_scale, + inp=[idx], + Tout=self._input_data.design_scale.dtype, + ) + design_scale_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_design_scale_params]) + design_scale_tensor = tf.cast(design_scale_tensor, dtype=self.dtype) + + if self._input_data.size_factors is not None and self.noise_model in ["nb", "norm"]: + size_factors_tensor = tf.py_function( + func=self._input_data.fetch_size_factors, + inp=[idx], + Tout=self._input_data.size_factors.dtype, + ) + + size_factors_tensor.set_shape(idx.get_shape()) + size_factors_tensor = tf.expand_dims(size_factors_tensor, axis=-1) + size_factors_tensor = tf.cast(size_factors_tensor, dtype=self.dtype) + + else: + size_factors_tensor = tf.constant(1, shape=[1, 1], dtype=self.dtype) + + # feature batching + return x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor + + @staticmethod + def get_init_from_model(init_a, init_b, input_data, init_model): + # Locations model: + if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"): + my_loc_names = set(input_data.loc_names) + my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names)) + + init_loc = np.zeros([input_data.num_loc_params, input_data.num_features]) + for parm in my_loc_names: + init_idx = np.where(init_model.input_data.loc_names == parm)[0] + my_idx = np.where(input_data.loc_names == parm)[0] + init_loc[my_idx] = init_model.a_var[init_idx] + + init_a = init_loc + + # Scale model: + if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"): + my_scale_names = set(input_data.scale_names) 
+ my_scale_names = my_scale_names.intersection(init_model.input_data.scale_names) + + init_scale = np.zeros([input_data.num_scale_params, input_data.num_features]) + for parm in my_scale_names: + init_idx = np.where(init_model.input_data.scale_names == parm)[0] + my_idx = np.where(input_data.scale_names == parm)[0] + init_scale[my_idx] = init_model.b_var[init_idx] + + init_b = init_scale + + return init_a, init_b + + @abc.abstractmethod + def get_model_container(self, input_data): + pass diff --git a/batchglm/train/tf2/base_glm/external.py b/batchglm/train/tf2/base_glm/external.py new file mode 100644 index 00000000..9188d2b0 --- /dev/null +++ b/batchglm/train/tf2/base_glm/external.py @@ -0,0 +1,9 @@ +from batchglm.train.tf2.base import ProcessModelBase, ModelBase, TFEstimator +from batchglm.train.tf2.base import OptimizerBase, LossBase +#from batchglm.train.tf2.glm_nb import NR, IRLS + +from batchglm.models.base_glm import InputDataGLM, _ModelGLM, _EstimatorGLM + +#import batchglm.train.tf.ops as op_utils +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py new file mode 100644 index 00000000..8ced3a4b --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers.py @@ -0,0 +1,268 @@ +from typing import Union + +import abc +import tensorflow as tf + +from .processModel import ProcessModelGLM + + +class UnpackParamsGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Layer that slices the parameter tensor into mean and variance block. + """ + + def __init__(self): + super(UnpackParamsGLM, self).__init__() + + def call(self, inputs, **kwargs): + """ + :param inputs: tuple (params, border) + Must contain the parameter matrix (params) and the first index + of the variance block within the parameters matrix (border) + + :return tf.Tensor, tf.Tensor + The two returned tensor correspond to the mean and variance block + of the parameter matrix. + """ + params, border = inputs + a_var = params[0:border] # loc obs + b_var = params[border:] # scale obs + a_var = self.tf_clip_param(a_var, "a_var") + b_var = self.tf_clip_param(b_var, "b_var") + return a_var, b_var + + +class LinearLocGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the mean model and the mean block of the parameter matrix. + """ + + def __init__(self): + super(LinearLocGLM, self).__init__() + + def _eta_loc( + self, + a_var: tf.Tensor, + design_loc: tf.Tensor, + constraints_loc: Union[tf.Tensor, None] = None, + size_factors: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_loc. + + :param a_var: tf.Tensor + the mean block of the parameter matrix + :param design_loc: tf.Tensor + the design matrix of the mean model + :param contraints_loc: tf.Tensor, optional + ??? # TODO + :param size_factors: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + if constraints_loc is not None: + eta_loc = tf.matmul( + design_loc, + tf.matmul(constraints_loc, a_var) + ) + else: + eta_loc = tf.matmul(design_loc, a_var) + + if size_factors is not None and size_factors.shape != (1, 1): + eta_loc = self.with_size_factors(eta_loc, size_factors) + + eta_loc = self.tf_clip_param(eta_loc, "eta_loc") + + return eta_loc + + @abc.abstractmethod + def with_size_factors(self, eta_loc, size_factors): + """ + Calculates eta_loc with size_factors. 
Is noise model specific and needs to be implemented in the inheriting + layer. + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space + """ + + def call(self, inputs, **kwargs): + """ + Calculates the eta_loc tensor, containing the mean values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain a_var, design_loc, constraints_loc and size_factors in this order, where + contraints_loc and size_factor can be None. + + :return tf.Tensor + the mean values for each individual distribution, encoded in linker space. + """ + return self._eta_loc(*inputs) + + +class LinearScaleGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Computes the dot product between the design matrix of the variance model + and the variance block of the parameter matrix. + """ + + def __init__(self): + super(LinearScaleGLM, self).__init__() + + def _eta_scale( + self, + b_var: tf.Tensor, + design_scale: tf.Tensor, + constraints_scale: Union[tf.Tensor, None] = None + ): + """ + Does the actual computation of eta_scale. + + :param b_var: tf.Tensor + the variance block of the parameter matrix + :param design_scale: tf.Tensor + the design matrix of the mean model + :param contraints_scale: tf.Tensor, optional + ??? # TODO + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. + """ + if constraints_scale is not None: + eta_scale = tf.matmul( + design_scale, + tf.matmul(constraints_scale, b_var) + ) + else: + eta_scale = tf.matmul(design_scale, b_var) + + eta_scale = self.tf_clip_param(eta_scale, "eta_scale") + + return eta_scale + + def call(self, inputs, **kwargs): + """ + Calculates the eta_scale tensor, containing the variance values for each individual distribution, + encoded in linker space. + + :param input: tuple + Must contain b_var, design_scale and constraints_loc in this order, where + contraints_loc can be None. + + :return tf.Tensor + the variance values for each individual distribution, encoded in linker space. + """ + return self._eta_scale(*inputs) + + +class LinkerLocGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the mean model. + """ + + def __init__(self): + super(LinkerLocGLM, self).__init__() + + @abc.abstractmethod + def _inv_linker(self, loc: tf.Tensor): + """ + Translates the given mean values from linker to data space. Depends on the given noise model and needs to + be implemented in the inheriting layer. + + :param loc: tf. Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + + def call(self, eta_loc: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the mean values for each individual distribution, encoded in data space. + """ + loc = self._inv_linker(eta_loc) + return loc + + +class LinkerScaleGLM(tf.keras.layers.Layer): + + """ + Translation from linker to data space for the variance model. + """ + + def __init__(self): + super(LinkerScaleGLM, self).__init__() + + @abc.abstractmethod + def _inv_linker(self, scale: tf.Tensor): + pass + + def call(self, eta_scale: tf.Tensor, **kwargs): + """ + Calls the distribution specific linker function to translate from linker to data space. 
+ + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + + :return tf.Tensor + the variance values for each individual distribution, encoded in data space. + """ + scale = self._inv_linker(eta_scale) + return scale + + +class LikelihoodGLM(tf.keras.layers.Layer, ProcessModelGLM): + + """ + Contains the computation of the distribution specific log-likelihood function + """ + + def __init__(self, dtype): + super(LikelihoodGLM, self).__init__() + self.ll_dtype = dtype + + @abc.abstractmethod + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + """ + Does the actual likelihood calculation. Depends on the given noise model and needs to be implemented in the + inheriting layer. + + :param eta_loc: tf.Tensor + the mean values for each individual distribution, encoded in linker space. + :param eta_scale: tf.Tensor + the variance values for each individual distribution, encoded in linker space. + :param loc: tf.Tensor + the mean values for each individual distribution, encoded in data space. + :param scale: tf.Tensor + the variance values for each individual distribution, encoded in data space. + :param x: tf.Tensor + the input data + :param n_features + number of features. + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + + def call(self, inputs, **kwargs): + """ + Calls the distribution specific log-likelihood function. + + :param inputs: tuple + Must contain eta_loc, eta_scale, loc, scale, x, n_features in this order. + + :return tf.Tensor + the log-likelihoods of each individual data point. + """ + return self._ll(*inputs) diff --git a/batchglm/train/tf2/base_glm/layers_gradients.py b/batchglm/train/tf2/base_glm/layers_gradients.py new file mode 100644 index 00000000..01b7dfb7 --- /dev/null +++ b/batchglm/train/tf2/base_glm/layers_gradients.py @@ -0,0 +1,450 @@ +import abc +import tensorflow as tf + + +class Gradient(tf.keras.layers.Layer): + + """Superclass for Jacobians, Hessian, FIM""" + + def __init__(self, model_vars, compute_a, compute_b, dtype): + super(Gradient, self).__init__() + self.model_vars = model_vars + self.compute_a = compute_a + self.compute_b = compute_b + self.grad_dtype = dtype + + @abc.abstractmethod + def call(self, inputs, **kwargs): + pass + + @staticmethod + def calc_design_mat(design_mat, constraints): + if constraints is not None: + xh = tf.matmul(design_mat, constraints) + else: + xh = design_mat + return xh + + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + @staticmethod + def create_specific_block(w, xh_loc, xh_scale): + return tf.einsum('ofc,od->fcd', tf.einsum('of,oc->ofc', w, xh_loc), xh_scale) + + +class FIMGLM(Gradient): + """ + Compute expected fisher information matrix (FIM) + for iteratively re-weighted least squares (IWLS or IRLS) parameter updates for GLMs. + """ + + def call(self, inputs, **kwargs): + return self._fim_analytic(*inputs) + + def _fim_analytic(self, x, design_loc, design_scale, loc, scale, concat=False) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model fim + by evalutating its terms grouped by observations. + """ + + def _a_byobs(): + """ + Compute the mean model diagonal block of the + closed form fim of base_glm_all model by observation across features + for a batch of observations. 
+ """ + w = self._weight_fim_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + def _b_byobs(): + """ + Compute the dispersion model diagonal block of the + closed form fim of base_glm_all model by observation across features. + """ + w = self._weight_fim_bb(x=x, loc=loc, scale=scale) # [observations=1 x features] + # The computation of the fim block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + fim_block = self.create_specific_block(w, xh, xh) + return fim_block + + # The full fisher information matrix is block-diagonal with the cross-model + # blocks all zero. Accordingly, mean and dispersion model updates can be + # treated independently and the full fisher information matrix is never required. + # Here, the non-zero model-wise diagonal blocks are computed and returned + # as a dictionary. The according score function vectors are also returned as a dictionary. + + if self.compute_a and self.compute_b: + fim_a = _a_byobs() + fim_b = _b_byobs() + + elif self.compute_a and not self.compute_b: + fim_a = _a_byobs() + fim_b = tf.zeros(fim_a.get_shape(), self.grad_dtype) + elif not self.compute_a and self.compute_b: + fim_a = tf.zeros(fim_a.get_shape(), self.grad_dtype) + fim_b = _b_byobs() + else: + fim_a = tf.zeros_like(self.model_vars.a_var, dtype=self.grad_dtype) + fim_b = tf.zeros_like(self.model_vars.b_var, dtype=self.grad_dtype) + + if concat: + fim = tf.concat([fim_a, fim_b], axis=1) + return fim + else: + return fim_a, fim_b + + @abc.abstractmethod + def _weight_fim_aa( + self, + x, + loc, + scale + ): + """ + Compute for mean model IWLS update for a GLM. + + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + @abc.abstractmethod + def _weight_fim_bb( + self, + x, + loc, + scale + ): + """ + Compute for dispersion model IWLS update for a GLM. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. 
+ :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return tuple of tf.tensors + Constants with respect to coefficient index for + Fisher information matrix and score function computation. + """ + pass + + +class JacobianGLM(Gradient): + + def call(self, inputs, **kwargs): + return self._jac_analytic(*inputs) + + def _jac_analytic(self, x, design_loc, design_scale, loc, scale, concat) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model jacobian + by evalutating its terms grouped by observations. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + """ + + def _a_byobs(): + """ + Compute the mean model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. + """ + w = self._weights_jac_a(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + def _b_byobs(): + """ + Compute the dispersion model block of the jacobian. + + :return Jblock: tf.tensor features x coefficients + Block of jacobian. + """ + w = self._weights_jac_b(x=x, loc=loc, scale=scale) # [observations, features] + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) # [observations, coefficient] + + jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] + return jblock + + if self.compute_a and self.compute_b: + j_a = _a_byobs() + j_b = _b_byobs() + elif self.compute_a and not self.compute_b: + j_a = _a_byobs() + j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) + elif not self.compute_a and self.compute_b: + j_b = _b_byobs() + j_a = tf.zeros((j_b.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) + else: + j_a = tf.transpose(tf.zeros_like(self.model_vars.a_var, dtype=self.grad_dtype)) + j_b = tf.transpose(tf.zeros_like(self.model_vars.b_var, dtype=self.grad_dtype)) + + if concat: + j = tf.concat([j_a, j_b], axis=1) + return j + else: + return j_a, j_b + + @abc.abstractmethod + def _weights_jac_a( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weights_jac_b( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model gradient. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. 
+ + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + +class HessianGLM(Gradient): + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def call(self, inputs, **kwargs): + return self._hessian_analytic(*inputs) + + def _hessian_analytic(self, x, design_loc, design_scale, loc, scale, concat) -> tf.Tensor: + """ + Compute the closed-form of the base_glm_all model hessian + by evaluating its terms grouped by observations. + + Has three sub-functions which built the specific blocks of the hessian + and one sub-function which concatenates the blocks into a full hessian. + """ + + def _aa_byobs_batched(): + """ + Compute the mean model diagonal block of the + closed form hessian of base_glm_all model by observation across features + for a batch of observations. + """ + w = self._weight_hessian_aa(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the einsum to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _bb_byobs_batched(): + """ + Compute the dispersion model diagonal block of the + closed form hessian of base_glm_all model by observation across features. + """ + w = self._weight_hessian_bb(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. + xh = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xh, xh) + return hblock + + def _ab_byobs_batched(): + """ + Compute the mean-dispersion model off-diagonal block of the + closed form hessian of base_glm_all model by observastion across features. + + Note that there are two blocks of the same size which can + be compute from each other with a transpose operation as + the hessian is symmetric. + """ + w = self._weight_hessian_ab(x=x, loc=loc, scale=scale) # [observations x features] + # The computation of the hessian block requires two outer products between + # feature-wise constants and the coefficient wise design matrix entries, for each observation. + # The resulting tensor is observations x features x coefficients x coefficients which + # is too large too store in memory in most cases. 
However, the full 4D tensor is never + # actually needed but only its marginal across features, the final hessian block shape. + # Here, we use the Einstein summation to efficiently perform the two outer products and the marginalisation. + xhloc = self.calc_design_mat(design_loc, self.model_vars.constraints_loc) + xhscale = self.calc_design_mat(design_scale, self.model_vars.constraints_scale) + + hblock = self.create_specific_block(w, xhloc, xhscale) + return hblock + + if self.compute_a and self.compute_b: + h_aa = _aa_byobs_batched() + h_bb = _bb_byobs_batched() + h_ab = _ab_byobs_batched() + h_ba = tf.transpose(h_ab, perm=[0, 2, 1]) + elif self.compute_a and not self.compute_b: + h_aa = _aa_byobs_batched() + h_bb = tf.zeros_like(h_aa, dtype=self.grad_dtype) + h_ab = tf.zeros_like(h_aa, dtype=self.grad_dtype) + h_ba = tf.zeros_like(h_aa, dtype=self.grad_dtype) + elif not self.compute_a and self.compute_b: + h_bb = _bb_byobs_batched() + h_aa = tf.zeros_like(h_bb, dtype=self.grad_dtype) + h_ab = tf.zeros_like(h_bb, dtype=self.grad_dtype) + h_ba = tf.zeros_like(h_bb, dtype=self.grad_dtype) + else: + h_aa = tf.zeros((), dtype=self.grad_dtype) + h_bb = tf.zeros((), dtype=self.grad_dtype) + h_ab = tf.zeros((), dtype=self.grad_dtype) + h_ba = tf.zeros((), dtype=self.grad_dtype) + + if concat: + h = tf.concat( + [tf.concat([h_aa, h_ab], axis=2), + tf.concat([h_ba, h_bb], axis=2)], + axis=1 + ) + return h + else: + return h_aa, h_ab, h_ba, h_bb + + @abc.abstractmethod + def _weight_hessian_aa( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_bb( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + dispersion model block of the hessian. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. + """ + pass + + @abc.abstractmethod + def _weight_hessian_ab( + self, + x, + loc, + scale + ): + """ + Compute the coefficient index invariant part of the + mean-dispersion model block of the hessian. + + Note that there are two blocks of the same size which can + be compute from each other with a transpose operation as + the hessian is symmetric. + + :param x: tf.tensor observations x features + Observation by observation and feature. + :param loc: tf.tensor observations x features + Value of mean model by observation and feature. + :param scale: tf.tensor observations x features + Value of dispersion model by observation and feature. + + :return const: tf.tensor observations x features + Coefficient invariant terms of hessian of + given observations and features. 
+ """ + pass diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py new file mode 100644 index 00000000..cbf2d6d1 --- /dev/null +++ b/batchglm/train/tf2/base_glm/model.py @@ -0,0 +1,226 @@ +import logging +import tensorflow as tf +import numpy as np +from .external import ModelBase, LossBase +from .processModel import ProcessModelGLM +from .training_strategies import TrainingStrategies + +logger = logging.getLogger("batchglm") + + +class GLM(ModelBase, ProcessModelGLM): + + """ + base GLM class containg the model call. + """ + + TS: {} = TrainingStrategies.DEFAULT.value + compute_a: bool = True + compute_b: bool = True + + def __init__( + self, + model_vars, + unpack_params: tf.keras.layers.Layer, + linear_loc: tf.keras.layers.Layer, + linear_scale: tf.keras.layers.Layer, + linker_loc: tf.keras.layers.Layer, + linker_scale: tf.keras.layers.Layer, + likelihood: tf.keras.layers.Layer, + jacobian: tf.keras.layers.Layer, + hessian: tf.keras.layers.Layer, + fim: tf.keras.layers.Layer, + use_gradient_tape: bool = False + ): + super(GLM, self).__init__() + self.model_vars = model_vars + self.params = tf.Variable(tf.concat( + [ + model_vars.init_a_clipped, + model_vars.init_b_clipped, + ], + axis=0 + ), name="params", trainable=True) + + self.unpack_params = unpack_params + self.linear_loc = linear_loc + self.linear_scale = linear_scale + self.linker_loc = linker_loc + self.linker_scale = linker_scale + self.likelihood = likelihood + self.jacobian = jacobian + self.hessian = hessian + self.fim = fim + self.use_gradient_tape = use_gradient_tape + self.params_copy = None + self.batch_features = False + + def _call_parameters(self, inputs, keep_previous_params_copy=False): + if not keep_previous_params_copy: + if self.batch_features: + self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, + mask=tf.logical_not(self.model_vars.converged), + axis=1), trainable=True) + else: + self.params_copy = self.params + design_loc, design_scale, size_factors = inputs + a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) + eta_loc = self.linear_loc([a_var, design_loc, self.model_vars.constraints_loc, size_factors]) + eta_scale = self.linear_scale([b_var, design_scale, self.model_vars.constraints_scale]) + loc = self.linker_loc(eta_loc) + scale = self.linker_scale(eta_scale) + return eta_loc, eta_scale, loc, scale, a_var, b_var + + def calc_ll(self, inputs, keep_previous_params_copy=False): + parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) + log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) + return (log_probs, *parameters[2:]) + + def _calc_jacobians(self, inputs, concat, transpose=True): + """ + calculates jacobian. + + :param inputs: TODO + :param concat: boolean + if true, concatenates the loc and scale block. + :param transpose: bool + transpose the gradient if true. + autograd returns gradients with respect to the shape of self.params. + But analytic differentiation returns it the other way, which is + often needed for downstream operations (e.g. 
hessian) + Therefore, if self.use_gradient_tape, it will transpose if transpose == False + """ + + with tf.GradientTape(persistent=True) as g: + log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs) + + if self.use_gradient_tape: + + if self.compute_a: + if self.compute_b: + if concat: + jacobians = g.gradient(log_probs, self.params_copy) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = g.gradient(log_probs, b_var) + if not transpose: + jac_a = tf.transpose(jac_a) + jac_b = tf.transpose(jac_b) + else: + jac_a = g.gradient(log_probs, a_var) + jac_b = tf.zeros((jac_a.get_shape()[0], b_var.get_shape()[1]), b_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + else: + jac_b = g.gradient(log_probs, b_var) + jac_a = tf.zeros((jac_b.get_shape()[0], a_var.get_shape()[0]), a_var.dtype) + if concat: + jacobians = tf.concat([jac_a, jac_b], axis=0) + if not transpose: + jacobians = tf.transpose(jacobians) + + else: + + if concat: + jacobians = self.jacobian([*inputs[0:3], loc, scale, True]) + if transpose: + jacobians = tf.transpose(jacobians) + else: + jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, False]) + + del g + if concat: + return loc, scale, log_probs, tf.negative(jacobians) + return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) + + def call(self, inputs, training=False, mask=None): + # X_data, design_loc, design_scale, size_factors = inputs + + # This is for first order optimizations, which get the full jacobian + + concat = self.TS["concat_grads"] + + if self.TS["jacobian"] is True: + _, _, log_probs, jacobians = self._calc_jacobians(inputs, concat=concat) + return log_probs, jacobians + + # This is for SecondOrder NR/NR_TR + if self.TS["hessian"] is True: + + # with tf.GradientTape(persistent=True) as g2: + if concat: + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) + else: + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) + # results_arr = [jacobians[:, i] for i in tf.range(self.params_copy.get_shape()[0])] + + ''' + autograd not yet working. TODO: Search error in the following code: + + if self.use_gradient_tape: + + i = tf.constant(0, tf.int32) + h_tensor_array = tf.TensorArray( # hessian slices [:,:,j] + dtype=self.params_copy.dtype, + size=self.params_copy.get_shape()[0], + clear_after_read=False + ) + while i < self.params_copy.get_shape()[0]: + grad = g2.gradient(results_arr[i], self.params_copy) + h_tensor_array.write(index=i, value=grad) + i += 1 + + # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used + # in down-stream computation graphs. + + hessians = tf.transpose(tf.reshape( + h_tensor_array.stack(), + tf.stack((self.params_copy.get_shape()[0], + self.params_copy.get_shape()[0], + self.params_copy.get_shape()[1])) + ), perm=[2, 1, 0]) + hessians = tf.negative(hessians) + ''' + # else: + if concat: + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) + return log_probs, jacobians, hessians + else: + hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) + return log_probs, jac_a, jac_b, tf.negative(hes_aa), tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) + # del g2 # need to delete this GradientTape because persistent is True. 
+ + + # This is for SecondOrder IRLS/IRLS_GD/IRLS_TR/IRLS_GD_TR + if self.TS["fim"] is True: + + + + if concat: + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) + fims = self.fim([*inputs[0:3], loc, scale, True]) + + return log_probs, tf.negative(jacobians), fims + else: + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) + fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) + + return log_probs, jac_a, jac_b, fim_a, fim_b + + raise ValueError("No gradient calculation specified.") + + +class LossGLM(LossBase): + + def norm_log_likelihood(self, log_probs): + return tf.reduce_mean(log_probs, axis=0, name="log_likelihood") + + def norm_neg_log_likelihood(self, log_probs): + return - self.norm_log_likelihood(log_probs) + + def call(self, y_true, log_probs): + return tf.reduce_sum(self.norm_neg_log_likelihood(log_probs)) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py new file mode 100644 index 00000000..04bd2f16 --- /dev/null +++ b/batchglm/train/tf2/base_glm/optim.py @@ -0,0 +1,535 @@ +from .external import pkg_constants +import tensorflow as tf +from .external import OptimizerBase +import abc +import numpy as np + + +class SecondOrderOptim(OptimizerBase, metaclass=abc.ABCMeta): + + """ + Superclass for NR and IRLS + """ + + def _norm_log_likelihood(self, log_probs): + return tf.reduce_mean(log_probs, axis=0, name="log_likelihood") + + def _norm_neg_log_likelihood(self, log_probs): + return - self._norm_log_likelihood(log_probs) + + def _resource_apply_dense(self, grad, handle, apply_state=None): + + update_op = handle.assign_add(grad, read_value=False) + + return update_op + + def _resource_apply_sparse(self, grad, handle, apply_state=None): + + raise NotImplementedError('Applying SparseTensor currently not possible.') + + def get_config(self): + + config = {"name": "SOO"} + return config + + def _create_slots(self, var_list): + + self.add_slot(var_list[0], 'mu_r') + + def _trust_region_ops( + self, + x_batch, + likelihood, + proposed_vector, + proposed_gain, + compute_a, + compute_b, + batch_features, + ll_prev + ): + # Load hyper-parameters: + assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ + "eta0 must be smaller than eta1" + assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ + "eta1 must be smaller than or equal to eta2" + assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" + assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" + # Set trust region hyper-parameters + eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) + eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) + eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) + if self.gd and compute_b: + t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) + else: + t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) + upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) + + # Phase I: Perform a trial update. + # Propose parameter update: + + self.model.params_copy.assign_sub(proposed_vector) + # Phase II: Evaluate success of trial update and complete update cycle. 
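+        # The acceptance test below follows the standard trust-region scheme:
+        #   delta_f_actual = f(theta_old) - f(theta_new), with f the normalized negative log-likelihood,
+        #   delta_f_ratio  = delta_f_actual / predicted gain of the local model.
+        # A step is kept only where delta_f_actual > eta0; the radius is shrunk by t1 where the
+        # agreement is poor (no improvement or ratio <= eta1), enlarged by t2 where it is good
+        # (ratio > eta2), and always capped at TRUST_REGION_UPPER_BOUND.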
+ # Include parameter updates only if update improves cost function: + new_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] + delta_f_actual = self._norm_neg_log_likelihood(likelihood) - self._norm_neg_log_likelihood(new_likelihood) + + if batch_features: + + indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + updated_lls = tf.scatter_nd(indices, delta_f_actual, shape=ll_prev.shape) + delta_f_actual = np.where(self.model.model_vars.converged, ll_prev, updated_lls.numpy()) + update_var = tf.transpose(tf.scatter_nd( + indices, + tf.transpose(proposed_vector), + shape=(self.model.model_vars.n_features, proposed_vector.get_shape()[0]) + )) + + gain_var = tf.transpose(tf.scatter_nd( + indices, + proposed_gain, + shape=([self.model.model_vars.n_features]))) + else: + update_var = proposed_vector + gain_var = proposed_gain + delta_f_ratio = tf.divide(delta_f_actual, gain_var) + + # Compute parameter updates.g + update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model.model_vars.converged)) + update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) + keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric + if batch_features: + params = tf.transpose(tf.scatter_nd( + indices, + tf.transpose(self.model.params_copy), + shape=(self.model.model_vars.n_features, self.model.params.get_shape()[0]) + )) + + theta_new_tr = tf.add( + tf.multiply(self.model.params, keep_theta_numeric), + tf.multiply(params, update_theta_numeric) + ) + + + #self.model.params.assign_(tf.multiply(params, update_theta_numeric)) + + else: + params = self.model.params_copy + theta_new_tr = tf.add( + tf.multiply(params + update_var, keep_theta_numeric), # old values + tf.multiply(params, update_theta_numeric) # new values + ) + self.model.params.assign(theta_new_tr) + self.model.model_vars.updated = update_theta.numpy() + + # Update trusted region accordingly: + decrease_radius = tf.logical_or( + delta_f_actual <= eta0, + tf.logical_and(delta_f_ratio <= eta1, tf.logical_not(self.model.model_vars.converged)) + ) + increase_radius = tf.logical_and( + delta_f_actual > eta0, + tf.logical_and(delta_f_ratio > eta2, tf.logical_not(self.model.model_vars.converged)) + ) + keep_radius = tf.logical_and(tf.logical_not(decrease_radius), + tf.logical_not(increase_radius)) + radius_update = tf.add_n([ + tf.multiply(t1, tf.cast(decrease_radius, self._dtype)), + tf.multiply(t2, tf.cast(increase_radius, self._dtype)), + tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) + ]) + + if self.gd and compute_b and not compute_a: + tr_radius = self.tr_radius_b + else: + tr_radius = self.tr_radius + + radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) + tr_radius.assign(radius_new) + + def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str): + + self.model = model + self.gd = name in ['IRLS_GD', 'IRLS_GD_TR'] + + super(SecondOrderOptim, self).__init__(name) + + self._dtype = dtype + self.trusted_region_mode = trusted_region_mode + if trusted_region_mode: + + self.tr_radius = tf.Variable( + np.zeros(shape=[self.model.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False + ) + if self.gd: + self.tr_radius_b = tf.Variable( + np.zeros(shape=[self.model.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False + ) + + self.tr_ll_prev = 
tf.Variable(np.zeros(shape=[self.model.model_vars.n_features]), trainable=False) + self.tr_pred_gain = tf.Variable(np.zeros(shape=[self.model.model_vars.n_features]), trainable=False) + + else: + + self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) + + @abc.abstractmethod + def perform_parameter_update(self, inputs): + pass + + def _newton_type_update(self, lhs, rhs, psd): + + new_rhs = tf.expand_dims(rhs, axis=-1) + res = tf.linalg.lstsq(lhs, new_rhs, fast=False) + delta_t = tf.squeeze(res, axis=-1) + update_tensor = tf.transpose(delta_t) + return update_tensor + + def _pad_updates( + self, + update_raw, + compute_a, + compute_b + ): + # Pad update vectors to receive update tensors that match + # the shape of model_vars.params. + if compute_a: + if compute_b: + netwon_type_update = update_raw + else: + netwon_type_update = tf.concat([ + update_raw, + tf.zeros(shape=(self.model.model_vars.b_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype) + ], axis=0) + + elif compute_b: + netwon_type_update = tf.concat([ + tf.zeros(shape=(self.model.model_vars.a_var.get_shape()[0], update_raw.get_shape()[1]), + dtype=self._dtype), + update_raw + ], axis=0) + + else: + raise ValueError("No training necessary") + + return netwon_type_update + + def _trust_region_update( + self, + update_raw, + radius_container, + n_obs=None + ): + update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) + update_magnitude = tf.where( + condition=update_magnitude_sq > 0, + x=tf.sqrt(update_magnitude_sq), + y=tf.zeros_like(update_magnitude_sq) + ) + update_magnitude_inv = tf.where( + condition=update_magnitude > 0, + x=tf.divide( + tf.ones_like(update_magnitude), + update_magnitude + ), + y=tf.zeros_like(update_magnitude) + ) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + # the following switch is for irls_gd_tr (linear instead of newton) + if n_obs is not None: + update_magnitude /= n_obs + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector + + def _trust_region_newton_cost_gain( + self, + proposed_vector, + neg_jac, + hessian_fim, + n_obs + ): + pred_cost_gain = tf.add( + tf.einsum( + 'ni,in->n', + neg_jac, + proposed_vector + ) / n_obs, + 0.5 * tf.einsum( + 'nix,xin->n', + tf.einsum('inx,nij->njx', + tf.expand_dims(proposed_vector, axis=-1), + hessian_fim), + tf.expand_dims(proposed_vector, axis=0) + ) / tf.square(n_obs) + ) + return pred_cost_gain + + +class NR(SecondOrderOptim): + + def _get_updates(self, lhs, rhs, psd, compute_a, compute_b): + + update_raw = self._newton_type_update(lhs=lhs, rhs=rhs, psd=psd) + update = self._pad_updates(update_raw, compute_a, compute_b) + + return update_raw, update + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, prev_ll=None): + + x_batch, log_probs, jacobians, hessians, psd, n_obs = inputs + if not (compute_a or compute_b): + raise ValueError( + "Nothing can be trained. 
Please make sure at least one of train_mu and train_r is set to True.") + + update_raw, update = self._get_updates(hessians, jacobians, psd, compute_a, compute_b) + + if self.trusted_region_mode: + + n_obs = tf.cast(n_obs, dtype=self._dtype) + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=tf.logical_not(self.model.model_vars.converged)) + else: + radius_container = self.tr_radius + tr_proposed_vector = self._trust_region_update( + update_raw=update_raw, + radius_container=radius_container + ) + tr_pred_cost_gain = self._trust_region_newton_cost_gain( + proposed_vector=tr_proposed_vector, + neg_jac=jacobians, + hessian_fim=hessians, + n_obs=n_obs + ) + + tr_proposed_vector_pad = self._pad_updates( + update_raw=tr_proposed_vector, + compute_a=compute_a, + compute_b=compute_b + ) + + self._trust_region_ops( + x_batch=x_batch, + likelihood=log_probs, + proposed_vector=tr_proposed_vector_pad, + proposed_gain=tr_pred_cost_gain, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + ll_prev=prev_ll + ) + + else: + if batch_features: + indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + update_var = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(update), + shape=(self.model.model_vars.n_features, update.get_shape()[0]) + ) + ) + else: + update_var = update + self.model.params.assign_sub(update_var) + + +class IRLS(SecondOrderOptim): + + def _calc_proposed_vector_and_pred_cost_gain( + self, + update_x, + radius_container, + n_obs, + gd, + neg_jac_x, + fim_x=None + ): + """ + Calculates the proposed vector and predicted cost gain for either mean or scale part. + :param update_x: tf.tensor coefficients x features ? TODO + + :param radius_container: tf.tensor ? x ? TODO + + :param n_obs: ? TODO + Number of observations in current batch. + :param gd: boolean + If True, the proposed vector and predicted cost gain are + calculated by linear functions related to IRLS_GD(_TR) optimizer. + If False, use newton functions for IRLS_TR optimizer instead. + :param neg_jac_x: tf.Tensor coefficients x features ? TODO + Upper (mu part) or lower (r part) of negative jacobian matrix + :param fim_x + Upper (mu part) or lower (r part) of Fisher Inverse Matrix. + Defaults to None, is only needed if gd is False + :return proposed_vector_x, pred_cost_gain_x + Returns proposed vector and predicted cost gain after + trusted region update for either mu or r part, depending on x + """ + + proposed_vector_x = self._trust_region_update( + update_raw=update_x, + radius_container=radius_container, + n_obs=n_obs if gd else None + ) + # here, functions have different number of arguments, thus + # must be written out + if gd: + pred_cost_gain_x = self._trust_region_linear_cost_gain( + proposed_vector=proposed_vector_x, + neg_jac=neg_jac_x + ) + else: + pred_cost_gain_x = self._trust_region_newton_cost_gain( + proposed_vector=proposed_vector_x, + neg_jac=neg_jac_x, + hessian_fim=fim_x, + n_obs=n_obs + ) + + return proposed_vector_x, pred_cost_gain_x + + def _trust_region_linear_cost_gain( + self, + proposed_vector, + neg_jac + ): + pred_cost_gain = tf.reduce_sum(tf.multiply( + proposed_vector, + tf.transpose(neg_jac) + ), axis=0) + return pred_cost_gain + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, prev_ll=None): + + x_batch, log_probs, jac_a, jac_b, fim_a, fim_b, psd, n_obs = inputs + if not (compute_a or compute_b): + raise ValueError( + "Nothing can be trained. 
Please make sure at least one of train_mu and train_r is set to True.") + # Compute a and b model updates separately. + if compute_a: + # The FIM of the mean model is guaranteed to be + # positive semi-definite and can therefore be inverted + # with the Cholesky decomposition. This information is + # passed here with psd=True. + update_a = self._newton_type_update( + lhs=fim_a, + rhs=jac_a, + psd=True + ) + if compute_b: + + if self.gd: + update_b = tf.transpose(jac_b) + + else: + update_b = self._newton_type_update( + lhs=fim_b, + rhs=jac_b, + psd=False + ) + + if not self.trusted_region_mode: + if compute_a: + if compute_b: + update_raw = tf.concat([update_a, update_b], axis=0) + else: + update_raw = update_a + else: + update_raw = update_b + + update = self._pad_updates( + update_raw=update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + + if batch_features: + indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + update_var = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(update), + shape=(self.model.model_vars.n_features, update.get_shape()[0]) + ) + ) + else: + update_var = update + self.model.params.assign_sub(update_var) + + else: + + n_obs = tf.cast(n_obs, dtype=self._dtype) + # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu + if compute_b: + if compute_a: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=tf.logical_not(self.model.model_vars.converged)) + else: + radius_container = self.tr_radius + tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( + update_b, radius_container, n_obs, self.gd, jac_b, fim_b) + + tr_proposed_vector_a, tr_pred_cost_gain_a = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, n_obs, False, jac_a, fim_a) + + tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) + tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) + + else: + radius_container = self.tr_radius_b if self.gd else self.tr_radius + if batch_features: + radius_container = tf.boolean_mask( + tensor=radius_container, + mask=tf.logical_not(self.model.model_vars.converged)) + + tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( + update_b, radius_container, n_obs, self.gd, jac_b, fim_b) + + # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw + # and tr_pred_cost_gain + tr_update_raw = tr_proposed_vector_b + tr_pred_cost_gain = tr_pred_cost_gain_b + else: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=tf.logical_not(self.model.model_vars.converged)) + else: + radius_container = self.tr_radius + # here train_r is False AND train_mu is true, so the output of the function can directly be applied to + # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False + tr_update_raw, tr_pred_cost_gain = self._calc_proposed_vector_and_pred_cost_gain( + update_a, radius_container, n_obs, False, jac_a, fim_a) + + # perform update + tr_update = self._pad_updates( + update_raw=tr_update_raw, + compute_a=compute_a, + compute_b=compute_b + ) + + self._trust_region_ops( + x_batch, + log_probs, + tr_update, + tr_pred_cost_gain, + compute_a, + compute_b, + batch_features, + prev_ll + ) diff --git a/batchglm/train/tf2/base_glm/processModel.py b/batchglm/train/tf2/base_glm/processModel.py new file mode 100644 index 00000000..4b6aedf7 --- /dev/null +++ 
b/batchglm/train/tf2/base_glm/processModel.py @@ -0,0 +1,9 @@ +from .external import ProcessModelBase +import abc + + +class ProcessModelGLM(ProcessModelBase): + + @abc.abstractmethod + def param_bounds(self, dtype): + pass diff --git a/batchglm/train/tf2/base_glm/training_strategies.py b/batchglm/train/tf2/base_glm/training_strategies.py new file mode 100644 index 00000000..63f295d3 --- /dev/null +++ b/batchglm/train/tf2/base_glm/training_strategies.py @@ -0,0 +1,111 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = None + + DEFAULT = \ + { + "optim_algo": "default_adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + GD = \ + { + "optim_algo": "gd", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAM = \ + { + "optim_algo": "adam", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + ADAGRAD = \ + { + "optim_algo": "adagrad", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + RMSPROP = \ + { + "optim_algo": "rmsprop", + "jacobian": True, + "hessian": False, + "fim": False, + "concat_grads": True + } + + IRLS = \ + { + "optim_algo": "irls", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_TR = \ + { + "optim_algo": "irls_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD = \ + { + "optim_algo": "irls_gd", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + IRLS_GD_TR = \ + { + "optim_algo": "irls_gd_tr", + "jacobian": False, + "hessian": False, + "fim": True, + "concat_grads": False, + "calc_separated": True + } + + NR = \ + { + "optim_algo": "nr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } + + NR_TR = \ + { + "optim_algo": "nr_tr", + "jacobian": False, + "hessian": True, + "fim": False, + "concat_grads": True, + "calc_separated": False + } diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py new file mode 100644 index 00000000..4b0debca --- /dev/null +++ b/batchglm/train/tf2/base_glm/vars.py @@ -0,0 +1,86 @@ +import numpy as np +import tensorflow as tf +import abc + +from .model import ProcessModelGLM + + +class ModelVarsGLM(ProcessModelGLM): + """ Build tf.Variables to be optimzed and their constraints. + + a_var and b_var slices of the tf.Variable params which contains + all parameters to be optimized during model estimation. + Params is defined across both location and scale model so that + the hessian can be computed for the entire model. + a and b are the clipped parameter values which also contain + constraints and constrained dependent coefficients which are not + directly optimized. + """ + + constraints_loc: tf.Tensor + constraints_scale: tf.Tensor + params: tf.Variable + a_var: tf.Tensor + b_var: tf.Tensor + updated: np.ndarray + converged: np.ndarray + dtype: str + n_features: int + + def __init__( + self, + init_a: np.ndarray, + init_b: np.ndarray, + constraints_loc: np.ndarray, + constraints_scale: np.ndarray, + dtype: str + ): + """ + + :param init_a: nd.array (mean model size x features) + Initialisation for all parameters of mean model. + :param init_b: nd.array (dispersion model size x features) + Initialisation for all parameters of dispersion model. + :param dtype: Precision used in tensorflow. 
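+        :param constraints_loc: np.ndarray
+            Constraint matrix of the location model; converted to a tf.Tensor of the given
+            dtype and stored as self.constraints_loc.
+        :param constraints_scale: np.ndarray
+            Constraint matrix of the scale model, handled analogously to constraints_loc.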
+ """ + self.constraints_loc = tf.convert_to_tensor(constraints_loc, dtype) + self.constraints_scale = tf.convert_to_tensor(constraints_scale, dtype) + + self.init_a = tf.convert_to_tensor(init_a, dtype=dtype) + self.init_b = tf.convert_to_tensor(init_b, dtype=dtype) + + self.init_a_clipped = self.tf_clip_param(self.init_a, "a_var") + self.init_b_clipped = self.tf_clip_param(self.init_b, "b_var") + + # Param is the only tf.Variable in the graph. + # a_var and b_var have to be slices of params. + self.params = tf.Variable(tf.concat( + [ + self.init_a_clipped, + self.init_b_clipped, + ], + axis=0 + ), name="params") + + a_var = self.params[0:init_a.shape[0]] + b_var = self.params[init_a.shape[0]:] + + self.a_var = self.tf_clip_param(a_var, "a_var") + self.b_var = self.tf_clip_param(b_var, "b_var") + + # Properties to follow gene-wise convergence. + self.updated = np.repeat(a=True, repeats=self.params.shape[1]) # Initialise to is updated. + self.converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + + self.dtype = dtype + self.n_features = self.params.shape[1] + self.idx_train_loc = np.arange(0, init_a.shape[0]) + self.idx_train_scale = np.arange(init_a.shape[0], init_a.shape[0]+init_b.shape[0]) + + @abc.abstractmethod + def param_bounds(self, dtype): + pass + + def convergence_update(self, status: np.ndarray, features_updated: np.ndarray): + self.converged = status.copy() + self.updated = features_updated diff --git a/batchglm/train/tf2/glm_beta/__init__.py b/batchglm/train/tf2/glm_beta/__init__.py new file mode 100644 index 00000000..a616f181 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import BetaGLM diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py new file mode 100644 index 00000000..d35cdea2 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -0,0 +1,239 @@ +import logging +from typing import Union + +import numpy as np + +from .external import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from .external import InputDataGLM, Model +from .external import Estimator as GLMEstimator +from .model import BetaGLM, LossGLMBeta +from .processModel import ProcessModel +from .vars import ModelVars + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with beta distributed noise. + Uses a logit linker function for loc and log linker function for scale. + """ + + model: BetaGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float64", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. 
Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. + """ + + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + batched_model=True, + batch_size: int = 500, + optimizer: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria="step", + stopping_criteria=1000, + autograd=False, + featurewise = True, + benchmark: bool = False + ): + self.model = BetaGLM(model_vars=self.model_vars, dtype=self.model_vars.dtype, + compute_a=self._train_loc, compute_b=self._train_scale, use_gradient_tape=autograd) + self._loss = LossGLMBeta() + + optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) + self.model.TS = optimizer_enum.value + + super(Estimator, self)._train( + noise_model="beta", + batched_model=batched_model, + batch_size=batch_size, + optimizer_object=optimizer_object, + optimizer_enum=optimizer_enum, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + benchmark=benchmark + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. 
+ + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + size_factors_init = input_data.size_factors + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_beta_glm_logitmean( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=size_factors_init, + link_fn=lambda mean: np.log( + 1/(1/self.np_clip_param(mean, "mean")-1) + ) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for mean") + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) + overall_means = self.np_clip_param(overall_means, "mean") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means/(1-overall_means)) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logging.getLogger("batchglm").debug("Should train mean: %s", self._train_loc) + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=size_factors_init, + groupwise_means=None, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.num_design_loc_params == input_data.num_design_scale_params: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_beta_glm_logsamplesize( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=size_factors_init, + groupwise_means=groupwise_means, + link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + else: + raise ValueError("init_b string %s not recognized" % init_b) + 
logging.getLogger("batchglm").debug("Should train r: %s", self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_beta/external.py b/batchglm/train/tf2/glm_beta/external.py new file mode 100644 index 00000000..f7b5d508 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/external.py @@ -0,0 +1,12 @@ +from batchglm import pkg_constants +import batchglm.data as data_utils + +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_beta import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize +from batchglm.utils.linalg import groupwise_solve_lm + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, LossGLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM + diff --git a/batchglm/train/tf2/glm_beta/layers.py b/batchglm/train/tf2/glm_beta/layers.py new file mode 100644 index 00000000..2eae4735 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers.py @@ -0,0 +1,53 @@ +import tensorflow as tf +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + raise NotImplementedError("There are no size_factors for GLMs with Beta noise.") + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return 1 / (1 + tf.exp(-loc)) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + + if isinstance(x, tf.SparseTensor): + one_minus_x = -tf.sparse.add(x, -tf.ones_like(loc)) + else: + one_minus_x = 1 - x + + one_minus_loc = 1 - loc + log_probs = tf.math.lgamma(scale) - tf.math.lgamma(loc * scale) \ + - tf.math.lgamma(one_minus_loc * scale) \ + + (scale * loc - 1) * tf.math.log(x) \ + + (one_minus_loc * scale - 1) * tf.math.log(one_minus_x) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_beta/layers_gradients.py b/batchglm/train/tf2/glm_beta/layers_gradients.py new file mode 100644 index 00000000..566e9b44 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + # No Fisher Information Matrices due to unsolvable E[log(X)] + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + assert False, "not implemented" + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + if isinstance(x, tf.SparseTensor): + const1 = tf.math.log(tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc)))) + else: + const1 = 
tf.math.log(x / (1 - x)) + const2 = - tf.math.digamma(loc * scale) + tf.math.digamma(one_minus_loc * scale) + const1 + const = const2 * scale * loc * one_minus_loc + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + one_minus_x = - tf.sparse.add(x, -tf.ones_like(loc)) + else: + one_minus_x = 1 - x + one_minus_loc = 1 - loc + const = scale * (tf.math.digamma(scale) - tf.math.digamma(loc * scale) * loc - tf.math.digamma( + one_minus_loc * scale) * one_minus_loc + loc * tf.math.log(x) + one_minus_loc * tf.math.log( + one_minus_x)) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (tf.ones_like(x) - x)) + + const2 = (1 - 2 * loc) * ( + - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1) + const3 = loc * one_minus_loc_times_scale * ( + - tf.math.polygamma(tf.ones_like(loc), loc_times_scale) - tf.math.polygamma(tf.ones_like(loc), + one_minus_loc_times_scale)) + const = loc * one_minus_loc_times_scale * (const2 + const3) + return const + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. + else: + const1 = tf.math.log(x / (1 - x)) + + const2 = - tf.math.digamma(loc_times_scale) + tf.math.digamma(one_minus_loc_times_scale) + const1 + const3 = scale * (- tf.math.polygamma(scalar_one, loc_times_scale) * loc + one_minus_loc * tf.math.polygamma( + scalar_one, + one_minus_loc_times_scale)) + + const = loc * one_minus_loc_times_scale * (const2 + const3) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + one_minus_loc = 1 - loc + loc_times_scale = loc * scale + one_minus_loc_times_scale = one_minus_loc * scale + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + + if isinstance(x, tf.SparseTensor): + # Using the dense matrix of the location model to serve the correct shapes for the sparse X. + const1 = tf.sparse.add(tf.zeros_like(loc), x).__div__(-tf.sparse.add(x, -tf.ones_like(loc))) + # Adding tf.zeros_like(loc) is a hack to avoid bug thrown by log on sparse matrix below, + # to_dense does not work. 
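+            # const1 evaluates x / (1 - x); adding the dense tf.zeros_like(loc) / tf.ones_like(loc)
+            # terms turns the sparse X into a dense tensor of the same shape as loc before dividing.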
+ const2 = loc * (tf.math.log(tf.sparse.add(tf.zeros_like(loc), x)) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + else: + const1 = tf.math.log(x / (1 - x)) + const2 = loc * (tf.math.log(x) - tf.math.digamma(loc_times_scale)) \ + - one_minus_loc * (tf.math.digamma(one_minus_loc_times_scale) + tf.math.log(const1)) \ + + tf.math.digamma(scale) + const3 = scale * (- tf.square(loc) * tf.math.polygamma(scalar_one, loc_times_scale) + + tf.math.polygamma(scalar_one, scale) + - tf.math.polygamma(scalar_one, one_minus_loc_times_scale) * tf.square(one_minus_loc)) + const = scale * (const2 + const3) + + return const diff --git a/batchglm/train/tf2/glm_beta/model.py b/batchglm/train/tf2/glm_beta/model.py new file mode 100644 index 00000000..435c9c53 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/model.py @@ -0,0 +1,44 @@ +import logging + +from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood +from .external import GLM, LossGLM +from .layers_gradients import Jacobian, Hessian, FIM +from .processModel import ProcessModel + +logger = logging.getLogger(__name__) + + +class BetaGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + dtype, + compute_a, + compute_b, + use_gradient_tape + ): + self.compute_a = compute_a + self.compute_b = compute_b + + super(BetaGLM, self).__init__( + model_vars=model_vars, + unpack_params=UnpackParams(), + linear_loc=LinearLoc(), + linear_scale=LinearScale(), + linker_loc=LinkerLoc(), + linker_scale=LinkerScale(), + likelihood=Likelihood(dtype), + jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + use_gradient_tape=use_gradient_tape + + ) + + +class LossGLMBeta(LossGLM): + + """ + Full class + """ diff --git a/batchglm/train/tf2/glm_beta/processModel.py b/batchglm/train/tf2/glm_beta/processModel.py new file mode 100644 index 00000000..c21811a4 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/processModel.py @@ -0,0 +1,45 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + zero = np.nextafter(0, np.inf, dtype=dtype) + one = np.nextafter(1, -np.inf, dtype=dtype) + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(zero / (1 - zero)) / sf, + "b_var": np.log(zero) / sf, + "eta_loc": np.log(zero / (1 - zero)) / sf, + "eta_scale": np.log(zero) / sf, + "mean": np.nextafter(0, np.inf, dtype=dtype), + "samplesize": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(zero), + } + bounds_max = { + "a_var": np.log(one / (1 - one)) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.log(one / (1 - one)) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": one, + "samplesize": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git 
a/batchglm/train/tf2/glm_beta/vars.py b/batchglm/train/tf2/glm_beta/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_beta/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/glm_nb/__init__.py b/batchglm/train/tf2/glm_nb/__init__.py new file mode 100644 index 00000000..f8cd6ee7 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import NBGLM diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py new file mode 100644 index 00000000..3cad4c19 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -0,0 +1,266 @@ +import logging +from typing import Union +import numpy as np + +from .external import InputDataGLM, Model +from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi + +from .model import NBGLM, LossGLMNB +from .vars import ModelVars +from .processModel import ProcessModel +from .external import Estimator as GLMEstimator + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with negative binomial noise. + Uses the natural logarithm as linker function. + """ + model: NBGLM + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float64", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. 
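+        Minimal usage sketch (assuming `input_data` is an already constructed InputDataGLM
+        and that the optimizer string matches one of the TrainingStrategies entries,
+        e.g. "nr_tr" or "irls_gd_tr")::
+
+            estimator = Estimator(input_data=input_data, init_a="AUTO", init_b="AUTO")
+            estimator.train(optimizer="irls_gd_tr", batch_size=500, stopping_criteria=1000)
+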
+ """ + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + batched_model: bool = True, + batch_size: int = 500, + optimizer: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, + benchmark: bool = False + ): + self.model = NBGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd + ) + + self._loss = LossGLMNB() + + optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) + self.model.TS = optimizer_enum.value + + super(Estimator, self)._train( + noise_model="nb", + batched_model=batched_model, + batch_size=batch_size, + optimizer_object=optimizer_object, + optimizer_enum=optimizer_enum, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par( + self, + input_data, + init_a, + init_b, + init_model + ): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. + + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + + Idea: + $$ + \theta &= f(x) \\ + \Rightarrow f^{-1}(\theta) &= x \\ + &= (D \cdot D^{+}) \cdot x \\ + &= D \cdot (D^{+} \cdot x) \\ + &= D \cdot x' = f^{-1}(\theta) + $$ + """ + + size_factors_init = input_data.size_factors + if size_factors_init is not None: + size_factors_init = np.expand_dims(size_factors_init, axis=1) + size_factors_init = np.broadcast_to( + array=size_factors_init, + shape=[input_data.num_observations, input_data.num_features] + ) + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "standard" + + if init_a.lower() == "closed_form": + groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu( + x=input_data.x, + design_loc=input_data.design_loc, + constraints_loc=input_data.constraints_loc, + size_factors=size_factors_init, + link_fn=lambda loc: np.log(self.np_clip_param(loc, "loc")) + ) + + # train mu, if the closed-form solution is inaccurate + self._train_loc = not (np.all(rmsd_a == 0) or rmsd_a.size == 0) + + if input_data.size_factors is not None: + if np.any(input_data.size_factors != 1): + self._train_loc = True + + logging.getLogger("batchglm").debug("Using closed-form MLE initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "standard": + overall_means = np.mean(input_data.x, axis=0) # directly calculate the mean + overall_means = self.np_clip_param(overall_means, "loc") + + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + init_a[0, :] = np.log(overall_means) + self._train_loc = 
True + + logging.getLogger("batchglm").debug("Using standard initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logging.getLogger("batchglm").debug("Using all_zero initialization for mean") + logging.getLogger("batchglm").debug("Should train loc: %s", self._train_loc) + else: + raise ValueError("init_a string %s not recognized" % init_a) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=size_factors_init, + groupwise_means=None, + link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + logging.getLogger("batchglm").debug("Using standard-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=size_factors_init, + groupwise_means=groupwise_means, + link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) + ) + + logging.getLogger("batchglm").debug("Using closed-form MME initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]]) + + logging.getLogger("batchglm").debug("Using standard initialization for dispersion") + logging.getLogger("batchglm").debug("Should train scale: %s", self._train_scale) + else: + raise ValueError("init_b string %s not recognized" % init_b) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_nb/external.py b/batchglm/train/tf2/glm_nb/external.py new file mode 100644 index 00000000..d5c3a2e7 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/external.py @@ -0,0 +1,18 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_nb import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_nb.utils import closedform_nb_glm_logmu, closedform_nb_glm_logphi + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import GLM +from batchglm.train.tf2.base_glm import ProcessModelGLM, ModelVarsGLM + +# import necessary base_glm layers +from batchglm.train.tf2.base_glm import LinearLocGLM, 
LinearScaleGLM, LinkerLocGLM +from batchglm.train.tf2.base_glm import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM +from batchglm.train.tf2.base_glm import LossGLM +from batchglm.train.tf2.base_glm import Estimator diff --git a/batchglm/train/tf2/glm_nb/layers.py b/batchglm/train/tf2/glm_nb/layers.py new file mode 100644 index 00000000..b180c9eb --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from .processModel import ProcessModel +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM +from .external import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. + """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.add(eta_loc, tf.math.log(size_factors)) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full class. + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return tf.exp(loc) + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + + # Log-likelihood: + log_r_plus_mu = tf.math.log(scale + loc) + if isinstance(x, tf.SparseTensor): + log_probs_sparse = x.__mul__(eta_loc - log_r_plus_mu) + log_probs_dense = tf.math.lgamma(tf.sparse.add(x, scale)) - \ + tf.math.lgamma(tf.sparse.add(x, tf.ones(shape=x.dense_shape, dtype=self.ll_dtype))) - \ + tf.math.lgamma(scale) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + log_probs = tf.sparse.add(log_probs_sparse, log_probs_dense) + # log_probs.set_shape([None, n_features]) # need as shape completely lost. 
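+        # Both branches evaluate the same negative binomial log-pmf,
+        #   lgamma(x + r) - lgamma(x + 1) - lgamma(r) + x * log(mu / (r + mu)) + r * log(r / (r + mu)),
+        # with mu = loc and r = scale; because both linkers are exponential, eta_loc and
+        # eta_scale are reused here as log(loc) and log(scale).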
+ else: + log_probs = tf.math.lgamma(scale + x) - \ + tf.math.lgamma(x + tf.ones_like(x)) - \ + tf.math.lgamma(scale) + \ + tf.multiply(x, eta_loc - log_r_plus_mu) + \ + tf.multiply(scale, eta_scale - log_r_plus_mu) + + log_probs = self.tf_clip_param(log_probs, "log_probs") + return log_probs diff --git a/batchglm/train/tf2/glm_nb/layers_gradients.py b/batchglm/train/tf2/glm_nb/layers_gradients.py new file mode 100644 index 00000000..8ff079c6 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/layers_gradients.py @@ -0,0 +1,144 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + const = tf.divide(scale, scale + loc) + w = tf.multiply(loc, const) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + return tf.zeros_like(scale) + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + const = tf.sparse.add(x, tf.negative(loc)) + else: + const = tf.subtract(x, loc) + return tf.divide(tf.multiply(scale, const), tf.add(loc, scale)) + + def _weights_jac_b(self, x, loc, scale): + # Pre-define sub-graphs that are used multiple times: + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): # or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = scale + x + + r_plus_mu = scale + loc + + # Define graphs for individual terms of constant term of hessian: + const1 = tf.subtract( + tf.math.digamma(x=scale_plus_x), + tf.math.digamma(x=scale) + ) + const2 = tf.negative(scale_plus_x / r_plus_mu) + const3 = tf.add( + tf.math.log(scale), + scalar_one - tf.math.log(r_plus_mu) + ) + const = tf.add_n([const1, const2, const3]) # [observations, features] + const = scale * const + + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab(self, x, loc, scale): + + if isinstance(x, tf.SparseTensor): + x_minus_mu = tf.sparse.add(x, -loc) + else: + x_minus_mu = x - loc + + const = tf.multiply( + loc * scale, + tf.divide( + x_minus_mu, + tf.square(loc + scale) + ) + ) + + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + x_by_scale_plus_one = tf.sparse.add(x.__div__(scale), tf.ones_like(scale)) + else: + x_by_scale_plus_one = x / scale + tf.ones_like(scale) + + const = tf.negative(tf.multiply( + loc, + tf.divide( + x_by_scale_plus_one, + tf.square((loc / scale) + tf.ones_like(loc)) + ) + )) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor):# or isinstance(x, tf.SparseTensorValue): + scale_plus_x = tf.sparse.add(x, scale) + else: + scale_plus_x = x + scale + + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + # Pre-define sub-graphs that are used multiple times: + scale_plus_loc = scale + loc + # Define graphs for individual terms of constant term of hessian: + const1 = tf.add( + tf.math.digamma(x=scale_plus_x), + scale * tf.math.polygamma(a=scalar_one, x=scale_plus_x) + ) + const2 = tf.negative(tf.add( + tf.math.digamma(x=scale), + scale * tf.math.polygamma(a=scalar_one, x=scale) + )) + const3 = tf.negative(tf.divide( + tf.add( + loc * scale_plus_x, + scalar_two * scale * scale_plus_loc + ), + tf.square(scale_plus_loc) + )) + const4 = tf.add( + 
tf.math.log(scale), + scalar_two - tf.math.log(scale_plus_loc) + ) + const = tf.add_n([const1, const2, const3, const4]) + const = tf.multiply(scale, const) + return const diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py new file mode 100644 index 00000000..665696ab --- /dev/null +++ b/batchglm/train/tf2/glm_nb/model.py @@ -0,0 +1,43 @@ +import logging + +from .external import LossGLM, GLM +from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood +from .layers_gradients import Jacobian, Hessian, FIM + +from .processModel import ProcessModel + +logger = logging.getLogger(__name__) + + +class NBGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + dtype, + compute_a, + compute_b, + use_gradient_tape + ): + self.compute_a = compute_a + self.compute_b = compute_b + + super(NBGLM, self).__init__( + model_vars=model_vars, + unpack_params=UnpackParams(), + linear_loc=LinearLoc(), + linear_scale=LinearScale(), + linker_loc=LinkerLoc(), + linker_scale=LinkerScale(), + likelihood=Likelihood(dtype), + jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + use_gradient_tape=use_gradient_tape + ) + + +class LossGLMNB(LossGLM): + """ + Full class + """ diff --git a/batchglm/train/tf2/glm_nb/processModel.py b/batchglm/train/tf2/glm_nb/processModel.py new file mode 100644 index 00000000..6a177f7f --- /dev/null +++ b/batchglm/train/tf2/glm_nb/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "loc": np.nextafter(0, np.inf, dtype=dtype), + "scale": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_nb/vars.py b/batchglm/train/tf2/glm_nb/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_nb/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. 
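+    Parameter clipping comes from ProcessModel.param_bounds; the trainable `params`
+    variable and its a_var/b_var slices are set up by ModelVarsGLM.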
+ """ diff --git a/batchglm/train/tf2/glm_norm/__init__.py b/batchglm/train/tf2/glm_norm/__init__.py new file mode 100644 index 00000000..b6bf02af --- /dev/null +++ b/batchglm/train/tf2/glm_norm/__init__.py @@ -0,0 +1,5 @@ +from .processModel import ProcessModel +from .vars import ModelVars +from .estimator import Estimator + +from .model import NormGLM diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py new file mode 100644 index 00000000..cdd32b0f --- /dev/null +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -0,0 +1,284 @@ +import logging +import numpy as np +import scipy.sparse +from typing import Union + +from .external import closedform_norm_glm_logsd +from .external import InputDataGLM, Model +from .external import Estimator as GLMEstimator +from .model import NormGLM, LossGLMNorm +from .processModel import ProcessModel +from .vars import ModelVars + + +logger = logging.getLogger("batchglm") + + +class Estimator(GLMEstimator, ProcessModel): + """ + Estimator for Generalized Linear Models (GLMs) with normal distributed noise. + Uses the identity function as linker function for loc and a log-linker function for scale. + """ + + model: NormGLM + loss: LossGLMNorm + + def __init__( + self, + input_data: InputDataGLM, + init_a: Union[np.ndarray, str] = "AUTO", + init_b: Union[np.ndarray, str] = "AUTO", + quick_scale: bool = False, + dtype="float64", + ): + """ + Performs initialisation and creates a new estimator. + + :param input_data: InputDataGLM + The input data + :param init_a: (Optional) + Low-level initial values for a. Can be: + + - str: + * "auto": automatically choose best initialization + * "all zero": initialize with zeros + * "random": initialize with random values + * "standard": initialize intercept with observed mean + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'a' + :param init_b: (Optional) + Low-level initial values for b. Can be: + + - str: + * "auto": automatically choose best initialization + * "random": initialize with random values + * "standard": initialize with zeros + * "init_model": initialize with another model (see `ìnit_model` parameter) + * "closed_form": try to initialize with closed form + - np.ndarray: direct initialization of 'b' + :param quick_scale: bool + Whether `scale` will be fitted faster and maybe less accurate. + Useful in scenarios where fitting the exact `scale` is not absolutely necessary. + :param dtype: Precision used in tensorflow. 
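A minimal usage sketch for this estimator (illustrative only, not part of the patch; it assumes `input_data` is an InputDataGLM built elsewhere and relies on the initialize()/train()/finalize() methods provided by the estimator base classes in this series):

estimator = Estimator(input_data=input_data, init_a="auto", init_b="auto", quick_scale=False, dtype="float64")
estimator.initialize()
estimator.train(batched_model=False, optimizer="irls_tr", convergence_criteria="step", stopping_criteria=1000)
estimator.finalize()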
+ """ + + self._train_loc = True + self._train_scale = True + + (init_a, init_b) = self.init_par( + input_data=input_data, + init_a=init_a, + init_b=init_b, + init_model=None + ) + init_a = init_a.astype(dtype) + init_b = init_b.astype(dtype) + if quick_scale: + self._train_scale = False + + self.model_vars = ModelVars( + init_a=init_a, + init_b=init_b, + constraints_loc=input_data.constraints_loc, + constraints_scale=input_data.constraints_scale, + dtype=dtype + ) + + super(Estimator, self).__init__( + input_data=input_data, + dtype=dtype + ) + + def train( + self, + batched_model=True, + batch_size: int = 500, + optimizer: str = "adam", + learning_rate: float = 1e-2, + convergence_criteria="step", + stopping_criteria=1000, + autograd=False, + featurewise = True, + benchmark: bool = False + ): + + self.model = NormGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd + ) + + self._loss = LossGLMNorm() + + optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) + self.model.TS = optimizer_enum.value + + super(Estimator, self)._train( + noise_model="norm", + batched_model=batched_model, + batch_size=batch_size, + optimizer_object=optimizer_object, + optimizer_enum=optimizer_enum, + convergence_criteria=convergence_criteria, + stopping_criteria=stopping_criteria, + autograd=autograd, + featurewise=featurewise, + benchmark=benchmark + + ) + + def get_model_container( + self, + input_data + ): + return Model(input_data=input_data) + + def init_par(self, input_data, init_a, init_b, init_model): + r""" + standard: + Only initialise intercept and keep other coefficients as zero. + + closed-form: + Initialize with Maximum Likelihood / Maximum of Momentum estimators + """ + + size_factors_init = input_data.size_factors + if size_factors_init is not None: + size_factors_init = np.expand_dims(size_factors_init, axis=1) + size_factors_init = np.broadcast_to( + array=size_factors_init, + shape=[input_data.num_observations, input_data.num_features] + ) + + sf_given = False + if input_data.size_factors is not None: + if np.any(np.abs(input_data.size_factors - 1.) > 1e-8): + sf_given = True + + is_ols_model = input_data.design_scale.shape[1] == 1 and \ + np.all(np.abs(input_data.design_scale - 1.) < 1e-8) and not sf_given + + if init_model is None: + groupwise_means = None + init_a_str = None + if isinstance(init_a, str): + init_a_str = init_a.lower() + # Chose option if auto was chosen + if init_a.lower() == "auto": + init_a = "closed_form" + + if init_a.lower() == "closed_form" or init_a.lower() == "standard": + design_constr = np.matmul(input_data.design_loc, input_data.constraints_loc) + # Iterate over genes if X is sparse to avoid large sparse tensor. + # If X is dense, the least square problem can be vectorised easily. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + input_data.x.T.dot(design_constr).T, # need double .T because of dot product on sparse. 
+ rcond=None + ) + else: + init_a, rmsd_a, _, _ = np.linalg.lstsq( + np.matmul(design_constr.T, design_constr), + np.matmul(design_constr.T, input_data.x), + rcond=None + ) + groupwise_means = None + if is_ols_model: + self._train_loc = False + + logger.debug("Using OLS initialization for location model") + elif init_a.lower() == "all_zero": + init_a = np.zeros([input_data.num_loc_params, input_data.num_features]) + self._train_loc = True + + logger.debug("Using all_zero initialization for mean") + else: + raise ValueError("init_a string %s not recognized" % init_a) + logger.debug("Should train location model: %s", self._train_loc) + + if isinstance(init_b, str): + if init_b.lower() == "auto": + init_b = "standard" + + if is_ols_model: + # Calculated variance via E(x)^2 or directly depending on whether `mu` was specified. + if isinstance(input_data.x, scipy.sparse.csr_matrix): + expect_xsq = np.asarray(np.mean(input_data.x.power(2), axis=0)) + else: + expect_xsq = np.expand_dims(np.mean(np.square(input_data.x), axis=0), axis=0) + mean_model = np.matmul( + np.matmul(input_data.design_loc, input_data.constraints_loc), + init_a + ) + expect_x_sq = np.mean(np.square(mean_model), axis=0) + variance = (expect_xsq - expect_x_sq) + init_b = np.log(np.sqrt(variance)) + self._train_scale = False + + logger.debug("Using residuals from OLS estimate for variance estimate") + elif init_b.lower() == "closed_form": + dmats_unequal = False + if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]: + if np.any(input_data.design_loc != input_data.design_scale): + dmats_unequal = True + + inits_unequal = False + if init_a_str is not None: + if init_a_str != init_b: + inits_unequal = True + + # Watch out: init_mean is full obs x features matrix and is very large in many cases. 
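The closed-form/OLS branch above solves the normal equations of the location model with np.linalg.lstsq and then derives the log standard deviation from E[x^2] - E[x_fitted^2]. A small, self-contained illustration with a hypothetical intercept-only design (toy numbers, not part of the diff):

import numpy as np

x = np.array([[1.0], [2.0], [3.0], [4.0]])     # 4 observations, 1 feature (hypothetical)
design_constr = np.ones((4, 1))                # intercept-only design; constraints omitted for brevity
init_a, *_ = np.linalg.lstsq(design_constr.T @ design_constr, design_constr.T @ x, rcond=None)
mean_model = design_constr @ init_a            # fitted means
variance = np.mean(x ** 2, axis=0) - np.mean(mean_model ** 2, axis=0)
init_b = np.log(np.sqrt(variance))             # log-sd initialisation, as in the OLS branch
assert np.isclose(init_a[0, 0], x.mean()) and np.isclose(variance[0], x.var())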
+ if inits_unequal or dmats_unequal: + raise ValueError( + "cannot use closed_form init for scale model \ + if scale model differs from loc model" + ) + + groupwise_scales, init_b, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale, + constraints=input_data.constraints_scale, + size_factors=size_factors_init, + groupwise_means=groupwise_means, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + elif init_b.lower() == "standard": + groupwise_scales, init_b_intercept, rmsd_b = closedform_norm_glm_logsd( + x=input_data.x, + design_scale=input_data.design_scale[:, [0]], + constraints=input_data.constraints_scale[[0], :][:, [0]], + size_factors=size_factors_init, + groupwise_means=None, + link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) + ) + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + init_b[0, :] = init_b_intercept + + # train scale, if the closed-form solution is inaccurate + self._train_scale = not (np.all(rmsd_b == 0) or rmsd_b.size == 0) + + logger.debug("Using closed-form MME initialization for standard deviation") + logger.debug("Should train sd: %s", self._train_scale) + elif init_b.lower() == "all_zero": + init_b = np.zeros([input_data.num_scale_params, input_data.num_features]) + + logger.debug("Using standard initialization for standard deviation") + else: + raise ValueError("init_b string %s not recognized" % init_b) + logger.debug("Should train sd: %s", self._train_scale) + else: + init_a, init_b = self.get_init_from_model(init_a=init_a, + init_b=init_b, + input_data=input_data, + init_model=init_model) + + return init_a, init_b diff --git a/batchglm/train/tf2/glm_norm/external.py b/batchglm/train/tf2/glm_norm/external.py new file mode 100644 index 00000000..4b290d2e --- /dev/null +++ b/batchglm/train/tf2/glm_norm/external.py @@ -0,0 +1,12 @@ +import batchglm.data as data_utils + +from batchglm.models.glm_norm import _EstimatorGLM, InputDataGLM, Model +from batchglm.models.base_glm.utils import closedform_glm_mean, closedform_glm_scale +from batchglm.models.glm_norm.utils import closedform_norm_glm_mean, closedform_norm_glm_logsd + +from batchglm.utils.linalg import groupwise_solve_lm +from batchglm import pkg_constants + +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, LossGLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM diff --git a/batchglm/train/tf2/glm_norm/layers.py b/batchglm/train/tf2/glm_norm/layers.py new file mode 100644 index 00000000..ba067352 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers.py @@ -0,0 +1,49 @@ +import tensorflow as tf +import numpy as np +from .external import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM +from .processModel import ProcessModel + + +class UnpackParams(UnpackParamsGLM, ProcessModel): + """ + Full class. 
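The Likelihood layer defined further down in this file uses the identity linker for the mean and a log linker for the standard deviation, so eta_scale equals log(sd). A quick illustrative check of the resulting log-density against scipy (hypothetical values, not part of the diff):

import numpy as np
from scipy import stats

x, mean, sd = 1.3, 1.0, 0.5
eta_scale = np.log(sd)                          # scale enters through a log linker
ll = -0.5 * np.log(2 * np.pi) - eta_scale - 0.5 * ((x - mean) / sd) ** 2
assert np.isclose(ll, stats.norm.logpdf(x, loc=mean, scale=sd))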
+ """ + + +class LinearLoc(LinearLocGLM, ProcessModel): + + def with_size_factors(self, eta_loc, size_factors): + return tf.multiply(eta_loc, size_factors) + + +class LinearScale(LinearScaleGLM, ProcessModel): + """ + Full Class + """ + + +class LinkerLoc(LinkerLocGLM): + + def _inv_linker(self, loc: tf.Tensor): + return loc + + +class LinkerScale(LinkerScaleGLM): + + def _inv_linker(self, scale: tf.Tensor): + return tf.math.exp(scale) + + +class Likelihood(LikelihoodGLM, ProcessModel): + + def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + + const = tf.constant(-0.5 * np.log(2 * np.pi), shape=(), dtype=self.ll_dtype) + if isinstance(x, tf.SparseTensor): + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(tf.sparse.add(x, - loc), scale)) + # log_probs.set_shape([None, a_var.shape[1]]) # Need this so as shape is completely lost. + else: + log_probs = const - eta_scale - 0.5 * tf.math.square(tf.divide(x - loc, scale)) + log_probs = self.tf_clip_param(log_probs, "log_probs") + + return log_probs diff --git a/batchglm/train/tf2/glm_norm/layers_gradients.py b/batchglm/train/tf2/glm_norm/layers_gradients.py new file mode 100644 index 00000000..e2b35119 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/layers_gradients.py @@ -0,0 +1,116 @@ +import tensorflow as tf +from .external import FIMGLM, JacobianGLM, HessianGLM + + +class FIM(FIMGLM): + + def _weight_fim_aa( + self, + x, + loc, + scale + ): + w = tf.square(tf.divide(tf.ones_like(scale), scale)) + + return w + + def _weight_fim_bb( + self, + x, + loc, + scale + ): + w = tf.constant(2, shape=loc.shape, dtype=self.dtype) + + return w + + +class Jacobian(JacobianGLM): + + def _weights_jac_a( + self, + x, + loc, + scale, + ): + if isinstance(x, tf.SparseTensor): + const1 = tf.sparse.add(x, -loc) + const = tf.divide(const1, tf.square(scale)) + else: + const1 = tf.subtract(x, loc) + const = tf.divide(const1, tf.square(scale)) + return const + + def _weights_jac_b( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.sparse.add(x, -loc), scale) + ) + else: + const = tf.negative(scalar_one) + tf.math.square( + tf.divide(tf.subtract(x, loc), scale) + ) + return const + + +class Hessian(HessianGLM): + + def _weight_hessian_ab( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply(scalar_two, + tf.divide( + x_minus_loc, + tf.square(scale) + ) + ) + return const + + def _weight_hessian_aa( + self, + x, + loc, + scale, + ): + scalar_one = tf.constant(1, shape=(), dtype=self.dtype) + const = - tf.divide(scalar_one, tf.square(scale)) + + return const + + def _weight_hessian_bb( + self, + x, + loc, + scale, + ): + scalar_two = tf.constant(2, shape=(), dtype=self.dtype) + if isinstance(x, tf.SparseTensor): + x_minus_loc = tf.sparse.add(x, -loc) + else: + x_minus_loc = x - loc + + const = - tf.multiply( + scalar_two, + tf.math.square( + tf.divide( + x_minus_loc, + scale + ) + ) + ) + return const diff --git a/batchglm/train/tf2/glm_norm/model.py b/batchglm/train/tf2/glm_norm/model.py new file mode 100644 index 00000000..e5b74734 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/model.py @@ -0,0 +1,55 @@ +import logging + +from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood +from 
.layers_gradients import Jacobian, Hessian, FIM +from .external import GLM, LossGLM +from .processModel import ProcessModel + +logger = logging.getLogger(__name__) + + +class NormGLM(GLM, ProcessModel): + + def __init__( + self, + model_vars, + dtype, + compute_a, + compute_b, + use_gradient_tape + ): + self.compute_a = compute_a + self.compute_b = compute_b + + super(NormGLM, self).__init__( + model_vars=model_vars, + unpack_params=UnpackParams(), + linear_loc=LinearLoc(), + linear_scale=LinearScale(), + linker_loc=LinkerLoc(), + linker_scale=LinkerScale(), + likelihood=Likelihood(dtype), + jacobian=Jacobian( + model_vars=model_vars, + compute_a=self.compute_a, + compute_b=self.compute_b, + dtype=dtype), + hessian=Hessian( + model_vars=model_vars, + compute_a=self.compute_a, + compute_b=self.compute_b, + dtype=dtype), + fim=FIM( + model_vars=model_vars, + compute_a=self.compute_a, + compute_b=self.compute_b, + dtype=dtype), + use_gradient_tape=use_gradient_tape + ) + + +class LossGLMNorm(LossGLM): + + """ + Full class + """ diff --git a/batchglm/train/tf2/glm_norm/processModel.py b/batchglm/train/tf2/glm_norm/processModel.py new file mode 100644 index 00000000..629099ff --- /dev/null +++ b/batchglm/train/tf2/glm_norm/processModel.py @@ -0,0 +1,42 @@ +from .external import ProcessModelGLM +import tensorflow as tf +import numpy as np +from .external import pkg_constants + + +class ProcessModel(ProcessModelGLM): + + def param_bounds( + self, + dtype + ): + if isinstance(dtype, tf.DType): + dmax = dtype.max + dtype = dtype.as_numpy_dtype + else: + dtype = np.dtype(dtype) + dmax = np.finfo(dtype).max + dtype = dtype.type + + sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT) + bounds_min = { + "a_var": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "eta_loc": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf, + "mean": np.nextafter(-dmax, np.inf, dtype=dtype) / sf, + "sd": np.nextafter(0, np.inf, dtype=dtype), + "probs": dtype(0), + "log_probs": np.log(np.nextafter(0, np.inf, dtype=dtype)), + } + bounds_max = { + "a_var": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "eta_loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf, + "mean": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "sd": np.nextafter(dmax, -np.inf, dtype=dtype) / sf, + "probs": dtype(1), + "log_probs": dtype(0), + } + return bounds_min, bounds_max diff --git a/batchglm/train/tf2/glm_norm/vars.py b/batchglm/train/tf2/glm_norm/vars.py new file mode 100644 index 00000000..b1200abc --- /dev/null +++ b/batchglm/train/tf2/glm_norm/vars.py @@ -0,0 +1,8 @@ +from .model import ProcessModel +from .external import ModelVarsGLM + + +class ModelVars(ProcessModel, ModelVarsGLM): + """ + Full class. + """ diff --git a/batchglm/train/tf2/ops.py b/batchglm/train/tf2/ops.py new file mode 100644 index 00000000..56fbf48b --- /dev/null +++ b/batchglm/train/tf2/ops.py @@ -0,0 +1,59 @@ +import tensorflow as tf +from typing import Union + + +def swap_dims(tensor, axis0, axis1, exec_transpose=True, return_perm=False, name="swap_dims"): + """ + Swaps two dimensions in a given tensor. 
+ + :param tensor: The tensor whose axes should be swapped + :param axis0: The first axis which should be swapped with `axis1` + :param axis1: The second axis which should be swapped with `axis0` + :param exec_transpose: Should the transpose operation be applied? + :param return_perm: Should the permutation argument for `tf.transpose` be returned? + Autmoatically true, if `exec_transpose` is False + :param name: The name scope of this op + :return: either retval, (retval, permutation) or permutation + """ + with tf.name_scope(name): + rank = tf.range(tf.rank(tensor)) + idx0 = rank[axis0] + idx1 = rank[axis1] + perm0 = tf.where(tf.equal(rank, idx0), tf.tile(tf.expand_dims(idx1, 0), [tf.size(rank)]), rank) + perm1 = tf.where(tf.equal(rank, idx1), tf.tile(tf.expand_dims(idx0, 0), [tf.size(rank)]), perm0) + + if exec_transpose: + retval = tf.transpose(tensor, perm1) + + if return_perm: + return retval, perm1 + else: + return retval + else: + return perm1 + + +def stacked_lstsq(L, b, rcond=1e-10, name="stacked_lstsq"): + r""" + Solve `Lx = b`, via SVD least squares cutting of small singular values + + :param L: tensor of shape (..., M, K) + :param b: tensor of shape (..., M, N). + :param rcond: threshold for inverse + :param name: name scope of this op + :return: x of shape (..., K, N) + """ + with tf.name_scope(name): + u, s, v = tf.linalg.svd(L, full_matrices=False) + s_max = s.max(axis=-1, keepdims=True) + s_min = rcond * s_max + + inv_s = tf.where(s >= s_min, tf.reciprocal(s), 0) + + x = tf.einsum( + '...MK,...MN->...KN', + v, + tf.einsum('...K,...MK,...MN->...KN', inv_s, u, b) + ) + + return tf.conj(x) From dbec1ced9dde1926a2163a73b0227e35b39d9cf1 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 11 Nov 2019 10:46:30 +0100 Subject: [PATCH 002/124] rm trainingStrategies, integrated them into model --- batchglm/train/tf2/base/estimator.py | 6 +-- batchglm/train/tf2/base/external.py | 1 - batchglm/train/tf2/base_glm/__init__.py | 1 - batchglm/train/tf2/base_glm/estimator.py | 66 ++++++++++-------------- batchglm/train/tf2/base_glm/model.py | 54 +++++++++++-------- batchglm/train/tf2/glm_beta/estimator.py | 17 +++--- batchglm/train/tf2/glm_beta/model.py | 6 ++- batchglm/train/tf2/glm_nb/estimator.py | 10 ++-- batchglm/train/tf2/glm_nb/model.py | 6 ++- batchglm/train/tf2/glm_norm/estimator.py | 7 ++- batchglm/train/tf2/glm_norm/model.py | 6 ++- 11 files changed, 95 insertions(+), 85 deletions(-) diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py index 15fc0906..ef9899ef 100644 --- a/batchglm/train/tf2/base/estimator.py +++ b/batchglm/train/tf2/base/estimator.py @@ -1,4 +1,4 @@ -from .external import pkg_constants, TrainingStrategies +from .external import pkg_constants from .model import ModelBase, LossBase import numpy as np @@ -19,12 +19,12 @@ def _train( batched_model: bool, batch_size: int, optimizer_object: tf.keras.optimizers.Optimizer, - optimizer_enum: TrainingStrategies, convergence_criteria: str, stopping_criteria: int, autograd: bool, featurewise: bool, - benchmark: bool + benchmark: bool, + optimizer: str ): pass diff --git a/batchglm/train/tf2/base/external.py b/batchglm/train/tf2/base/external.py index 08784cca..9133cd4d 100644 --- a/batchglm/train/tf2/base/external.py +++ b/batchglm/train/tf2/base/external.py @@ -1,5 +1,4 @@ #from batchglm.models.base import _Estimator_Base #from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet -from batchglm.train.tf2.base_glm.training_strategies import TrainingStrategies #import 
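The stacked_lstsq helper in ops.py above still relies on TF1-era calls: Tensor.max() is not a method in TF2, tf.reciprocal and tf.conj now live under tf.math, and tf.linalg.svd returns the singular values first rather than in NumPy's (u, s, vh) order. A TF2-compatible sketch of the same SVD-based solver, offered as an illustration rather than a drop-in replacement for the function in the patch:

import tensorflow as tf

def stacked_lstsq_tf2(L, b, rcond=1e-10, name="stacked_lstsq"):
    """Solve Lx = b in the least-squares sense, discarding small singular values."""
    with tf.name_scope(name):
        s, u, v = tf.linalg.svd(L, full_matrices=False)        # note the return order in TF2
        s_max = tf.reduce_max(s, axis=-1, keepdims=True)
        inv_s = tf.where(s >= rcond * s_max, tf.math.reciprocal(s), tf.zeros_like(s))
        # pinv(L) @ b = V @ diag(1/s) @ U^H @ b
        ut_b = tf.einsum('...MK,...MN->...KN', tf.math.conj(u), b)
        return tf.matmul(v, inv_s[..., :, tf.newaxis] * ut_b)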
batchglm.utils.stats as stat_utils from batchglm import pkg_constants diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py index a662e17d..f87c8915 100644 --- a/batchglm/train/tf2/base_glm/__init__.py +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -7,4 +7,3 @@ from .layers import LikelihoodGLM, UnpackParamsGLM from .layers_gradients import JacobianGLM, HessianGLM, FIMGLM from .optim import NR, IRLS -from .training_strategies import TrainingStrategies diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 11cddd75..a993b3de 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -4,7 +4,6 @@ import scipy import tensorflow as tf from .model import GLM -from .training_strategies import TrainingStrategies from .external import TFEstimator, _EstimatorGLM from .optim import NR, IRLS from .external import pkg_constants @@ -61,38 +60,25 @@ def __init__( input_data=input_data ) - def train_sequence(self, training_strategy: []): - for strategy in training_strategy: - self.train( - batched_model=strategy['use_batching'], - optimizer=strategy['optim_algo'], - convergence_criteria=strategy['convergence_criteria'], - stopping_criteria=strategy['stopping_criteria'], - batch_size=strategy['batch_size'] if 'batch_size' in strategy else 500, - learning_rate=strategy['learning_rate'] if 'learning_rate' in strategy else 1e-2, - autograd=strategy['autograd'] if 'autograd' in strategy else False, - featurewise=strategy['featurewise'] if 'featurewise' in strategy else True - ) - def _train( self, noise_model: str, batched_model: bool = True, batch_size: int = 500, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), - optimizer_enum: TrainingStrategies = TrainingStrategies.DEFAULT, convergence_criteria: str = "step", stopping_criteria: int = 1000, autograd: bool = False, featurewise: bool = True, benchmark: bool = False, + optimizer: str = "adam" ): if not self._initialized: raise RuntimeError("Cannot train the model: \ Estimator not initialized. Did you forget to call estimator.initialize() ?") - if autograd and optimizer_enum.value['hessian']: + if autograd and optimizer in ['nr', 'nr_tr']: logger.warning("Automatic differentiation is currently not supported for hessians. \ Falling back to closed form. 
Only Jacobians are calculated using autograd.") @@ -129,14 +115,18 @@ def convergence_decision(convergence_status, train_step): ll_current = np.zeros([self._input_data.num_features], self.dtype) + np.nextafter(np.inf, 0, dtype=self.dtype) dataset_iterator = iter(input_list) - calc_separated = False - if optimizer_enum.value["hessian"] is True or optimizer_enum.value["fim"] is True: - second_order_optim = True - calc_separated = optimizer_enum.value['calc_separated'] + irls_algo = False + nr_algo = False + if optimizer.lower() in ['nr','nr_tr']: + nr_algo = True update_func = optimizer_object.perform_parameter_update + + elif optimizer.lower() in ['irls','irls_tr','irls_gd','irls_gd_tr']: + irls_algo = True + update_func = optimizer_object.perform_parameter_update + else: update_func = optimizer_object.apply_gradients - second_order_optim = False n_obs = self._input_data.num_observations curr_norm_loc = np.sqrt(np.sum(np.square( @@ -176,8 +166,8 @@ def convergence_decision(convergence_status, train_step): x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) results = self.model(x_batch) - if second_order_optim: - if calc_separated: + if irls_algo or nr_algo: + if irls_algo: update_func([x_batch, *results, False, n_obs], True, False, batch_features, ll_prev) if self._train_scale: update_func([x_batch, *results, False, n_obs], False, True, batch_features, ll_prev) @@ -215,9 +205,9 @@ def convergence_decision(convergence_status, train_step): jac_normalization = batch_size else: jac_normalization = self._input_data.num_observations - if optimizer_enum.value["optim_algo"] in ['irls', 'irls_gd', 'irls_gd_tr', 'irls_tr']: + if irls_algo: grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) - elif optimizer_enum.value["optim_algo"] in ['nr', 'nr_tr']: + elif nr_algo: grad_numpy = tf.abs(results[1]) else: grad_numpy = tf.abs(tf.transpose(results[1])) @@ -253,10 +243,10 @@ def convergence_decision(convergence_status, train_step): self._fisher_inv = tf.zeros(shape=()).numpy() self._hessian = tf.zeros(shape=()).numpy() - if optimizer_enum.value["hessian"] is True: + if nr_algo: self._hessian = results[2].numpy() self._jacobian = results[1].numpy() - elif optimizer_enum.value["fim"] is True: + elif irls_algo: self._fisher_inv = tf.concat([results[3], results[4]], axis=0).numpy() self._jacobian = tf.concat([results[1], results[2]], axis=0).numpy() else: @@ -339,46 +329,46 @@ def get_optimizer_object(self, optimizer, learning_rate): optimizer = optimizer.lower() if optimizer == "gd": - return tf.keras.optimizers.SGD(learning_rate=learning_rate), TrainingStrategies.GD + return tf.keras.optimizers.SGD(learning_rate=learning_rate) if optimizer == "adam": - return tf.keras.optimizers.Adam(learning_rate=learning_rate), TrainingStrategies.ADAM + return tf.keras.optimizers.Adam(learning_rate=learning_rate) if optimizer == "adagrad": - return tf.keras.optimizers.Adagrad(learning_rate=learning_rate), TrainingStrategies.ADAGRAD + return tf.keras.optimizers.Adagrad(learning_rate=learning_rate) if optimizer == "rmsprop": - return tf.keras.optimizers.RMSprop(learning_rate=learning_rate), TrainingStrategies.RMSPROP + return tf.keras.optimizers.RMSprop(learning_rate=learning_rate) if optimizer == "irls": return IRLS(dtype=self.dtype, trusted_region_mode=False, model=self.model, - name="IRLS"), TrainingStrategies.IRLS + name="IRLS") if optimizer == "irls_tr": return IRLS(dtype=self.dtype, trusted_region_mode=True, model=self.model, - name="IRLS_TR"), TrainingStrategies.IRLS_TR + 
name="IRLS_TR") if optimizer == "irls_gd": return IRLS(dtype=self.dtype, trusted_region_mode=False, model=self.model, - name="IRLS_GD"), TrainingStrategies.IRLS_GD + name="IRLS_GD") if optimizer == "irls_gd_tr": return IRLS(dtype=self.dtype, trusted_region_mode=True, model=self.model, - name="IRLS_GD_TR"), TrainingStrategies.IRLS_GD_TR + name="IRLS_GD_TR") if optimizer == "nr": return NR(dtype=self.dtype, trusted_region_mode=False, model=self.model, - name="NR"), TrainingStrategies.NR + name="NR") if optimizer == "nr_tr": return NR(dtype=self.dtype, trusted_region_mode=True, model=self.model, - name="NR_TR"), TrainingStrategies.NR_TR + name="NR_TR") logger.warning("No valid optimizer given. Default optimizer Adam chosen.") - return tf.keras.optimizers.Adam(learning_rate=learning_rate), TrainingStrategies.ADAM + return tf.keras.optimizers.Adam(learning_rate=learning_rate) def fetch_fn(self, idx): """ diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index cbf2d6d1..0d0e1fb4 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -3,8 +3,6 @@ import numpy as np from .external import ModelBase, LossBase from .processModel import ProcessModelGLM -from .training_strategies import TrainingStrategies - logger = logging.getLogger("batchglm") @@ -14,7 +12,6 @@ class GLM(ModelBase, ProcessModelGLM): base GLM class containg the model call. """ - TS: {} = TrainingStrategies.DEFAULT.value compute_a: bool = True compute_b: bool = True @@ -30,7 +27,8 @@ def __init__( jacobian: tf.keras.layers.Layer, hessian: tf.keras.layers.Layer, fim: tf.keras.layers.Layer, - use_gradient_tape: bool = False + optimizer: str, + use_gradient_tape: bool = False, ): super(GLM, self).__init__() self.model_vars = model_vars @@ -55,6 +53,26 @@ def __init__( self.params_copy = None self.batch_features = False + self.calc_jacobian = False + self.calc_hessian = False + self.calc_fim = False + self.concat_grads = True + + self._setParams(optimizer) + + def _setParams(self, optimizer): + + optimizer = optimizer.lower() + if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: + self.calc_jacobian = True + + elif optimizer in ['nr', 'nr_tr']: + self.calc_hessian = True + + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr']: + self.calc_fim = True + self.concat_grads = False + def _call_parameters(self, inputs, keep_previous_params_copy=False): if not keep_previous_params_copy: if self.batch_features: @@ -142,17 +160,14 @@ def call(self, inputs, training=False, mask=None): # This is for first order optimizations, which get the full jacobian - concat = self.TS["concat_grads"] - - if self.TS["jacobian"] is True: - _, _, log_probs, jacobians = self._calc_jacobians(inputs, concat=concat) + if self.calc_jacobian: + _, _, log_probs, jacobians = self._calc_jacobians(inputs, concat=self.concat_grads) return log_probs, jacobians # This is for SecondOrder NR/NR_TR - if self.TS["hessian"] is True: - + if self.calc_hessian: # with tf.GradientTape(persistent=True) as g2: - if concat: + if self.concat_grads: loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) else: loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) @@ -186,21 +201,18 @@ def call(self, inputs, training=False, mask=None): hessians = tf.negative(hessians) ''' # else: - if concat: + if self.concat_grads: hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) return log_probs, jacobians, 
hessians - else: - hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) - return log_probs, jac_a, jac_b, tf.negative(hes_aa), tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) - # del g2 # need to delete this GradientTape because persistent is True. + hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) + return log_probs, jac_a, jac_b, tf.negative(hes_aa), \ + tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) + # del g2 # need to delete this GradientTape because persistent is True. # This is for SecondOrder IRLS/IRLS_GD/IRLS_TR/IRLS_GD_TR - if self.TS["fim"] is True: - - - - if concat: + if self.calc_fim: + if self.concat_grads: loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) fims = self.fim([*inputs[0:3], loc, scale, True]) diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index d35cdea2..a3c33ec4 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -97,23 +97,28 @@ def train( featurewise = True, benchmark: bool = False ): - self.model = BetaGLM(model_vars=self.model_vars, dtype=self.model_vars.dtype, - compute_a=self._train_loc, compute_b=self._train_scale, use_gradient_tape=autograd) + self.model = BetaGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optimizer + ) self._loss = LossGLMBeta() - optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) - self.model.TS = optimizer_enum.value + optimizer_object = self.get_optimizer_object(optimizer, learning_rate) super(Estimator, self)._train( noise_model="beta", batched_model=batched_model, batch_size=batch_size, optimizer_object=optimizer_object, - optimizer_enum=optimizer_enum, convergence_criteria=convergence_criteria, stopping_criteria=stopping_criteria, autograd=autograd, - benchmark=benchmark + benchmark=benchmark, + optimizer=optimizer ) def get_model_container( diff --git a/batchglm/train/tf2/glm_beta/model.py b/batchglm/train/tf2/glm_beta/model.py index 435c9c53..bdc7f1cd 100644 --- a/batchglm/train/tf2/glm_beta/model.py +++ b/batchglm/train/tf2/glm_beta/model.py @@ -16,7 +16,8 @@ def __init__( dtype, compute_a, compute_b, - use_gradient_tape + use_gradient_tape, + optimizer ): self.compute_a = compute_a self.compute_b = compute_b @@ -32,7 +33,8 @@ def __init__( jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - use_gradient_tape=use_gradient_tape + use_gradient_tape=use_gradient_tape, + optimizer=optimizer ) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 3cad4c19..00a03675 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -100,25 +100,25 @@ def train( dtype=self.model_vars.dtype, compute_a=self._train_loc, compute_b=self._train_scale, - use_gradient_tape=autograd + use_gradient_tape=autograd, + optimizer=optimizer ) self._loss = LossGLMNB() - optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) - self.model.TS = optimizer_enum.value + optimizer_object = self.get_optimizer_object(optimizer, learning_rate) 
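With the TrainingStrategies enum removed, the base estimator and model now branch purely on the optimizer string: first-order Keras optimizers consume the Jacobian via apply_gradients, nr/nr_tr request the Hessian, and the irls variants request the Fisher information, both of the latter updating through perform_parameter_update. A compact sketch of that dispatch (illustrative only, mirroring _setParams and _train above; not itself part of the diff):

def describe_optim(optim_algo: str) -> str:
    # mirrors the branching introduced in this patch
    name = optim_algo.lower()
    if name in ('nr', 'nr_tr'):
        return 'second order: Hessian, updated via perform_parameter_update'
    if name in ('irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'):
        return 'second order: Fisher information (FIM), updated via perform_parameter_update'
    return 'first order: Jacobian only, updated via apply_gradients'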
super(Estimator, self)._train( noise_model="nb", batched_model=batched_model, batch_size=batch_size, optimizer_object=optimizer_object, - optimizer_enum=optimizer_enum, convergence_criteria=convergence_criteria, stopping_criteria=stopping_criteria, autograd=autograd, featurewise=featurewise, - benchmark=benchmark + benchmark=benchmark, + optimizer=optimizer ) def get_model_container( diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py index 665696ab..af1f524e 100644 --- a/batchglm/train/tf2/glm_nb/model.py +++ b/batchglm/train/tf2/glm_nb/model.py @@ -17,7 +17,8 @@ def __init__( dtype, compute_a, compute_b, - use_gradient_tape + use_gradient_tape, + optimizer ): self.compute_a = compute_a self.compute_b = compute_b @@ -33,7 +34,8 @@ def __init__( jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - use_gradient_tape=use_gradient_tape + use_gradient_tape=use_gradient_tape, + optimizer=optimizer ) diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index cdd32b0f..90b51740 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -108,20 +108,19 @@ def train( dtype=self.model_vars.dtype, compute_a=self._train_loc, compute_b=self._train_scale, - use_gradient_tape=autograd + use_gradient_tape=autograd, + optimizer=optimizer ) self._loss = LossGLMNorm() - optimizer_object, optimizer_enum = self.get_optimizer_object(optimizer, learning_rate) - self.model.TS = optimizer_enum.value + optimizer_object = self.get_optimizer_object(optimizer, learning_rate) super(Estimator, self)._train( noise_model="norm", batched_model=batched_model, batch_size=batch_size, optimizer_object=optimizer_object, - optimizer_enum=optimizer_enum, convergence_criteria=convergence_criteria, stopping_criteria=stopping_criteria, autograd=autograd, diff --git a/batchglm/train/tf2/glm_norm/model.py b/batchglm/train/tf2/glm_norm/model.py index e5b74734..58e31636 100644 --- a/batchglm/train/tf2/glm_norm/model.py +++ b/batchglm/train/tf2/glm_norm/model.py @@ -16,7 +16,8 @@ def __init__( dtype, compute_a, compute_b, - use_gradient_tape + use_gradient_tape, + optimizer ): self.compute_a = compute_a self.compute_b = compute_b @@ -44,7 +45,8 @@ def __init__( compute_a=self.compute_a, compute_b=self.compute_b, dtype=dtype), - use_gradient_tape=use_gradient_tape + use_gradient_tape=use_gradient_tape, + optimizer=optimizer ) From e35a44ff3a626d863935b5e80297942c181eaf59 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 11 Nov 2019 14:38:40 +0100 Subject: [PATCH 003/124] added trainingstrategies according to tf1 --- batchglm/train/tf2/glm_beta/estimator.py | 2 + .../train/tf2/glm_beta/training_strategies.py | 37 +++++++++++++++++++ batchglm/train/tf2/glm_nb/estimator.py | 2 + .../train/tf2/glm_nb/training_strategies.py | 37 +++++++++++++++++++ batchglm/train/tf2/glm_norm/estimator.py | 3 +- .../train/tf2/glm_norm/training_strategies.py | 27 ++++++++++++++ 6 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 batchglm/train/tf2/glm_beta/training_strategies.py create mode 100644 batchglm/train/tf2/glm_nb/training_strategies.py create mode 100644 batchglm/train/tf2/glm_norm/training_strategies.py diff --git a/batchglm/train/tf2/glm_beta/estimator.py 
b/batchglm/train/tf2/glm_beta/estimator.py index a3c33ec4..4268894f 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -9,6 +9,7 @@ from .model import BetaGLM, LossGLMBeta from .processModel import ProcessModel from .vars import ModelVars +from .training_strategies import TrainingStrategies class Estimator(GLMEstimator, ProcessModel): @@ -57,6 +58,7 @@ def __init__( Useful in scenarios where fitting the exact `scale` is not absolutely necessary. :param dtype: Precision used in tensorflow. """ + self.TrainingStrategies = TrainingStrategies self._train_loc = True self._train_scale = True diff --git a/batchglm/train/tf2/glm_beta/training_strategies.py b/batchglm/train/tf2/glm_beta/training_strategies.py new file mode 100644 index 00000000..9bd8b271 --- /dev/null +++ b/batchglm/train/tf2/glm_beta/training_strategies.py @@ -0,0 +1,37 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + INEXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-6, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + EXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] \ No newline at end of file diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 00a03675..a922f381 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -9,6 +9,7 @@ from .vars import ModelVars from .processModel import ProcessModel from .external import Estimator as GLMEstimator +from .training_strategies import TrainingStrategies class Estimator(GLMEstimator, ProcessModel): @@ -56,6 +57,7 @@ def __init__( Useful in scenarios where fitting the exact `scale` is not absolutely necessary. :param dtype: Precision used in tensorflow. 
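The TrainingStrategies enums added in this patch only bundle keyword presets; since train_sequence() was removed earlier in the series, feeding a strategy into train() is left to the caller. One way this could look (an assumption for illustration, not part of the diff; `estimator` stands for an already constructed Estimator):

for strategy in TrainingStrategies.IRLS.value:
    estimator.train(
        batched_model=strategy["use_batching"],
        optimizer=strategy["optim_algo"],
        convergence_criteria=strategy["convergence_criteria"],
        stopping_criteria=strategy["stopping_criteria"],
    )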
""" + self.TrainingStrategies = TrainingStrategies self._train_loc = True self._train_scale = True diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py new file mode 100644 index 00000000..9bd8b271 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -0,0 +1,37 @@ +from enum import Enum + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + INEXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-6, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + EXACT = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "nr_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged_ll", + "stopping_criteria": 1e-8, + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] \ No newline at end of file diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 90b51740..6f00150a 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -9,6 +9,7 @@ from .model import NormGLM, LossGLMNorm from .processModel import ProcessModel from .vars import ModelVars +from .training_strategies import TrainingStrategies logger = logging.getLogger("batchglm") @@ -62,7 +63,7 @@ def __init__( Useful in scenarios where fitting the exact `scale` is not absolutely necessary. :param dtype: Precision used in tensorflow. """ - + self.TrainingStrategies = TrainingStrategies self._train_loc = True self._train_scale = True diff --git a/batchglm/train/tf2/glm_norm/training_strategies.py b/batchglm/train/tf2/glm_norm/training_strategies.py new file mode 100644 index 00000000..2ba524a7 --- /dev/null +++ b/batchglm/train/tf2/glm_norm/training_strategies.py @@ -0,0 +1,27 @@ +from enum import Enum + + +class TrainingStrategies(Enum): + + AUTO = None + DEFAULT = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_tr", + }, + ] + IRLS_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "irls_tr", + }, + ] From 224c366892fcdba9824528b02db1d3ccc066ecd4 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 12 Nov 2019 10:05:20 +0100 Subject: [PATCH 004/124] refactoring batched_model->use_batching/is_batched --- batchglm/train/tf2/base/estimator.py | 2 +- batchglm/train/tf2/base_glm/estimator.py | 10 +++++----- batchglm/train/tf2/glm_beta/estimator.py | 12 ++++++------ batchglm/train/tf2/glm_nb/estimator.py | 4 ++-- batchglm/train/tf2/glm_norm/estimator.py | 12 ++++++------ 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py index ef9899ef..a90ec901 100644 --- a/batchglm/train/tf2/base/estimator.py +++ b/batchglm/train/tf2/base/estimator.py @@ -16,7 +16,7 @@ def __init__(self, input_data, dtype): def _train( self, - batched_model: bool, + is_batched: bool, batch_size: int, optimizer_object: tf.keras.optimizers.Optimizer, convergence_criteria: str, diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index a993b3de..5a4e8699 100644 --- 
a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -63,7 +63,7 @@ def __init__( def _train( self, noise_model: str, - batched_model: bool = True, + is_batched: bool = True, batch_size: int = 500, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), convergence_criteria: str = "step", @@ -87,7 +87,7 @@ def _train( data_ids = tf.data.Dataset.from_tensor_slices( (tf.range(self._input_data.num_observations, name="sample_index", dtype=tf.dtypes.int64)) ) - if batched_model: + if is_batched: data = data_ids.shuffle(buffer_size=2 * batch_size).repeat().batch(batch_size) else: data = data_ids.shuffle(buffer_size=2 * batch_size).batch(batch_size, drop_remainder=True) @@ -146,7 +146,7 @@ def convergence_decision(convergence_status, train_step): if train_step % 10 == 0: logger.info('step %i', train_step) - if not batched_model: + if not is_batched: results = None x_batch = None first_batch = True @@ -164,8 +164,8 @@ def convergence_decision(convergence_status, train_step): else: x_batch_tuple = next(dataset_iterator) x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) - results = self.model(x_batch) + if irls_algo or nr_algo: if irls_algo: update_func([x_batch, *results, False, n_obs], True, False, batch_features, ll_prev) @@ -201,7 +201,7 @@ def convergence_decision(convergence_status, train_step): updated_lls = tf.scatter_nd(indices, ll_current, shape=ll_prev.shape) ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) - if batched_model: + if is_batched: jac_normalization = batch_size else: jac_normalization = self._input_data.num_observations diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index 4268894f..35e6b486 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -89,14 +89,14 @@ def __init__( def train( self, - batched_model=True, + use_batching: bool = True, batch_size: int = 500, optimizer: str = "adam", learning_rate: float = 1e-2, - convergence_criteria="step", - stopping_criteria=1000, - autograd=False, - featurewise = True, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, benchmark: bool = False ): self.model = BetaGLM( @@ -113,7 +113,7 @@ def train( super(Estimator, self)._train( noise_model="beta", - batched_model=batched_model, + use_batching=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index a922f381..1baa74e1 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -87,7 +87,7 @@ def __init__( def train( self, - batched_model: bool = True, + use_batching: bool = True, batch_size: int = 500, optimizer: str = "adam", learning_rate: float = 1e-2, @@ -112,7 +112,7 @@ def train( super(Estimator, self)._train( noise_model="nb", - batched_model=batched_model, + use_batching=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 6f00150a..1e0f3fef 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -93,14 +93,14 @@ def __init__( def train( self, - batched_model=True, + use_batching: bool = True, 
batch_size: int = 500, optimizer: str = "adam", learning_rate: float = 1e-2, - convergence_criteria="step", - stopping_criteria=1000, - autograd=False, - featurewise = True, + convergence_criteria: str = "step", + stopping_criteria: int = 1000, + autograd: bool = False, + featurewise: bool = True, benchmark: bool = False ): @@ -119,7 +119,7 @@ def train( super(Estimator, self)._train( noise_model="norm", - batched_model=batched_model, + use_batching=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, From 94f6568c5d43a39fd8c05478e3c1c0dbda806443 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 12 Nov 2019 13:16:48 +0100 Subject: [PATCH 005/124] refactoring optimizer->optim_algo --- batchglm/train/tf2/base_glm/estimator.py | 8 ++++---- batchglm/train/tf2/glm_beta/estimator.py | 10 +++++----- batchglm/train/tf2/glm_nb/estimator.py | 10 +++++----- batchglm/train/tf2/glm_norm/estimator.py | 11 ++++++----- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 5a4e8699..8036c5d7 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -71,14 +71,14 @@ def _train( autograd: bool = False, featurewise: bool = True, benchmark: bool = False, - optimizer: str = "adam" + optim_algo: str = "adam" ): if not self._initialized: raise RuntimeError("Cannot train the model: \ Estimator not initialized. Did you forget to call estimator.initialize() ?") - if autograd and optimizer in ['nr', 'nr_tr']: + if autograd and optim_algo.lower() in ['nr', 'nr_tr']: logger.warning("Automatic differentiation is currently not supported for hessians. \ Falling back to closed form. 
Only Jacobians are calculated using autograd.") @@ -117,11 +117,11 @@ def convergence_decision(convergence_status, train_step): dataset_iterator = iter(input_list) irls_algo = False nr_algo = False - if optimizer.lower() in ['nr','nr_tr']: + if optim_algo.lower() in ['nr','nr_tr']: nr_algo = True update_func = optimizer_object.perform_parameter_update - elif optimizer.lower() in ['irls','irls_tr','irls_gd','irls_gd_tr']: + elif optim_algo.lower() in ['irls','irls_tr','irls_gd','irls_gd_tr']: irls_algo = True update_func = optimizer_object.perform_parameter_update diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index 35e6b486..b11b92f3 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -91,7 +91,7 @@ def train( self, use_batching: bool = True, batch_size: int = 500, - optimizer: str = "adam", + optim_algo: str = "adam", learning_rate: float = 1e-2, convergence_criteria: str = "step", stopping_criteria: int = 1000, @@ -105,22 +105,22 @@ def train( compute_a=self._train_loc, compute_b=self._train_scale, use_gradient_tape=autograd, - optimizer=optimizer + optimizer=optim_algo ) self._loss = LossGLMBeta() - optimizer_object = self.get_optimizer_object(optimizer, learning_rate) + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) super(Estimator, self)._train( noise_model="beta", - use_batching=use_batching, + is_batched=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, stopping_criteria=stopping_criteria, autograd=autograd, benchmark=benchmark, - optimizer=optimizer + optim_algo=optim_algo ) def get_model_container( diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 1baa74e1..d5a7b3f8 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -89,7 +89,7 @@ def train( self, use_batching: bool = True, batch_size: int = 500, - optimizer: str = "adam", + optim_algo: str = "adam", learning_rate: float = 1e-2, convergence_criteria: str = "step", stopping_criteria: int = 1000, @@ -103,16 +103,16 @@ def train( compute_a=self._train_loc, compute_b=self._train_scale, use_gradient_tape=autograd, - optimizer=optimizer + optimizer=optim_algo ) self._loss = LossGLMNB() - optimizer_object = self.get_optimizer_object(optimizer, learning_rate) + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) super(Estimator, self)._train( noise_model="nb", - use_batching=use_batching, + is_batched=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, @@ -120,7 +120,7 @@ def train( autograd=autograd, featurewise=featurewise, benchmark=benchmark, - optimizer=optimizer + optim_algo=optim_algo ) def get_model_container( diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 1e0f3fef..3486920b 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -95,7 +95,7 @@ def train( self, use_batching: bool = True, batch_size: int = 500, - optimizer: str = "adam", + optim_algo: str = "adam", learning_rate: float = 1e-2, convergence_criteria: str = "step", stopping_criteria: int = 1000, @@ -110,23 +110,24 @@ def train( compute_a=self._train_loc, compute_b=self._train_scale, use_gradient_tape=autograd, - optimizer=optimizer + optimizer=optim_algo ) self._loss = LossGLMNorm() - 
optimizer_object = self.get_optimizer_object(optimizer, learning_rate) + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) super(Estimator, self)._train( noise_model="norm", - use_batching=use_batching, + is_batched=use_batching, batch_size=batch_size, optimizer_object=optimizer_object, convergence_criteria=convergence_criteria, stopping_criteria=stopping_criteria, autograd=autograd, featurewise=featurewise, - benchmark=benchmark + benchmark=benchmark, + optim_algo=optim_algo ) From 161dfd01e1e3a14f5f1e5ff7b16cb908ba595aed Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 21:43:19 +0100 Subject: [PATCH 006/124] set_floatx("float64") for layers --- batchglm/train/tf2/base_glm/layers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py index 8ced3a4b..b09b2642 100644 --- a/batchglm/train/tf2/base_glm/layers.py +++ b/batchglm/train/tf2/base_glm/layers.py @@ -2,6 +2,7 @@ import abc import tensorflow as tf +tf.keras.backend.set_floatx("float64") from .processModel import ProcessModelGLM From 0f80f0b93ec85e8eae888aa2d239e141a9673757 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 21:44:23 +0100 Subject: [PATCH 007/124] only create new model if not yet present --- batchglm/train/tf2/glm_beta/estimator.py | 18 ++++++++++-------- batchglm/train/tf2/glm_nb/estimator.py | 17 +++++++++-------- batchglm/train/tf2/glm_norm/estimator.py | 18 +++++++++--------- 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index b11b92f3..bf8b81f5 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -99,14 +99,16 @@ def train( featurewise: bool = True, benchmark: bool = False ): - self.model = BetaGLM( - model_vars=self.model_vars, - dtype=self.model_vars.dtype, - compute_a=self._train_loc, - compute_b=self._train_scale, - use_gradient_tape=autograd, - optimizer=optim_algo - ) + + if self.model is None: + self.model = BetaGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) self._loss = LossGLMBeta() optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index d5a7b3f8..dc266c57 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -97,14 +97,15 @@ def train( featurewise: bool = True, benchmark: bool = False ): - self.model = NBGLM( - model_vars=self.model_vars, - dtype=self.model_vars.dtype, - compute_a=self._train_loc, - compute_b=self._train_scale, - use_gradient_tape=autograd, - optimizer=optim_algo - ) + if self.model is None: + self.model = NBGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) self._loss = LossGLMNB() diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 3486920b..5d0571ae 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -103,15 +103,15 @@ def train( featurewise: bool = True, benchmark: bool = False ): - - self.model = NormGLM( - model_vars=self.model_vars, - dtype=self.model_vars.dtype, - 
compute_a=self._train_loc, - compute_b=self._train_scale, - use_gradient_tape=autograd, - optimizer=optim_algo - ) + if self.model is None: + self.model = NormGLM( + model_vars=self.model_vars, + dtype=self.model_vars.dtype, + compute_a=self._train_loc, + compute_b=self._train_scale, + use_gradient_tape=autograd, + optimizer=optim_algo + ) self._loss = LossGLMNorm() From ee4ccd1fdd26f572bd5a264412e404ac783cc3c8 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 21:45:26 +0100 Subject: [PATCH 008/124] batchsize now max(n_obs) + fix negation in loss --- batchglm/train/tf2/base_glm/estimator.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 8036c5d7..abba7f23 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -26,6 +26,7 @@ def initialize(self, **kwargs): self.times = [] self.converged = [] self._initialized = True + self.model = None def finalize(self, **kwargs): """ @@ -35,11 +36,12 @@ def finalize(self, **kwargs): Changes .model entry from tf-based EstimatorGraph to numpy based Model instance and transfers relevant attributes. """ + a_var, b_var = self.model.unpack_params([self.model.params, self.model.model_vars.a_var.get_shape()[0]]) self.model = self.get_model_container(self._input_data) self.model._a_var = a_var self.model._b_var = b_var - self._loss = tf.reduce_sum(-self._log_likelihood / self.input_data.num_observations) + self._loss = tf.reduce_sum(np.negative(self._log_likelihood) / self.input_data.num_observations) def __init__( self, @@ -64,7 +66,7 @@ def _train( self, noise_model: str, is_batched: bool = True, - batch_size: int = 500, + batch_size: int = 100, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), convergence_criteria: str = "step", stopping_criteria: int = 1000, @@ -73,7 +75,8 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): - + if batch_size > self.input_data.num_observations: + batch_size = self.input_data.num_observations if not self._initialized: raise RuntimeError("Cannot train the model: \ Estimator not initialized. 
Did you forget to call estimator.initialize() ?") @@ -84,6 +87,7 @@ def _train( self.noise_model = noise_model # Slice data and create batches + data_ids = tf.data.Dataset.from_tensor_slices( (tf.range(self._input_data.num_observations, name="sample_index", dtype=tf.dtypes.int64)) ) @@ -172,6 +176,7 @@ def convergence_decision(convergence_status, train_step): if self._train_scale: update_func([x_batch, *results, False, n_obs], False, True, batch_features, ll_prev) else: + print(results) update_func([x_batch, *results, False, n_obs], True, True, batch_features, ll_prev) features_updated = self.model.model_vars.updated else: From 88ae48a370ee9da54af52cc1822034b4e1c07dd3 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 21:47:45 +0100 Subject: [PATCH 009/124] fix default convergence criteria --- batchglm/train/tf2/glm_beta/training_strategies.py | 10 +++++----- batchglm/train/tf2/glm_nb/training_strategies.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/training_strategies.py b/batchglm/train/tf2/glm_beta/training_strategies.py index 9bd8b271..301b8df8 100644 --- a/batchglm/train/tf2/glm_beta/training_strategies.py +++ b/batchglm/train/tf2/glm_beta/training_strategies.py @@ -5,7 +5,7 @@ class TrainingStrategies(Enum): AUTO = None DEFAULT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -13,7 +13,7 @@ class TrainingStrategies(Enum): ] INEXACT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-6, "use_batching": False, "optim_algo": "nr_tr", @@ -21,7 +21,7 @@ class TrainingStrategies(Enum): ] EXACT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -29,9 +29,9 @@ class TrainingStrategies(Enum): ] IRLS = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "irls_tr", }, - ] \ No newline at end of file + ] diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index 9bd8b271..301b8df8 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -5,7 +5,7 @@ class TrainingStrategies(Enum): AUTO = None DEFAULT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -13,7 +13,7 @@ class TrainingStrategies(Enum): ] INEXACT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-6, "use_batching": False, "optim_algo": "nr_tr", @@ -21,7 +21,7 @@ class TrainingStrategies(Enum): ] EXACT = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -29,9 +29,9 @@ class TrainingStrategies(Enum): ] IRLS = [ { - "convergence_criteria": "all_converged_ll", + "convergence_criteria": "all_converged", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "irls_tr", }, - ] \ No newline at end of file + ] From f0211b8dafbdbd2ce42a93af2cf8d72d9461b4f3 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 22:00:36 +0100 
Subject: [PATCH 010/124] reset training_strategies correctly --- .../train/tf2/glm_beta/training_strategies.py | 8 +++---- .../train/tf2/glm_nb/training_strategies.py | 23 +++++-------------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/training_strategies.py b/batchglm/train/tf2/glm_beta/training_strategies.py index 301b8df8..b6db5b22 100644 --- a/batchglm/train/tf2/glm_beta/training_strategies.py +++ b/batchglm/train/tf2/glm_beta/training_strategies.py @@ -5,7 +5,7 @@ class TrainingStrategies(Enum): AUTO = None DEFAULT = [ { - "convergence_criteria": "all_converged", + "convergence_criteria": "all_converged_ll", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -13,7 +13,7 @@ class TrainingStrategies(Enum): ] INEXACT = [ { - "convergence_criteria": "all_converged", + "convergence_criteria": "all_converged_ll", "stopping_criteria": 1e-6, "use_batching": False, "optim_algo": "nr_tr", @@ -21,7 +21,7 @@ class TrainingStrategies(Enum): ] EXACT = [ { - "convergence_criteria": "all_converged", + "convergence_criteria": "all_converged_ll", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "nr_tr", @@ -29,7 +29,7 @@ class TrainingStrategies(Enum): ] IRLS = [ { - "convergence_criteria": "all_converged", + "convergence_criteria": "all_converged_ll", "stopping_criteria": 1e-8, "use_batching": False, "optim_algo": "irls_tr", diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index 301b8df8..e8b39257 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -6,32 +6,21 @@ class TrainingStrategies(Enum): DEFAULT = [ { "convergence_criteria": "all_converged", - "stopping_criteria": 1e-8, "use_batching": False, - "optim_algo": "nr_tr", + "optim_algo": "irls_gd_tr", }, ] - INEXACT = [ - { - "convergence_criteria": "all_converged", - "stopping_criteria": 1e-6, - "use_batching": False, - "optim_algo": "nr_tr", - }, - ] - EXACT = [ + IRLS = [ { "convergence_criteria": "all_converged", - "stopping_criteria": 1e-8, "use_batching": False, - "optim_algo": "nr_tr", + "optim_algo": "irls_gd_tr", }, ] - IRLS = [ + IRLS_BATCHED = [ { "convergence_criteria": "all_converged", - "stopping_criteria": 1e-8, - "use_batching": False, - "optim_algo": "irls_tr", + "use_batching": True, + "optim_algo": "irls_gd_tr", }, ] From 60027f7fd6491538abb700352641cccb1ef2ba3b Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 22:45:52 +0100 Subject: [PATCH 011/124] convert params to numpy in finalize --- batchglm/train/tf2/base_glm/estimator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index abba7f23..d122248e 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -39,9 +39,9 @@ def finalize(self, **kwargs): a_var, b_var = self.model.unpack_params([self.model.params, self.model.model_vars.a_var.get_shape()[0]]) self.model = self.get_model_container(self._input_data) - self.model._a_var = a_var - self.model._b_var = b_var - self._loss = tf.reduce_sum(np.negative(self._log_likelihood) / self.input_data.num_observations) + self.model._a_var = a_var.numpy() + self.model._b_var = b_var.numpy() + self._loss = tf.reduce_sum(np.negative(self._log_likelihood) / self.input_data.num_observations).numpy() def __init__( self, From 
e4a5594b821ce2163c04fe59b0b7fd46354d2b5b Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 15 Nov 2019 23:41:53 +0100 Subject: [PATCH 012/124] added api imports back in --- batchglm/api/models/tf2/glm_beta.py | 4 ++-- batchglm/api/models/tf2/glm_norm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/batchglm/api/models/tf2/glm_beta.py b/batchglm/api/models/tf2/glm_beta.py index 8b5f563e..67cffbf8 100644 --- a/batchglm/api/models/tf2/glm_beta.py +++ b/batchglm/api/models/tf2/glm_beta.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_beta import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_beta import Estimator +from batchglm.models.glm_beta import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_beta import Estimator diff --git a/batchglm/api/models/tf2/glm_norm.py b/batchglm/api/models/tf2/glm_norm.py index 45fc0453..13e1727d 100644 --- a/batchglm/api/models/tf2/glm_norm.py +++ b/batchglm/api/models/tf2/glm_norm.py @@ -1,2 +1,2 @@ -#from batchglm.models.glm_norm import InputDataGLM, Model, Simulator -#from batchglm.train.tf2.glm_norm import Estimator +from batchglm.models.glm_norm import InputDataGLM, Model, Simulator +from batchglm.train.tf2.glm_norm import Estimator From ee92dedf429598f578a6816ca56722b7dfdc2540 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 29 Nov 2019 17:29:08 +0100 Subject: [PATCH 013/124] cleanup, warning logs, no final assign jac, hes --- batchglm/train/tf2/base_glm/estimator.py | 143 ++++++++++++++--------- 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index d122248e..eda643c0 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -75,6 +75,7 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): + print(featurewise) if batch_size > self.input_data.num_observations: batch_size = self.input_data.num_observations if not self._initialized: @@ -148,7 +149,7 @@ def convergence_decision(convergence_status, train_step): not_converged = np.logical_not(self.model.model_vars.converged) ll_prev = ll_current.copy() if train_step % 10 == 0: - logger.info('step %i', train_step) + logger.warning('step %i', train_step) if not is_batched: results = None @@ -172,12 +173,30 @@ def convergence_decision(convergence_status, train_step): if irls_algo or nr_algo: if irls_algo: - update_func([x_batch, *results, False, n_obs], True, False, batch_features, ll_prev) + update_func( + [x_batch, *results, False, n_obs], + True, + False, + batch_features, + ll_prev + ) if self._train_scale: - update_func([x_batch, *results, False, n_obs], False, True, batch_features, ll_prev) + update_func( + [x_batch, *results, False, n_obs], + False, + True, + batch_features, + ll_prev + ) else: print(results) - update_func([x_batch, *results, False, n_obs], True, True, batch_features, ll_prev) + update_func( + [x_batch, *results, False, n_obs], + True, + True, + batch_features, + ll_prev + ) features_updated = self.model.model_vars.updated else: if batch_features: @@ -218,11 +237,22 @@ def convergence_decision(convergence_status, train_step): grad_numpy = tf.abs(tf.transpose(results[1])) if batch_features: indices = tf.where(not_converged) - grad_numpy = tf.scatter_nd(indices, grad_numpy, shape=(self.model.model_vars.n_features, - self.model.params.get_shape()[0])) + grad_numpy = tf.scatter_nd( + indices, + grad_numpy, + shape=(self.model.model_vars.n_features, self.model.params.get_shape()[0]) + ) 
grad_numpy = grad_numpy.numpy() - convergences = self.calculate_convergence(converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, - ll_current, jac_normalization, grad_numpy, features_updated) + convergences = self.calculate_convergence( + converged_prev, + ll_prev, + prev_norm_loc, + prev_norm_scale, + ll_current, + jac_normalization, + grad_numpy, + features_updated + ) converged_current, converged_f, converged_g, converged_x = convergences self.model.model_vars.convergence_update(converged_current, features_updated) @@ -231,12 +261,14 @@ def convergence_decision(convergence_status, train_step): if featurewise and not batch_features: batch_features = True self.model.batch_features = batch_features - logger.info("Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)", - train_step, - np.sum(ll_current), - num_converged, - np.sum(features_updated).astype("int32"), - np.sum(converged_f), np.sum(converged_g), np.sum(converged_x)) + logger.warning( + "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)", + train_step, + np.sum(ll_current), + num_converged, + np.sum(features_updated).astype("int32"), + np.sum(converged_f), np.sum(converged_g), np.sum(converged_x) + ) train_step += 1 if benchmark: t1_epoch = time.time() @@ -252,13 +284,18 @@ def convergence_decision(convergence_status, train_step): self._hessian = results[2].numpy() self._jacobian = results[1].numpy() elif irls_algo: - self._fisher_inv = tf.concat([results[3], results[4]], axis=0).numpy() - self._jacobian = tf.concat([results[1], results[2]], axis=0).numpy() + # TODO: maybe report fisher inv here. But concatenation only works if !intercept_scale + # self._fisher_inv = tf.concat([results[3], results[4]], axis=0).numpy() + self._fisher_inv = None # self.model.calc_hessians(x_batch)[3] + self._jacobian = None # tf.concat([results[1], results[2]], axis=0).numpy() else: self._jacobian = results[1].numpy() def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converged): - + """ + Checks whether batch_features is true and returns a smaller x_batch tuple reduced + in feature space. Otherwise returns the x_batch. + """ if batch_features: x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple if isinstance(self._input_data.x, scipy.sparse.csr_matrix): @@ -281,6 +318,9 @@ def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converge def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, ll_current, jac_normalization, grad_numpy, features_updated): + """ + Wrapper method to perform all necessary convergence checks. + """ def get_convergence(converged_previous, condition1, condition2): return np.logical_or(converged_previous, np.logical_and(condition1, condition2)) @@ -329,51 +369,38 @@ def calc_x_step(idx_train, prev_norm): x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE) return converged_current, converged_f, converged_g, converged_x - def get_optimizer_object(self, optimizer, learning_rate): + def get_optimizer_object(self, optimizer: str, learning_rate): + """ + Creates an optimizer object based on the given optimizer string. 
+ """ optimizer = optimizer.lower() if optimizer == "gd": - return tf.keras.optimizers.SGD(learning_rate=learning_rate) - if optimizer == "adam": - return tf.keras.optimizers.Adam(learning_rate=learning_rate) - if optimizer == "adagrad": - return tf.keras.optimizers.Adagrad(learning_rate=learning_rate) - if optimizer == "rmsprop": - return tf.keras.optimizers.RMSprop(learning_rate=learning_rate) - if optimizer == "irls": - return IRLS(dtype=self.dtype, - trusted_region_mode=False, - model=self.model, - name="IRLS") - if optimizer == "irls_tr": - return IRLS(dtype=self.dtype, - trusted_region_mode=True, - model=self.model, - name="IRLS_TR") - if optimizer == "irls_gd": - return IRLS(dtype=self.dtype, - trusted_region_mode=False, - model=self.model, - name="IRLS_GD") - if optimizer == "irls_gd_tr": - return IRLS(dtype=self.dtype, - trusted_region_mode=True, - model=self.model, - name="IRLS_GD_TR") - if optimizer == "nr": - return NR(dtype=self.dtype, - trusted_region_mode=False, - model=self.model, - name="NR") - if optimizer == "nr_tr": - return NR(dtype=self.dtype, - trusted_region_mode=True, - model=self.model, - name="NR_TR") - - logger.warning("No valid optimizer given. Default optimizer Adam chosen.") - return tf.keras.optimizers.Adam(learning_rate=learning_rate) + optim_obj = tf.keras.optimizers.SGD(learning_rate=learning_rate) + elif optimizer == "adam": + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + elif optimizer == "adagrad": + optim_obj = tf.keras.optimizers.Adagrad(learning_rate=learning_rate) + elif optimizer == "rmsprop": + optim_obj = tf.keras.optimizers.RMSprop(learning_rate=learning_rate) + else: + tr_mode = optimizer.endswith('tr') + init_dict = { + "dtype": self.dtype, + "model": self.model, + "name": optimizer, + "trusted_region_mode": tr_mode + } + if optimizer.startswith('irls'): + optim_obj = IRLS(**init_dict) + elif optimizer.startswith('nr'): + optim_obj = NR(**init_dict) + else: + optim_obj = tf.keras.optimizers.Adam(learning_rate=learning_rate) + logger.warning("No valid optimizer given. Default optimizer Adam chosen.") + + return optim_obj def fetch_fn(self, idx): """ From 65016d282e3aa16968ba266b4260af78e37d2fac Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 29 Nov 2019 17:31:24 +0100 Subject: [PATCH 014/124] hessian calculation moved to its own method --- batchglm/train/tf2/base_glm/model.py | 92 +++++++++++++++------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 0d0e1fb4..226ce816 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -155,6 +155,52 @@ def _calc_jacobians(self, inputs, concat, transpose=True): return loc, scale, log_probs, tf.negative(jacobians) return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) + def calc_hessians(self, inputs, concat=False): + # with tf.GradientTape(persistent=True) as g2: + if concat: + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) + else: + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) + # results_arr = [jacobians[:, i] for i in tf.range(self.params_copy.get_shape()[0])] + + ''' + autograd not yet working. 
TODO: Search error in the following code: + + if self.use_gradient_tape: + + i = tf.constant(0, tf.int32) + h_tensor_array = tf.TensorArray( # hessian slices [:,:,j] + dtype=self.params_copy.dtype, + size=self.params_copy.get_shape()[0], + clear_after_read=False + ) + while i < self.params_copy.get_shape()[0]: + grad = g2.gradient(results_arr[i], self.params_copy) + h_tensor_array.write(index=i, value=grad) + i += 1 + + # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used + # in down-stream computation graphs. + + hessians = tf.transpose(tf.reshape( + h_tensor_array.stack(), + tf.stack((self.params_copy.get_shape()[0], + self.params_copy.get_shape()[0], + self.params_copy.get_shape()[1])) + ), perm=[2, 1, 0]) + hessians = tf.negative(hessians) + ''' + # else: + print('opsdfopdsfpodsfpodsfpo') + if concat: + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) + return log_probs, jacobians, hessians + + hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) + return log_probs, jac_a, jac_b, tf.negative(hes_aa), \ + tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) + # del g2 # need to delete this GradientTape because persistent is True. + def call(self, inputs, training=False, mask=None): # X_data, design_loc, design_scale, size_factors = inputs @@ -166,50 +212,8 @@ def call(self, inputs, training=False, mask=None): # This is for SecondOrder NR/NR_TR if self.calc_hessian: - # with tf.GradientTape(persistent=True) as g2: - if self.concat_grads: - loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) - else: - loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) - # results_arr = [jacobians[:, i] for i in tf.range(self.params_copy.get_shape()[0])] - - ''' - autograd not yet working. TODO: Search error in the following code: - - if self.use_gradient_tape: - - i = tf.constant(0, tf.int32) - h_tensor_array = tf.TensorArray( # hessian slices [:,:,j] - dtype=self.params_copy.dtype, - size=self.params_copy.get_shape()[0], - clear_after_read=False - ) - while i < self.params_copy.get_shape()[0]: - grad = g2.gradient(results_arr[i], self.params_copy) - h_tensor_array.write(index=i, value=grad) - i += 1 - - # h_tensor_array is a TensorArray, reshape this into a tensor so that it can be used - # in down-stream computation graphs. - - hessians = tf.transpose(tf.reshape( - h_tensor_array.stack(), - tf.stack((self.params_copy.get_shape()[0], - self.params_copy.get_shape()[0], - self.params_copy.get_shape()[1])) - ), perm=[2, 1, 0]) - hessians = tf.negative(hessians) - ''' - # else: - if self.concat_grads: - hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) - return log_probs, jacobians, hessians - - hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) - return log_probs, jac_a, jac_b, tf.negative(hes_aa), \ - tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) - # del g2 # need to delete this GradientTape because persistent is True. 
- + results = self.calc_hessians(inputs, concat=self.concat_grads) + return results # This is for SecondOrder IRLS/IRLS_GD/IRLS_TR/IRLS_GD_TR if self.calc_fim: if self.concat_grads: From 486d1ba7b294eada729cb42abd74392b7d8019d1 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 29 Nov 2019 17:32:04 +0100 Subject: [PATCH 015/124] lower case name checks --- batchglm/train/tf2/base_glm/optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 04bd2f16..ae50a2fa 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -150,7 +150,7 @@ def _trust_region_ops( def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str): self.model = model - self.gd = name in ['IRLS_GD', 'IRLS_GD_TR'] + self.gd = name in ['irls_gd', 'irls_gd_tr'] super(SecondOrderOptim, self).__init__(name) From a97be15fb93e9f3748cae735b6b339bb5a1596f0 Mon Sep 17 00:00:00 2001 From: picciama Date: Sat, 30 Nov 2019 15:39:04 +0100 Subject: [PATCH 016/124] added ADAM/ADAM_BATCHED to nb training strategies --- batchglm/train/tf2/glm_nb/training_strategies.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index e8b39257..858ec8a6 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -24,3 +24,17 @@ class TrainingStrategies(Enum): "optim_algo": "irls_gd_tr", }, ] + ADAM_BATCHED = [ + { + "convergence_criteria": "all_converged", + "use_batching": True, + "optim_algo": "adam", + }, + ] + ADAM = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "adam", + }, + ] From b47cfcd5d3bf3b69a5590aee6c288bda464ed71a Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 12 Dec 2019 10:46:57 +0100 Subject: [PATCH 017/124] added final eval run + fix featurewise dim problem --- batchglm/train/tf2/base_glm/estimator.py | 32 ++++++++++++++++++++---- batchglm/train/tf2/base_glm/model.py | 2 +- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index eda643c0..3c67d439 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -75,7 +75,6 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): - print(featurewise) if batch_size > self.input_data.num_observations: batch_size = self.input_data.num_observations if not self._initialized: @@ -189,7 +188,6 @@ def convergence_decision(convergence_status, train_step): ll_prev ) else: - print(results) update_func( [x_batch, *results, False, n_obs], True, @@ -276,21 +274,45 @@ def convergence_decision(convergence_status, train_step): self.converged.append(num_converged) # Evaluate final params + logger.warning("Final Evaluation run.") self._log_likelihood = results[0].numpy() self._fisher_inv = tf.zeros(shape=()).numpy() self._hessian = tf.zeros(shape=()).numpy() + self.model.batch_features = False + + # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv + if irls_algo: + self.model.calc_fim = False + self.model.calc_hessian = True + self.model.concat_grads = True + + first_batch = True + for x_batch_tuple in input_list: + current_results = self.model(x_batch_tuple) + if first_batch: + results = list(current_results) + first_batch = 
False + else: + for i, x in enumerate(current_results): + results[i] += x if nr_algo: self._hessian = results[2].numpy() self._jacobian = results[1].numpy() elif irls_algo: # TODO: maybe report fisher inv here. But concatenation only works if !intercept_scale - # self._fisher_inv = tf.concat([results[3], results[4]], axis=0).numpy() - self._fisher_inv = None # self.model.calc_hessians(x_batch)[3] - self._jacobian = None # tf.concat([results[1], results[2]], axis=0).numpy() + self._fisher_inv = results[2].numpy() + self._jacobian = results[1].numpy() + + # change back to FIM mode + self.model.calc_fim = True + self.model.calc_hessian = False + self.model.concat_grads = False else: self._jacobian = results[1].numpy() + self.model.batch_features = batch_features + def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converged): """ Checks whether batch_features is true and returns a smaller x_batch tuple reduced diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 226ce816..732aa1c7 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -91,6 +91,7 @@ def _call_parameters(self, inputs, keep_previous_params_copy=False): def calc_ll(self, inputs, keep_previous_params_copy=False): parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) + log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) return (log_probs, *parameters[2:]) @@ -191,7 +192,6 @@ def calc_hessians(self, inputs, concat=False): hessians = tf.negative(hessians) ''' # else: - print('opsdfopdsfpodsfpodsfpo') if concat: hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) return log_probs, jacobians, hessians From 2c26a2327f97adc391a923397e867042ec9e4a81 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 12 Jan 2020 19:07:20 +0100 Subject: [PATCH 018/124] bugfix: size_factors init dimension expansion --- batchglm/train/tf2/glm_beta/estimator.py | 8 +++----- batchglm/train/tf2/glm_nb/estimator.py | 14 +++----------- batchglm/train/tf2/glm_norm/estimator.py | 12 ++---------- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index bf8b81f5..bd266e02 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -146,8 +146,6 @@ def init_par( Initialize with Maximum Likelihood / Maximum of Momentum estimators """ - size_factors_init = input_data.size_factors - if init_model is None: groupwise_means = None init_a_str = None @@ -162,7 +160,7 @@ def init_par( x=input_data.x, design_loc=input_data.design_loc, constraints_loc=input_data.constraints_loc, - size_factors=size_factors_init, + size_factors=input_data.size_factors_init, link_fn=lambda mean: np.log( 1/(1/self.np_clip_param(mean, "mean")-1) ) @@ -198,7 +196,7 @@ def init_par( x=input_data.x, design_scale=input_data.design_scale[:, [0]], constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=None, link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, "samplesize")) ) @@ -226,7 +224,7 @@ def init_par( x=input_data.x, design_scale=input_data.design_scale, constraints=input_data.constraints_scale, - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=groupwise_means, link_fn=lambda samplesize: np.log(self.np_clip_param(samplesize, 
"samplesize")) ) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index dc266c57..3409e0ab 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -154,14 +154,6 @@ def init_par( $$ """ - size_factors_init = input_data.size_factors - if size_factors_init is not None: - size_factors_init = np.expand_dims(size_factors_init, axis=1) - size_factors_init = np.broadcast_to( - array=size_factors_init, - shape=[input_data.num_observations, input_data.num_features] - ) - if init_model is None: groupwise_means = None init_a_str = None @@ -176,7 +168,7 @@ def init_par( x=input_data.x, design_loc=input_data.design_loc, constraints_loc=input_data.constraints_loc, - size_factors=size_factors_init, + size_factors=input_data.size_factors, link_fn=lambda loc: np.log(self.np_clip_param(loc, "loc")) ) @@ -217,7 +209,7 @@ def init_par( x=input_data.x, design_scale=input_data.design_scale[:, [0]], constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=None, link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) ) @@ -246,7 +238,7 @@ def init_par( x=input_data.x, design_scale=input_data.design_scale, constraints=input_data.constraints_scale, - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=groupwise_means, link_fn=lambda scale: np.log(self.np_clip_param(scale, "scale")) ) diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 5d0571ae..70de3c91 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -146,14 +146,6 @@ def init_par(self, input_data, init_a, init_b, init_model): Initialize with Maximum Likelihood / Maximum of Momentum estimators """ - size_factors_init = input_data.size_factors - if size_factors_init is not None: - size_factors_init = np.expand_dims(size_factors_init, axis=1) - size_factors_init = np.broadcast_to( - array=size_factors_init, - shape=[input_data.num_observations, input_data.num_features] - ) - sf_given = False if input_data.size_factors is not None: if np.any(np.abs(input_data.size_factors - 1.) 
> 1e-8): @@ -243,7 +235,7 @@ def init_par(self, input_data, init_a, init_b, init_model): x=input_data.x, design_scale=input_data.design_scale, constraints=input_data.constraints_scale, - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=groupwise_means, link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) ) @@ -257,7 +249,7 @@ def init_par(self, input_data, init_a, init_b, init_model): x=input_data.x, design_scale=input_data.design_scale[:, [0]], constraints=input_data.constraints_scale[[0], :][:, [0]], - size_factors=size_factors_init, + size_factors=input_data.size_factors, groupwise_means=None, link_fn=lambda sd: np.log(self.np_clip_param(sd, "sd")) ) From cc2d0e01ae8ebeb5eae0d3a393591687f91aa868 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 12 Jan 2020 20:46:14 +0100 Subject: [PATCH 019/124] rm size_factors_tensor postprocessing in fetch_fn --- batchglm/train/tf2/base_glm/estimator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 3c67d439..57dbf5d9 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -156,7 +156,6 @@ def convergence_decision(convergence_status, train_step): first_batch = True for x_batch_tuple in input_list: x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) - current_results = self.model(x_batch) if first_batch: results = list(current_results) @@ -483,9 +482,6 @@ def fetch_fn(self, idx): inp=[idx], Tout=self._input_data.size_factors.dtype, ) - - size_factors_tensor.set_shape(idx.get_shape()) - size_factors_tensor = tf.expand_dims(size_factors_tensor, axis=-1) size_factors_tensor = tf.cast(size_factors_tensor, dtype=self.dtype) else: From 0e849b6c29f4dc7311d59bf2b3ed20098fe7bf35 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Jan 2020 01:30:02 +0100 Subject: [PATCH 020/124] bugfix: calc_x_step: if cond never true --- batchglm/train/tf2/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 57dbf5d9..2c6c7091 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -349,7 +349,7 @@ def get_convergence_by_method(converged_previous, condition1, condition2): return np.logical_and(np.logical_not(converged_previous), np.logical_and(condition1, condition2)) def calc_x_step(idx_train, prev_norm): - if len(idx_train) > 0 and len(self.values) > 1: + if len(idx_train) > 0: curr_norm = np.sqrt(np.sum(np.square( np.abs(self.model.params.numpy()[idx_train, :]) ), axis=0)) From 1cfa504c0015334b42b0d341242903f1e509670a Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 14 Jan 2020 01:44:15 +0100 Subject: [PATCH 021/124] added TRTOL_BY_FEATURE_LOC/SCALE thresholds --- batchglm/pkg_constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 9afb32bf..a0c4ed2e 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -27,6 +27,9 @@ GTOL_BY_FEATURE_LOC = 1e-8 GTOL_BY_FEATURE_SCALE = 1e-8 +TRTOL_BY_FEATURE_LOC = 1e-12 +TRTOL_BY_FEATURE_SCALE = 1e-12 + try: import tensorflow as tf From 7939d8849294dda9aae1a4bcec5661a942a079b2 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 14 Jan 2020 01:45:47 +0100 Subject: [PATCH 022/124] added tr convergence / reworked convergence func --- batchglm/train/tf2/base_glm/estimator.py | 
113 ++++++++++++++--------- 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 2c6c7091..6cd4aa03 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -147,8 +147,8 @@ def convergence_decision(convergence_status, train_step): not_converged = np.logical_not(self.model.model_vars.converged) ll_prev = ll_current.copy() - if train_step % 10 == 0: - logger.warning('step %i', train_step) + #if train_step % 10 == 0: + logger.warning('step %i: loss: %s', train_step, np.array2string(ll_current[0:10])) if not is_batched: results = None @@ -248,23 +248,33 @@ def convergence_decision(convergence_status, train_step): ll_current, jac_normalization, grad_numpy, - features_updated + features_updated, + optimizer_object ) - converged_current, converged_f, converged_g, converged_x = convergences - + #converged_current, converged_f, converged_g, converged_x = convergences + converged_current = convergences[0] self.model.model_vars.convergence_update(converged_current, features_updated) - num_converged = np.sum(converged_current).astype("int32") - if np.sum(converged_current) != np.sum(converged_prev): + num_converged = np.sum(converged_current) + if num_converged != np.sum(converged_prev): if featurewise and not batch_features: batch_features = True self.model.batch_features = batch_features - logger.warning( - "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)", + logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i" + logger_data = [ train_step, np.sum(ll_current), - num_converged, + num_converged.astype("int32"), np.sum(features_updated).astype("int32"), - np.sum(converged_f), np.sum(converged_g), np.sum(converged_x) + *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] + ] + if (irls_algo or nr_algo) and optimizer_object.trusted_region_mode: + logger_pattern += " tr: %i)" + else: + logger_pattern += ")" + + logger.warning( + logger_pattern, + *logger_data ) train_step += 1 if benchmark: @@ -338,16 +348,42 @@ def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converge return x_batch def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, ll_current, - jac_normalization, grad_numpy, features_updated): + jac_normalization, grad_numpy, features_updated, optimizer_object): """ Wrapper method to perform all necessary convergence checks. """ - def get_convergence(converged_previous, condition1, condition2): - return np.logical_or(converged_previous, np.logical_and(condition1, condition2)) - def get_convergence_by_method(converged_previous, condition1, condition2): - return np.logical_and(np.logical_not(converged_previous), np.logical_and(condition1, condition2)) + total_converged = converged_prev.copy() + not_converged_prev = ~ converged_prev + """ + Get all converged features due to change in ll < LLTOL_BY_FEATURE + IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! 
+ """ + ll_difference = np.abs(ll_prev - ll_current) / ll_prev + ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated + epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f + + total_converged |= ll_converged + """ + Now getting convergence based on change in gradient below threshold: + """ + grad_loc = np.sum(grad_numpy[:, self.model.model_vars.idx_train_loc], axis=1) + grad_norm_loc = grad_loc / jac_normalization + grad_scale = np.sum(grad_numpy[:, self.model.model_vars.idx_train_scale], axis=1) + grad_norm_scale = grad_scale / jac_normalization + + grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC + grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE + + grad_converged = grad_norm_loc_converged & grad_norm_scale_converged + epoch_grad_converged = not_converged_prev & grad_converged # formerly known as converged_g + + total_converged |= grad_converged + + """ + Now getting convergence based on change of coefficients below threshold: + """ def calc_x_step(idx_train, prev_norm): if len(idx_train) > 0: curr_norm = np.sqrt(np.sum(np.square( @@ -360,35 +396,26 @@ def calc_x_step(idx_train, prev_norm): x_norm_loc = calc_x_step(self.model.model_vars.idx_train_loc, prev_norm_loc) x_norm_scale = calc_x_step(self.model.model_vars.idx_train_scale, prev_norm_scale) - ll_converged = np.abs(ll_prev - ll_current) / ll_prev < pkg_constants.LLTOL_BY_FEATURE + x_norm_loc_converged = x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC + x_norm_scale_converged = x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE - converged_current = get_convergence(converged_prev, ll_converged, features_updated) + step_converged = x_norm_loc_converged & x_norm_scale_converged + epoch_step_converged = not_converged_prev & step_converged - # those features which were not converged in the prev run, but converged now - converged_f = get_convergence_by_method(converged_prev, ll_converged, features_updated) - grad_loc = np.sum(grad_numpy[:, self.model.model_vars.idx_train_loc], axis=1) - grad_norm_loc = grad_loc / jac_normalization - grad_scale = np.sum(grad_numpy[:, self.model.model_vars.idx_train_scale], axis=1) - grad_norm_scale = grad_scale / jac_normalization - - converged_current = get_convergence(converged_current, - grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, - grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE) - # those features which were not converged in the prev run, but converged now - converged_g = get_convergence_by_method(converged_prev, - grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC, - grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE) - - # Step length: - converged_current = get_convergence(converged_current, - x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, - x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE) - - # those features which were not converged in the prev run, but converged now - converged_x = get_convergence_by_method(converged_prev, - x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC, - x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE) - return converged_current, converged_f, converged_g, converged_x + total_converged |= step_converged + """ + In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. + For now it must not be below the threshold for the X step of the scale model. 
+ """ + if hasattr(optimizer_object, 'trusted_region_mode') and optimizer_object.trusted_region_mode: + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC + if hasattr(optimizer_object, 'tr_radius_b') and self._train_scale: + converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + epoch_tr_converged = not_converged_prev & converged_tr + total_converged |= epoch_tr_converged + return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged, epoch_tr_converged + + return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged def get_optimizer_object(self, optimizer: str, learning_rate): """ From ffe23ceef6be87912d6ae47a79f3b16e84559aff Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 14 Jan 2020 21:26:56 +0100 Subject: [PATCH 023/124] bugfix: major updates of convergence method --- batchglm/train/tf2/base_glm/estimator.py | 84 +++++++++++++++--------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 6cd4aa03..cb745c23 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -133,10 +133,7 @@ def convergence_decision(convergence_status, train_step): update_func = optimizer_object.apply_gradients n_obs = self._input_data.num_observations - curr_norm_loc = np.sqrt(np.sum(np.square( - np.abs(self.model.params.numpy()[self.model.model_vars.idx_train_loc, :])), axis=0)) - curr_norm_scale = np.sqrt(np.sum(np.square( - np.abs(self.model.params.numpy()[self.model.model_vars.idx_train_scale, :])), axis=0)) + prev_params = self.model.params.numpy() batch_features = False while convergence_decision(converged_current, train_step): @@ -243,14 +240,14 @@ def convergence_decision(convergence_status, train_step): convergences = self.calculate_convergence( converged_prev, ll_prev, - prev_norm_loc, - prev_norm_scale, ll_current, + prev_params, jac_normalization, grad_numpy, features_updated, optimizer_object ) + prev_params = self.model.params.numpy() #converged_current, converged_f, converged_g, converged_x = convergences converged_current = convergences[0] self.model.model_vars.convergence_update(converged_current, features_updated) @@ -347,7 +344,7 @@ def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converge return x_batch - def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_norm_scale, ll_current, + def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params, jac_normalization, grad_numpy, features_updated, optimizer_object): """ Wrapper method to perform all necessary convergence checks. 
@@ -363,7 +360,7 @@ def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_nor ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f - total_converged |= ll_converged + total_converged |= epoch_ll_converged """ Now getting convergence based on change in gradient below threshold: @@ -376,7 +373,7 @@ def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_nor grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE - grad_converged = grad_norm_loc_converged & grad_norm_scale_converged + grad_converged = grad_norm_loc_converged & grad_norm_scale_converged & features_updated epoch_grad_converged = not_converged_prev & grad_converged # formerly known as converged_g total_converged |= grad_converged @@ -384,36 +381,22 @@ def calculate_convergence(self, converged_prev, ll_prev, prev_norm_loc, prev_nor """ Now getting convergence based on change of coefficients below threshold: """ - def calc_x_step(idx_train, prev_norm): - if len(idx_train) > 0: - curr_norm = np.sqrt(np.sum(np.square( - np.abs(self.model.params.numpy()[idx_train, :]) - ), axis=0)) - return np.abs(curr_norm - prev_norm) - else: - return np.zeros([self.model.model_vars.n_features]) + np.nextafter(np.inf, 0, dtype=self.dtype) - - x_norm_loc = calc_x_step(self.model.model_vars.idx_train_loc, prev_norm_loc) - x_norm_scale = calc_x_step(self.model.model_vars.idx_train_scale, prev_norm_scale) - x_norm_loc_converged = x_norm_loc < pkg_constants.XTOL_BY_FEATURE_LOC - x_norm_scale_converged = x_norm_scale < pkg_constants.XTOL_BY_FEATURE_SCALE + x_step_converged = self.calc_x_step(prev_params, features_updated) + epoch_step_converged = not_converged_prev & x_step_converged - step_converged = x_norm_loc_converged & x_norm_scale_converged - epoch_step_converged = not_converged_prev & step_converged - - total_converged |= step_converged """ In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. - For now it must not be below the threshold for the X step of the scale model. + For now it must not be below the threshold for the X step of the loc model. 
""" if hasattr(optimizer_object, 'trusted_region_mode') and optimizer_object.trusted_region_mode: - converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.XTOL_BY_FEATURE_LOC if hasattr(optimizer_object, 'tr_radius_b') and self._train_scale: - converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.XTOL_BY_FEATURE_SCALE epoch_tr_converged = not_converged_prev & converged_tr - total_converged |= epoch_tr_converged - return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged, epoch_tr_converged + epoch_step_converged |= epoch_tr_converged + + total_converged |= epoch_step_converged return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged @@ -550,3 +533,42 @@ def get_init_from_model(init_a, init_b, input_data, init_model): @abc.abstractmethod def get_model_container(self, input_data): pass + + def calc_x_step(self, prev_params, features_updated): + + def get_norm_converged(model: str, prev_params): + if model == 'loc': + idx_train = self.model.model_vars.idx_train_loc + XTOL = pkg_constants.XTOL_BY_FEATURE_LOC + elif model == 'scale': + idx_train = self.model.model_vars.idx_train_scale + XTOL = pkg_constants.XTOL_BY_FEATURE_SCALE + else: + assert False, "Supply either 'loc' or 'scale'!" + x_step = self.model.params.numpy() - prev_params + x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) + return x_norm < XTOL + + """ + We use a trick here: First we set both the loc and scale convergence to True. + It is not necessary to use an array with length = number of features, since bitwise + AND also works with a single boolean. + """ + loc_conv = np.bool_(True) + scale_conv = np.bool_(True) + + """ + Now we check which models need to be trained. E.g. if you are using quick_scale = True, + self._train_scale will be False and so the above single True value will be used. + """ + if self._train_loc: + loc_conv = get_norm_converged('loc', prev_params) + if self._train_scale: + scale_conv = get_norm_converged('scale', prev_params) + + """ + Finally, we check that only features updated in this epoch can evaluate to True. + This is only a problem for 2nd order optims with trusted region mode, since it might occur, + that a feature isn't updated, so the x_step is zero although not yet converged. 
+ """ + return loc_conv & scale_conv & features_updated From 2e17945b6c611ceb575d071a14bdec167714d95d Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 14 Jan 2020 21:31:53 +0100 Subject: [PATCH 024/124] bugfix: remove curr/prev_norm_lo/scale --- batchglm/train/tf2/base_glm/estimator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index cb745c23..1fc7752a 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -209,8 +209,6 @@ def convergence_decision(convergence_status, train_step): self.values.append(self.model.trainable_variables[0].numpy().copy()) # Update converged status - prev_norm_loc = curr_norm_loc.copy() - prev_norm_scale = curr_norm_scale.copy() converged_prev = converged_current.copy() ll_current = self.loss.norm_neg_log_likelihood(results[0]).numpy() From c4bcdb37cb9a05c2194ca5b2e5f2c3e01d0ec8ed Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 14 Jan 2020 22:18:57 +0100 Subject: [PATCH 025/124] bugfix: logging pattern --- batchglm/train/tf2/base_glm/estimator.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 1fc7752a..3cf277df 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -145,7 +145,7 @@ def convergence_decision(convergence_status, train_step): not_converged = np.logical_not(self.model.model_vars.converged) ll_prev = ll_current.copy() #if train_step % 10 == 0: - logger.warning('step %i: loss: %s', train_step, np.array2string(ll_current[0:10])) + logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) if not is_batched: results = None @@ -254,22 +254,14 @@ def convergence_decision(convergence_status, train_step): if featurewise and not batch_features: batch_features = True self.model.batch_features = batch_features - logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i" - logger_data = [ + logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)" + logger.warning( + logger_pattern, train_step, np.sum(ll_current), num_converged.astype("int32"), np.sum(features_updated).astype("int32"), *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] - ] - if (irls_algo or nr_algo) and optimizer_object.trusted_region_mode: - logger_pattern += " tr: %i)" - else: - logger_pattern += ")" - - logger.warning( - logger_pattern, - *logger_data ) train_step += 1 if benchmark: From f2bd91b1d7a61b2734a73dd6902416f8bca30593 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Jan 2020 03:13:48 +0100 Subject: [PATCH 026/124] removed shuffling from full data model --- batchglm/train/tf2/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 3cf277df..51cba419 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -94,7 +94,7 @@ def _train( if is_batched: data = data_ids.shuffle(buffer_size=2 * batch_size).repeat().batch(batch_size) else: - data = data_ids.shuffle(buffer_size=2 * batch_size).batch(batch_size, drop_remainder=True) + data = data_ids.batch(batch_size, drop_remainder=True) input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) # Iterate until conditions are 
fulfilled. From 42181c712dc16e616be364bddf58983da68c9a30 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Jan 2020 19:56:55 +0100 Subject: [PATCH 027/124] bugfix: missing inversion of hessian in last run --- batchglm/train/tf2/base_glm/estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 51cba419..62de0e3e 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -296,8 +296,8 @@ def convergence_decision(convergence_status, train_step): self._hessian = results[2].numpy() self._jacobian = results[1].numpy() elif irls_algo: - # TODO: maybe report fisher inv here. But concatenation only works if !intercept_scale - self._fisher_inv = results[2].numpy() + # TODO: maybe report fisher inf here. But concatenation only works if !intercept_scale + self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._jacobian = results[1].numpy() # change back to FIM mode From 183a19f819111a6886d25518bee3c0894c42797c Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Jan 2020 22:14:05 +0100 Subject: [PATCH 028/124] bugfix: final run should return non neg ll --- batchglm/train/tf2/base_glm/estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 62de0e3e..fda0a771 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -271,7 +271,6 @@ def convergence_decision(convergence_status, train_step): # Evaluate final params logger.warning("Final Evaluation run.") - self._log_likelihood = results[0].numpy() self._fisher_inv = tf.zeros(shape=()).numpy() self._hessian = tf.zeros(shape=()).numpy() self.model.batch_features = False @@ -292,6 +291,8 @@ def convergence_decision(convergence_status, train_step): for i, x in enumerate(current_results): results[i] += x + self._log_likelihood = self.loss.norm_log_likelihood(results[0].numpy()) + if nr_algo: self._hessian = results[2].numpy() self._jacobian = results[1].numpy() From 10e10c2dd05e3b48d0be945db16cb1f153511fe8 Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 16 Jan 2020 23:10:34 +0100 Subject: [PATCH 029/124] bugfix: correct jacobian and hessians in final run --- batchglm/train/tf2/base_glm/estimator.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index fda0a771..499b05b7 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -95,7 +95,7 @@ def _train( data = data_ids.shuffle(buffer_size=2 * batch_size).repeat().batch(batch_size) else: data = data_ids.batch(batch_size, drop_remainder=True) - input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) + input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS).prefetch(2) # Iterate until conditions are fulfilled. 
train_step = 0 @@ -146,12 +146,12 @@ def convergence_decision(convergence_status, train_step): ll_prev = ll_current.copy() #if train_step % 10 == 0: logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) - if not is_batched: results = None x_batch = None first_batch = True for x_batch_tuple in input_list: + x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) current_results = self.model(x_batch) if first_batch: @@ -160,7 +160,6 @@ def convergence_decision(convergence_status, train_step): else: for i, x in enumerate(current_results): results[i] += x - else: x_batch_tuple = next(dataset_iterator) x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) @@ -245,6 +244,7 @@ def convergence_decision(convergence_status, train_step): features_updated, optimizer_object ) + prev_params = self.model.params.numpy() #converged_current, converged_f, converged_g, converged_x = convergences converged_current = convergences[0] @@ -292,21 +292,21 @@ def convergence_decision(convergence_status, train_step): results[i] += x self._log_likelihood = self.loss.norm_log_likelihood(results[0].numpy()) + self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) if nr_algo: - self._hessian = results[2].numpy() - self._jacobian = results[1].numpy() + self._hessian = -results[2].numpy() + elif irls_algo: # TODO: maybe report fisher inf here. But concatenation only works if !intercept_scale self._fisher_inv = tf.linalg.inv(results[2]).numpy() - self._jacobian = results[1].numpy() + self._hessian = -results[2].numpy() # change back to FIM mode self.model.calc_fim = True self.model.calc_hessian = False self.model.concat_grads = False - else: - self._jacobian = results[1].numpy() + self.model.batch_features = batch_features From db803b59991d4eddc695a347a4a667cca0edf8c9 Mon Sep 17 00:00:00 2001 From: kikky Date: Thu, 16 Jan 2020 23:51:03 +0100 Subject: [PATCH 030/124] Custom Generator implemented (testing) --- batchglm/train/tf2/base_glm/estimator.py | 56 ++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index fda0a771..30a8261e 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -88,6 +88,9 @@ def _train( self.noise_model = noise_model # Slice data and create batches + """ + with map + """ data_ids = tf.data.Dataset.from_tensor_slices( (tf.range(self._input_data.num_observations, name="sample_index", dtype=tf.dtypes.int64)) ) @@ -97,6 +100,21 @@ def _train( data = data_ids.batch(batch_size, drop_remainder=True) input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS) + """ + with custom Generator + """ + custom_generator = self.Data_Generator(num_observations=self.input_data.num_observations, + input_data=self._input_data, + batch_size=batch_size, + drop_remainder=True) + + dataset = tf.data.Dataset.from_generator( + generator=custom_generator, + output_types=(self._input_data.x.dtype, self._input_data.design_loc.dtype, + self._input_data.design_scale.dtype, self._input_data.size_factors.dtype) + ) + # output_shapes = (tf.TensorShape([]), tf.TensorShape([None]))) + # Iterate until conditions are fulfilled. 
train_step = 0 @@ -491,6 +509,44 @@ def fetch_fn(self, idx): # feature batching return x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor + class Data_Generator: + def __init__(self, + num_observations, + input_data, + batch_size: int = 1000, + drop_remainder: bool = True): + self.num_observations = num_observations + self.input_data = input_data + self.batch_size = batch_size + self.drop_remainder = drop_remainder + + def __next__(self): + data = np.random.shuffle(self.num_observations) + for id in range(0, self.num_observations, self.batch_size): + """ + Get data only if it is not the last batch while + drop_remainder is set on True. + """ + if not ((id+self.batch_size) > self.num_observations and self.drop_remainder): + """ + Generate data with size = batch_size or + generate smaller data for remaining data (if smaller than batch_size) + """ + if (id+self.batch_size) < self.num_observations: + idx = data[id:(id+self.batch_size)] + else: + idx = data[id:self.num_observations] + + if isinstance(self.input_data.x, scipy.sparse.csr_matrix): + x_tensor_idx, x_tensor_val, x = self.input_data.fetch_x_sparse([idx]) + x_tensor = tf.SparseTensor(x_tensor_idx, x_tensor_val, x) + else: + x_tensor =self.input_data.fetch_x_dense([idx]) + design_loc_tensor = self.input_data.fetch_design_loc([idx]) + design_scale_tensor = self.input_data.fetch_design_scale([idx]) + size_factors_tensor = self.input_data.fetch_size_factors([idx]) + yield x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor + @staticmethod def get_init_from_model(init_a, init_b, input_data, init_model): # Locations model: From 6d907c5fbe4c8cec8047ed942ce5dc875bc47d41 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 17 Jan 2020 01:58:05 +0100 Subject: [PATCH 031/124] cleanup in model, support multiple optim runs --- batchglm/train/tf2/base_glm/estimator.py | 23 ++------ batchglm/train/tf2/base_glm/model.py | 72 ++++++------------------ batchglm/train/tf2/glm_beta/estimator.py | 3 + batchglm/train/tf2/glm_nb/estimator.py | 2 + batchglm/train/tf2/glm_norm/estimator.py | 2 + 5 files changed, 29 insertions(+), 73 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 499b05b7..d394c7ab 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -271,15 +271,10 @@ def convergence_decision(convergence_status, train_step): # Evaluate final params logger.warning("Final Evaluation run.") - self._fisher_inv = tf.zeros(shape=()).numpy() - self._hessian = tf.zeros(shape=()).numpy() self.model.batch_features = False # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv - if irls_algo: - self.model.calc_fim = False - self.model.calc_hessian = True - self.model.concat_grads = True + self.model.setMethod('nr_tr') first_batch = True for x_batch_tuple in input_list: @@ -294,19 +289,9 @@ def convergence_decision(convergence_status, train_step): self._log_likelihood = self.loss.norm_log_likelihood(results[0].numpy()) self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) - if nr_algo: - self._hessian = -results[2].numpy() - - elif irls_algo: - # TODO: maybe report fisher inf here. 
But concatenation only works if !intercept_scale - self._fisher_inv = tf.linalg.inv(results[2]).numpy() - self._hessian = -results[2].numpy() - - # change back to FIM mode - self.model.calc_fim = True - self.model.calc_hessian = False - self.model.concat_grads = False - + # TODO: maybe report fisher inf here. But concatenation only works if !intercept_scale + self._fisher_inv = tf.linalg.inv(results[2]).numpy() + self._hessian = -results[2].numpy() self.model.batch_features = batch_features diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 732aa1c7..f28b221b 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -53,25 +53,19 @@ def __init__( self.params_copy = None self.batch_features = False - self.calc_jacobian = False - self.calc_hessian = False - self.calc_fim = False - self.concat_grads = True + self.setMethod(optimizer) - self._setParams(optimizer) - - def _setParams(self, optimizer): + def setMethod(self, optimizer): optimizer = optimizer.lower() if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: - self.calc_jacobian = True + self._calc = self._return_jacobians elif optimizer in ['nr', 'nr_tr']: - self.calc_hessian = True + self._calc = self._calc_hessians elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr']: - self.calc_fim = True - self.concat_grads = False + self._calc = self._calc_fim def _call_parameters(self, inputs, keep_previous_params_copy=False): if not keep_previous_params_copy: @@ -95,7 +89,10 @@ def calc_ll(self, inputs, keep_previous_params_copy=False): log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) return (log_probs, *parameters[2:]) - def _calc_jacobians(self, inputs, concat, transpose=True): + def _return_jacobians(self, inputs): + return self._calc_jacobians(inputs)[-2:] + + def _calc_jacobians(self, inputs, concat=True, transpose=True): """ calculates jacobian. @@ -156,14 +153,9 @@ def _calc_jacobians(self, inputs, concat, transpose=True): return loc, scale, log_probs, tf.negative(jacobians) return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) - def calc_hessians(self, inputs, concat=False): + def _calc_hessians(self, inputs): # with tf.GradientTape(persistent=True) as g2: - if concat: - loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) - else: - loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) - # results_arr = [jacobians[:, i] for i in tf.range(self.params_copy.get_shape()[0])] - + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, transpose=False) ''' autograd not yet working. TODO: Search error in the following code: @@ -191,44 +183,16 @@ def calc_hessians(self, inputs, concat=False): ), perm=[2, 1, 0]) hessians = tf.negative(hessians) ''' - # else: - if concat: - hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) - return log_probs, jacobians, hessians + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) + return log_probs, jacobians, hessians - hes_aa, hes_ab, hes_ba, hes_bb = self.hessian([*inputs[0:3], loc, scale, False]) - return log_probs, jac_a, jac_b, tf.negative(hes_aa), \ - tf.negative(hes_ab), tf.negative(hes_ba), tf.negative(hes_bb) - # del g2 # need to delete this GradientTape because persistent is True. 
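setMethod above resolves the optimizer name to a single bound method once, so call() becomes one indirect call instead of a chain of boolean flags. The pattern in isolation, with illustrative class and method names that are not part of the batchglm API:

    class DispatchExample:
        """Resolve a mode string to a bound method once, then call it indirectly."""

        def set_method(self, optimizer: str):
            optimizer = optimizer.lower()
            if optimizer in ('gd', 'adam', 'adagrad', 'rmsprop'):
                self._calc = self._jacobians_only
            elif optimizer in ('nr', 'nr_tr'):
                self._calc = self._with_hessian
            elif optimizer in ('irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'):
                self._calc = self._with_fim
            else:
                raise ValueError("unrecognized optimizer: %s" % optimizer)

        def __call__(self, inputs):
            return self._calc(inputs)  # single indirect call, no flag checks

        def _jacobians_only(self, inputs):
            return "log_probs, jacobians"

        def _with_hessian(self, inputs):
            return "log_probs, jacobians, hessians"

        def _with_fim(self, inputs):
            return "log_probs, jac_a, jac_b, fim_a, fim_b"

    model = DispatchExample()
    model.set_method("irls_gd_tr")
    print(model(None))  # log_probs, jac_a, jac_b, fim_a, fim_b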
+ def _calc_fim(self, inputs): + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) + fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) + return log_probs, jac_a, jac_b, fim_a, fim_b def call(self, inputs, training=False, mask=None): - # X_data, design_loc, design_scale, size_factors = inputs - - # This is for first order optimizations, which get the full jacobian - - if self.calc_jacobian: - _, _, log_probs, jacobians = self._calc_jacobians(inputs, concat=self.concat_grads) - return log_probs, jacobians - - # This is for SecondOrder NR/NR_TR - if self.calc_hessian: - results = self.calc_hessians(inputs, concat=self.concat_grads) - return results - # This is for SecondOrder IRLS/IRLS_GD/IRLS_TR/IRLS_GD_TR - if self.calc_fim: - if self.concat_grads: - loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, concat=True, transpose=False) - fims = self.fim([*inputs[0:3], loc, scale, True]) - - return log_probs, tf.negative(jacobians), fims - else: - loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) - fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) - - return log_probs, jac_a, jac_b, fim_a, fim_b - - raise ValueError("No gradient calculation specified.") - + return self._calc(inputs) class LossGLM(LossBase): diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index bd266e02..57ae76e5 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -109,6 +109,9 @@ def train( use_gradient_tape=autograd, optimizer=optim_algo ) + else: + self.model.setMethod(optim_algo) + self._loss = LossGLMBeta() optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 3409e0ab..c7618cff 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -106,6 +106,8 @@ def train( use_gradient_tape=autograd, optimizer=optim_algo ) + else: + self.model.setMethod(optim_algo) self._loss = LossGLMNB() diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index 70de3c91..d92798ab 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -112,6 +112,8 @@ def train( use_gradient_tape=autograd, optimizer=optim_algo ) + else: + self.model.setMethod(optim_algo) self._loss = LossGLMNorm() From 98b2bf11b555ae8309da109d51ba62bd3273429a Mon Sep 17 00:00:00 2001 From: kikky Date: Sat, 18 Jan 2020 02:41:45 +0100 Subject: [PATCH 032/124] generator inside for loop --- batchglm/train/tf2/base_glm/estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 30a8261e..ec2760f6 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -137,6 +137,7 @@ def convergence_decision(convergence_status, train_step): ll_current = np.zeros([self._input_data.num_features], self.dtype) + np.nextafter(np.inf, 0, dtype=self.dtype) dataset_iterator = iter(input_list) + irls_algo = False nr_algo = False if optim_algo.lower() in ['nr','nr_tr']: @@ -169,7 +170,7 @@ def convergence_decision(convergence_status, train_step): results = None x_batch = None first_batch = True - for x_batch_tuple in input_list: + for x_batch_tuple in dataset: #input_list: x_batch = 
self.getModelInput(x_batch_tuple, batch_features, not_converged) current_results = self.model(x_batch) if first_batch: From a14c1dfb0e5e183d25d44c2371b20852264d2d0c Mon Sep 17 00:00:00 2001 From: kikky Date: Sat, 18 Jan 2020 13:21:21 +0100 Subject: [PATCH 033/124] changed generator outputtype --- batchglm/train/tf2/base_glm/estimator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 0bba6ba8..c14d25a7 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -107,11 +107,12 @@ def _train( input_data=self._input_data, batch_size=batch_size, drop_remainder=True) - dataset = tf.data.Dataset.from_generator( generator=custom_generator, - output_types=(self._input_data.x.dtype, self._input_data.design_loc.dtype, - self._input_data.design_scale.dtype, self._input_data.size_factors.dtype) + output_types=(self.dtype, self.dtype, + self.dtype, self.dtype) + #output_types=(self._input_data.x.dtype, self._input_data.design_loc.dtype, + # self._input_data.design_scale.dtype, self._input_data.size_factors.dtype) ) # output_shapes = (tf.TensorShape([]), tf.TensorShape([None]))) From b2a6fc06838628ee10e48d61ce2a5fb929a8f6e9 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 00:13:39 +0100 Subject: [PATCH 034/124] bugfixes in generator --- batchglm/train/tf2/base_glm/estimator.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index c14d25a7..0b8ed59a 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -108,7 +108,7 @@ def _train( batch_size=batch_size, drop_remainder=True) dataset = tf.data.Dataset.from_generator( - generator=custom_generator, + generator=custom_generator.generate, output_types=(self.dtype, self.dtype, self.dtype, self.dtype) #output_types=(self._input_data.x.dtype, self._input_data.design_loc.dtype, @@ -505,9 +505,9 @@ def __init__(self, self.input_data = input_data self.batch_size = batch_size self.drop_remainder = drop_remainder + self.data = np.random.permutation(self.num_observations) - def __next__(self): - data = np.random.shuffle(self.num_observations) + def generate(self): for id in range(0, self.num_observations, self.batch_size): """ Get data only if it is not the last batch while @@ -519,18 +519,18 @@ def __next__(self): generate smaller data for remaining data (if smaller than batch_size) """ if (id+self.batch_size) < self.num_observations: - idx = data[id:(id+self.batch_size)] + idx = self.data[id:(id+self.batch_size)] else: - idx = data[id:self.num_observations] - + idx = self.data[id:self.num_observations] if isinstance(self.input_data.x, scipy.sparse.csr_matrix): - x_tensor_idx, x_tensor_val, x = self.input_data.fetch_x_sparse([idx]) + x_tensor_idx, x_tensor_val, x = self.input_data.fetch_x_sparse(idx) x_tensor = tf.SparseTensor(x_tensor_idx, x_tensor_val, x) else: - x_tensor =self.input_data.fetch_x_dense([idx]) - design_loc_tensor = self.input_data.fetch_design_loc([idx]) - design_scale_tensor = self.input_data.fetch_design_scale([idx]) - size_factors_tensor = self.input_data.fetch_size_factors([idx]) + x_tensor =self.input_data.fetch_x_dense(idx) + design_loc_tensor = self.input_data.fetch_design_loc(idx) + design_scale_tensor = self.input_data.fetch_design_scale(idx) + if self.input_data.size_factors is not None: + 
size_factors_tensor = self.input_data.fetch_size_factors(idx) yield x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor @staticmethod From 4502566fa01757b312851c4a3706ae39945d8b4a Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 02:44:00 +0100 Subject: [PATCH 035/124] bugfixes in generator for full model --- batchglm/train/tf2/base_glm/estimator.py | 96 +++++++++++------------- 1 file changed, 42 insertions(+), 54 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 0b8ed59a..47537dd8 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -66,7 +66,7 @@ def _train( self, noise_model: str, is_batched: bool = True, - batch_size: int = 100, + batch_size: int = 1000, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), convergence_criteria: str = "step", stopping_criteria: int = 1000, @@ -75,8 +75,10 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): - if batch_size > self.input_data.num_observations: - batch_size = self.input_data.num_observations + + n_obs = self.input_data.num_observations + if batch_size > n_obs: + batch_size = n_obs if not self._initialized: raise RuntimeError("Cannot train the model: \ Estimator not initialized. Did you forget to call estimator.initialize() ?") @@ -86,34 +88,44 @@ def _train( Falling back to closed form. Only Jacobians are calculated using autograd.") self.noise_model = noise_model - # Slice data and create batches + sparse = isinstance(self.input_data.x, scipy.sparse.csr_matrix) + full_model = not is_batched + + def generate(): + """ + Generator for the full model. + We use max_obs to cut the observations with max_obs % batch_size = 0 to ensure consistent + sizes of tensors. 
+ """ + fetch_size_factors = self._input_data.size_factors is not None and self.noise_model in ["nb", "norm"] + + if full_model: + max_obs = n_obs - (n_obs % batch_size) + obs_pool = np.arange(max_obs) + else: + max_obs = n_obs + obs_pool = np.random.permutation(n_obs) - """ - with map - """ - data_ids = tf.data.Dataset.from_tensor_slices( - (tf.range(self._input_data.num_observations, name="sample_index", dtype=tf.dtypes.int64)) - ) - if is_batched: - data = data_ids.shuffle(buffer_size=2 * batch_size).repeat().batch(batch_size) - else: - data = data_ids.batch(batch_size, drop_remainder=True) - input_list = data.map(self.fetch_fn, num_parallel_calls=pkg_constants.TF_NUM_THREADS).prefetch(2) + for id in range(0, max_obs, batch_size): + idx = obs_pool[id: id + batch_size] # numpy returns just n_obs if id + batch_size > n_obs + x = self.input_data.fetch_x_sparse(idx) if sparse else self.input_data.fetch_x_dense(idx) + dloc = self.input_data.fetch_design_loc(idx) + dscale = self.input_data.fetch_design_scale(idx) + size_factors = self.input_data.fetch_size_factors(idx) if fetch_size_factors else 1 + + yield x, dloc, dscale, size_factors + + dtp = self.dtype + output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 + + dataset = tf.data.Dataset.from_generator(generator=generate, output_types=output_types) """ - with custom Generator + Workaround for sparse x tensor according to: + https://github.com/tensorflow/tensorflow/issues/16689#issuecomment-362662437 """ - custom_generator = self.Data_Generator(num_observations=self.input_data.num_observations, - input_data=self._input_data, - batch_size=batch_size, - drop_remainder=True) - dataset = tf.data.Dataset.from_generator( - generator=custom_generator.generate, - output_types=(self.dtype, self.dtype, - self.dtype, self.dtype) - #output_types=(self._input_data.x.dtype, self._input_data.design_loc.dtype, - # self._input_data.design_scale.dtype, self._input_data.size_factors.dtype) - ) + if sparse: + dataset = dataset.map(lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf)) # output_shapes = (tf.TensorShape([]), tf.TensorShape([None]))) # Iterate until conditions are fulfilled. @@ -137,7 +149,7 @@ def convergence_decision(convergence_status, train_step): # fill with highest possible number: ll_current = np.zeros([self._input_data.num_features], self.dtype) + np.nextafter(np.inf, 0, dtype=self.dtype) - dataset_iterator = iter(input_list) + dataset_iterator = iter(dataset) irls_algo = False nr_algo = False @@ -166,7 +178,7 @@ def convergence_decision(convergence_status, train_step): ll_prev = ll_current.copy() #if train_step % 10 == 0: logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) - if not is_batched: + if full_model: results = None x_batch = None first_batch = True @@ -506,32 +518,8 @@ def __init__(self, self.batch_size = batch_size self.drop_remainder = drop_remainder self.data = np.random.permutation(self.num_observations) + self.fetch_size_factors = self.input_data.size_factors is not None and self.noise_model in ["nb", "norm"] - def generate(self): - for id in range(0, self.num_observations, self.batch_size): - """ - Get data only if it is not the last batch while - drop_remainder is set on True. 
- """ - if not ((id+self.batch_size) > self.num_observations and self.drop_remainder): - """ - Generate data with size = batch_size or - generate smaller data for remaining data (if smaller than batch_size) - """ - if (id+self.batch_size) < self.num_observations: - idx = self.data[id:(id+self.batch_size)] - else: - idx = self.data[id:self.num_observations] - if isinstance(self.input_data.x, scipy.sparse.csr_matrix): - x_tensor_idx, x_tensor_val, x = self.input_data.fetch_x_sparse(idx) - x_tensor = tf.SparseTensor(x_tensor_idx, x_tensor_val, x) - else: - x_tensor =self.input_data.fetch_x_dense(idx) - design_loc_tensor = self.input_data.fetch_design_loc(idx) - design_scale_tensor = self.input_data.fetch_design_scale(idx) - if self.input_data.size_factors is not None: - size_factors_tensor = self.input_data.fetch_size_factors(idx) - yield x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor @staticmethod def get_init_from_model(init_a, init_b, input_data, init_model): From c421139689073b9ba1876206e12186087ee2d222 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 02:47:56 +0100 Subject: [PATCH 036/124] rm additional n_obs assignment --- batchglm/train/tf2/base_glm/estimator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 47537dd8..239e55a2 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -163,7 +163,6 @@ def convergence_decision(convergence_status, train_step): else: update_func = optimizer_object.apply_gradients - n_obs = self._input_data.num_observations prev_params = self.model.params.numpy() From 9ec92cb0f3cf0626edadcceb402bfad11bd1c054 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 03:42:09 +0100 Subject: [PATCH 037/124] cleanup train method --- batchglm/train/tf2/base_glm/estimator.py | 57 ++++++++---------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 239e55a2..26e92dbd 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -76,7 +76,13 @@ def _train( optim_algo: str = "adam" ): + conv_step = lambda x: np.all(x) + conv_all = lambda x, y: np.any(x) and y < stopping_criteria + assert convergence_criteria in ["step", "all_converged"], ("Unrecognized convergence criteria %s", convergence_criteria) + convergence_decision = conv_step if convergence_criteria == "step" else conv_all + n_obs = self.input_data.num_observations + n_features = self.model.model_vars.num_features if batch_size > n_obs: batch_size = n_obs if not self._initialized: @@ -106,8 +112,8 @@ def generate(): max_obs = n_obs obs_pool = np.random.permutation(n_obs) - for id in range(0, max_obs, batch_size): - idx = obs_pool[id: id + batch_size] # numpy returns just n_obs if id + batch_size > n_obs + for x in range(0, max_obs, batch_size): + idx = obs_pool[x: x + batch_size] # numpy automatically returns only id:id+n_obs if out of range x = self.input_data.fetch_x_sparse(idx) if sparse else self.input_data.fetch_x_dense(idx) dloc = self.input_data.fetch_design_loc(idx) @@ -118,65 +124,38 @@ def generate(): dtp = self.dtype output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 - dataset = tf.data.Dataset.from_generator(generator=generate, output_types=output_types) - """ - Workaround for sparse x tensor according to: - 
https://github.com/tensorflow/tensorflow/issues/16689#issuecomment-362662437 - """ if sparse: dataset = dataset.map(lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf)) - # output_shapes = (tf.TensorShape([]), tf.TensorShape([None]))) - # Iterate until conditions are fulfilled. - train_step = 0 + # Set all to convergence status = False, this is needed if multiple # training strategies are run: - converged_current = np.repeat( - False, repeats=self.model.model_vars.n_features) - - def convergence_decision(convergence_status, train_step): - if convergence_criteria == "step": - return train_step < stopping_criteria - elif convergence_criteria == "all_converged": - return np.any(np.logical_not(convergence_status)) - elif convergence_criteria == "both": - return np.any(np.logical_not(convergence_status)) and train_step < stopping_criteria - else: - raise ValueError("convergence_criteria %s not recognized." % convergence_criteria) + converged_current = np.zeros(n_features, dtype=np.bool) - # fill with highest possible number: - ll_current = np.zeros([self._input_data.num_features], self.dtype) + np.nextafter(np.inf, 0, dtype=self.dtype) + # fill with lowest possible number: + ll_current = np.nextafter(np.inf, np.zeros([self._input_data.num_features]), dtype=self.dtype) dataset_iterator = iter(dataset) - irls_algo = False - nr_algo = False - if optim_algo.lower() in ['nr','nr_tr']: - nr_algo = True - update_func = optimizer_object.perform_parameter_update - - elif optim_algo.lower() in ['irls','irls_tr','irls_gd','irls_gd_tr']: - irls_algo = True - update_func = optimizer_object.perform_parameter_update + irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] + nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] - else: - update_func = optimizer_object.apply_gradients + update_func = optimizer_object.perform_parameter_update if irls_algo or nr_algo else optimizer_object.apply_gradients prev_params = self.model.params.numpy() batch_features = False + train_step = 0 while convergence_decision(converged_current, train_step): # ### Iterate over the batches of the dataset. 
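The ll_current initialisation a few lines above relies on np.nextafter stepping from +inf towards zero, which yields the largest finite value of the chosen dtype, i.e. the worst possible starting negative log-likelihood for every feature. A two-line illustration in plain numpy:

    import numpy as np

    print(np.nextafter(np.inf, 0, dtype=np.float64) == np.finfo(np.float64).max)  # True
    print(np.nextafter(np.inf, 0, dtype=np.float32))                              # 3.4028235e+38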
# x_batch is a tuple (idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor)) if benchmark: t0_epoch = time.time() - not_converged = np.logical_not(self.model.model_vars.converged) + not_converged = ~ self.model.model_vars.converged ll_prev = ll_current.copy() - #if train_step % 10 == 0: - logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) if full_model: results = None x_batch = None @@ -293,6 +272,8 @@ def convergence_decision(convergence_status, train_step): np.sum(features_updated).astype("int32"), *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] ) + else: + logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) train_step += 1 if benchmark: t1_epoch = time.time() From 70955f00dc8adfdeeaa8418b6bb773e1ab218f80 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 04:02:57 +0100 Subject: [PATCH 038/124] further cleanup and bugfixes --- batchglm/train/tf2/base_glm/estimator.py | 118 +++-------------------- 1 file changed, 13 insertions(+), 105 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 26e92dbd..cd700ebd 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -48,19 +48,8 @@ def __init__( input_data, dtype, ): - - self._input_data = input_data - - TFEstimator.__init__( - self=self, - input_data=input_data, - dtype=dtype, - ) - _EstimatorGLM.__init__( - self=self, - model=None, - input_data=input_data - ) + TFEstimator.__init__(self=self, input_data=input_data, dtype=dtype) + _EstimatorGLM.__init__(self=self, model=None, input_data=input_data) def _train( self, @@ -76,13 +65,13 @@ def _train( optim_algo: str = "adam" ): - conv_step = lambda x: np.all(x) - conv_all = lambda x, y: np.any(x) and y < stopping_criteria + conv_step = lambda x, y: not np.all(x) + conv_all = lambda x, y: not np.all(x) and y < stopping_criteria assert convergence_criteria in ["step", "all_converged"], ("Unrecognized convergence criteria %s", convergence_criteria) convergence_decision = conv_step if convergence_criteria == "step" else conv_all n_obs = self.input_data.num_observations - n_features = self.model.model_vars.num_features + n_features = self.input_data.num_features if batch_size > n_obs: batch_size = n_obs if not self._initialized: @@ -135,7 +124,7 @@ def generate(): converged_current = np.zeros(n_features, dtype=np.bool) # fill with lowest possible number: - ll_current = np.nextafter(np.inf, np.zeros([self._input_data.num_features]), dtype=self.dtype) + ll_current = np.nextafter(np.inf, np.zeros(n_features), dtype=self.dtype) dataset_iterator = iter(dataset) @@ -148,9 +137,9 @@ def generate(): batch_features = False train_step = 0 + while convergence_decision(converged_current, train_step): - # ### Iterate over the batches of the dataset. 
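For the full model the epoch loop above folds the per-batch model outputs into a running sum: the first batch initialises results, every later batch is added element-wise, so the epoch ends with summed log-likelihoods and gradients. The same pattern on toy tensors (fake_model and its two outputs are illustrative stand-ins for self.model and its return values):

    import tensorflow as tf

    def fake_model(batch):
        # stands in for self.model(x_batch), e.g. (log_probs, jacobian)
        return tf.reduce_sum(batch), tf.ones(3)

    batches = [tf.constant([1.0, 2.0]), tf.constant([3.0]), tf.constant([4.0, 5.0])]

    results = None
    for i, batch in enumerate(batches):
        current = fake_model(batch)
        if i == 0:
            results = list(current)
        else:
            results = [tf.math.add(r, c) for r, c in zip(results, current)]

    print(results[0].numpy())  # 15.0, summed over all batches
    print(results[1].numpy())  # [3. 3. 3.]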
- # x_batch is a tuple (idx, (X_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor)) + if benchmark: t0_epoch = time.time() @@ -160,7 +149,7 @@ def generate(): results = None x_batch = None first_batch = True - for x_batch_tuple in dataset: #input_list: + for x_batch_tuple in dataset: x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) current_results = self.model(x_batch) if first_batch: @@ -206,7 +195,7 @@ def generate(): update_var = tf.transpose(tf.scatter_nd( indices, tf.transpose(results[1]), - shape=(self.model.model_vars.n_features, results[1].get_shape()[0]) + shape=(n_features, results[1].get_shape()[0]) )) else: update_var = results[1] @@ -228,7 +217,7 @@ def generate(): if is_batched: jac_normalization = batch_size else: - jac_normalization = self._input_data.num_observations + jac_normalization = n_obs if irls_algo: grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) elif nr_algo: @@ -240,7 +229,7 @@ def generate(): grad_numpy = tf.scatter_nd( indices, grad_numpy, - shape=(self.model.model_vars.n_features, self.model.params.get_shape()[0]) + shape=(n_features, self.model.params.get_shape()[0]) ) grad_numpy = grad_numpy.numpy() convergences = self.calculate_convergence( @@ -288,7 +277,7 @@ def generate(): self.model.setMethod('nr_tr') first_batch = True - for x_batch_tuple in input_list: + for x_batch_tuple in dataset: current_results = self.model(x_batch_tuple) if first_batch: results = list(current_results) @@ -420,87 +409,6 @@ def get_optimizer_object(self, optimizer: str, learning_rate): return optim_obj - def fetch_fn(self, idx): - """ - Documentation of tensorflow coding style in this function: - tf.py_func defines a python function (the getters of the InputData object slots) - as a tensorflow operation. Here, the shape of the tensor is lost and - has to be set with set_shape. For size factors, we use explicit broadcasting - as explained below. - """ - # Catch dimension collapse error if idx is only one element long, ie. 0D: - if len(idx.shape) == 0: - idx = tf.expand_dims(idx, axis=0) - - if isinstance(self._input_data.x, scipy.sparse.csr_matrix): - - x_tensor_idx, x_tensor_val, x = tf.py_function( - func=self._input_data.fetch_x_sparse, - inp=[idx], - Tout=[np.int64, np.float64, np.int64], - ) - # Note on Tout: np.float64 for val seems to be required to avoid crashing v1.12. 
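The fetch_fn being deleted here documents the main drawback of the tf.py_function approach: inside graph code (for example a dataset.map) the static shape of the py_function outputs is lost and has to be restored with set_shape. A minimal reproduction with toy data, where table and fetch are illustrative stand-ins for the InputData getters:

    import numpy as np
    import tensorflow as tf

    table = np.random.rand(100, 5).astype(np.float32)  # toy stand-in for InputData

    def fetch(idx):
        return table[idx.numpy()]                      # plain Python/numpy lookup

    def map_fn(idx):
        rows = tf.py_function(func=fetch, inp=[idx], Tout=tf.float32)
        print(rows.shape)          # <unknown> while tracing: py_function drops the shape
        rows.set_shape([None, 5])  # restore what is known statically
        return rows

    dataset = tf.data.Dataset.from_tensor_slices(np.arange(100, dtype=np.int64)) \
        .batch(32).map(map_fn)
    for batch in dataset:
        print(batch.shape)         # (32, 5), ..., then the (4, 5) remainder batch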
- x_tensor_idx = tf.cast(x_tensor_idx, dtype=tf.int64) - x = tf.cast(x, dtype=tf.int64) - x_tensor_val = tf.cast(x_tensor_val, dtype=self.dtype) - x_tensor = tf.SparseTensor(x_tensor_idx, x_tensor_val, x) - x_tensor = tf.cast(x_tensor, dtype=self.dtype) - - else: - - x_tensor = tf.py_function( - func=self._input_data.fetch_x_dense, - inp=[idx], - Tout=self._input_data.x.dtype, - ) - - x_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_features]) - x_tensor = tf.cast(x_tensor, dtype=self.dtype) - - design_loc_tensor = tf.py_function( - func=self._input_data.fetch_design_loc, - inp=[idx], - Tout=self._input_data.design_loc.dtype, - ) - design_loc_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_design_loc_params]) - design_loc_tensor = tf.cast(design_loc_tensor, dtype=self.dtype) - - design_scale_tensor = tf.py_function( - func=self._input_data.fetch_design_scale, - inp=[idx], - Tout=self._input_data.design_scale.dtype, - ) - design_scale_tensor.set_shape(idx.get_shape().as_list() + [self._input_data.num_design_scale_params]) - design_scale_tensor = tf.cast(design_scale_tensor, dtype=self.dtype) - - if self._input_data.size_factors is not None and self.noise_model in ["nb", "norm"]: - size_factors_tensor = tf.py_function( - func=self._input_data.fetch_size_factors, - inp=[idx], - Tout=self._input_data.size_factors.dtype, - ) - size_factors_tensor = tf.cast(size_factors_tensor, dtype=self.dtype) - - else: - size_factors_tensor = tf.constant(1, shape=[1, 1], dtype=self.dtype) - - # feature batching - return x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor - - class Data_Generator: - def __init__(self, - num_observations, - input_data, - batch_size: int = 1000, - drop_remainder: bool = True): - self.num_observations = num_observations - self.input_data = input_data - self.batch_size = batch_size - self.drop_remainder = drop_remainder - self.data = np.random.permutation(self.num_observations) - self.fetch_size_factors = self.input_data.size_factors is not None and self.noise_model in ["nb", "norm"] - - @staticmethod def get_init_from_model(init_a, init_b, input_data, init_model): # Locations model: From 3fc0e6279a06cf84bf4af83fd4c9d21b1c66a6ce Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 19 Jan 2020 07:02:03 +0100 Subject: [PATCH 039/124] bugfix: convergence decision --- batchglm/train/tf2/base_glm/estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index cd700ebd..4be9344a 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -65,8 +65,8 @@ def _train( optim_algo: str = "adam" ): - conv_step = lambda x, y: not np.all(x) - conv_all = lambda x, y: not np.all(x) and y < stopping_criteria + conv_all = lambda x, y: not np.all(x) + conv_step = lambda x, y: not np.all(x) and y < stopping_criteria assert convergence_criteria in ["step", "all_converged"], ("Unrecognized convergence criteria %s", convergence_criteria) convergence_decision = conv_step if convergence_criteria == "step" else conv_all From 936199179f7867ff81e7224559c7c5045dd9a242 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 20 Jan 2020 01:58:59 +0100 Subject: [PATCH 040/124] compute scale gradient in final run --- batchglm/train/tf2/base_glm/estimator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index d394c7ab..2722047a 
100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -275,6 +275,7 @@ def convergence_decision(convergence_status, train_step): # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv self.model.setMethod('nr_tr') + self.model.hessian.compute_b = True first_batch = True for x_batch_tuple in input_list: @@ -293,6 +294,7 @@ def convergence_decision(convergence_status, train_step): self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._hessian = -results[2].numpy() + self.model.hessian.compute_b = self.model.compute_b self.model.batch_features = batch_features def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converged): From a64c3ea9b3e96c083e09497802f1604fc3e78289 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 20 Jan 2020 03:58:17 +0100 Subject: [PATCH 041/124] resturcture train: support for batched mode --- batchglm/train/tf2/base_glm/estimator.py | 240 +++++++++++------------ 1 file changed, 118 insertions(+), 122 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4be9344a..13c31e3f 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -110,6 +110,7 @@ def generate(): size_factors = self.input_data.fetch_size_factors(idx) if fetch_size_factors else 1 yield x, dloc, dscale, size_factors + return dtp = self.dtype output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 @@ -118,7 +119,6 @@ def generate(): dataset = dataset.map(lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf)) - # Set all to convergence status = False, this is needed if multiple # training strategies are run: converged_current = np.zeros(n_features, dtype=np.bool) @@ -126,8 +126,6 @@ def generate(): # fill with lowest possible number: ll_current = np.nextafter(np.inf, np.zeros(n_features), dtype=self.dtype) - dataset_iterator = iter(dataset) - irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] @@ -137,6 +135,7 @@ def generate(): batch_features = False train_step = 0 + num_batches = n_obs // batch_size while convergence_decision(converged_current, train_step): @@ -145,129 +144,122 @@ def generate(): not_converged = ~ self.model.model_vars.converged ll_prev = ll_current.copy() - if full_model: - results = None - x_batch = None - first_batch = True - for x_batch_tuple in dataset: - x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) - current_results = self.model(x_batch) - if first_batch: - results = list(current_results) - first_batch = False - else: - for i, x in enumerate(current_results): - results[i] += x - else: - x_batch_tuple = next(dataset_iterator) + results = None + for i, x_batch_tuple in enumerate(dataset): x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) - results = self.model(x_batch) - - if irls_algo or nr_algo: - if irls_algo: - update_func( - [x_batch, *results, False, n_obs], - True, - False, - batch_features, - ll_prev - ) - if self._train_scale: - update_func( - [x_batch, *results, False, n_obs], - False, - True, - batch_features, - ll_prev - ) + current_results = self.model(x_batch) + if is_batched or i == 0: + results = current_results else: - update_func( - [x_batch, *results, False, n_obs], - True, - True, - batch_features, - ll_prev + results = [tf.math.add(results[i], x) for i, x in 
enumerate(current_results)] + + if is_batched or i == num_batches - 1: + + if irls_algo or nr_algo: + if irls_algo: + update_func( + [x_batch, *results, False, n_obs], + True, + False, + batch_features, + ll_prev + ) + if self._train_scale: + update_func( + [x_batch, *results, False, n_obs], + False, + True, + batch_features, + ll_prev + ) + else: + update_func( + [x_batch, *results, False, n_obs], + True, + True, + batch_features, + ll_prev + ) + features_updated = self.model.model_vars.updated + else: + if batch_features: + indices = tf.where(not_converged) + update_var = tf.transpose(tf.scatter_nd( + indices, + tf.transpose(results[1]), + shape=(n_features, results[1].get_shape()[0]) + )) + else: + update_var = results[1] + update_func([(update_var, self.model.params)]) + features_updated = not_converged + + if benchmark: + self.values.append(self.model.trainable_variables[0].numpy().copy()) + + # Update converged status + converged_prev = converged_current.copy() + ll_current = self.loss.norm_neg_log_likelihood(results[0]).numpy() + + if batch_features: + indices = tf.where(not_converged) + updated_lls = tf.scatter_nd(indices, ll_current, shape=ll_prev.shape) + ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) + + if is_batched: + jac_normalization = batch_size + else: + jac_normalization = n_obs + if irls_algo: + grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) + elif nr_algo: + grad_numpy = tf.abs(results[1]) + else: + grad_numpy = tf.abs(tf.transpose(results[1])) + if batch_features: + indices = tf.where(not_converged) + grad_numpy = tf.scatter_nd( + indices, + grad_numpy, + shape=(n_features, self.model.params.get_shape()[0]) + ) + grad_numpy = grad_numpy.numpy() + convergences = self.calculate_convergence( + converged_prev, + ll_prev, + ll_current, + prev_params, + jac_normalization, + grad_numpy, + features_updated, + optimizer_object ) - features_updated = self.model.model_vars.updated - else: - if batch_features: - indices = tf.where(not_converged) - update_var = tf.transpose(tf.scatter_nd( - indices, - tf.transpose(results[1]), - shape=(n_features, results[1].get_shape()[0]) - )) - else: - update_var = results[1] - update_func([(update_var, self.model.params)]) - features_updated = not_converged - if benchmark: - self.values.append(self.model.trainable_variables[0].numpy().copy()) - - # Update converged status - converged_prev = converged_current.copy() - ll_current = self.loss.norm_neg_log_likelihood(results[0]).numpy() - - if batch_features: - indices = tf.where(not_converged) - updated_lls = tf.scatter_nd(indices, ll_current, shape=ll_prev.shape) - ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) - - if is_batched: - jac_normalization = batch_size - else: - jac_normalization = n_obs - if irls_algo: - grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) - elif nr_algo: - grad_numpy = tf.abs(results[1]) - else: - grad_numpy = tf.abs(tf.transpose(results[1])) - if batch_features: - indices = tf.where(not_converged) - grad_numpy = tf.scatter_nd( - indices, - grad_numpy, - shape=(n_features, self.model.params.get_shape()[0]) - ) - grad_numpy = grad_numpy.numpy() - convergences = self.calculate_convergence( - converged_prev, - ll_prev, - ll_current, - prev_params, - jac_normalization, - grad_numpy, - features_updated, - optimizer_object - ) - - prev_params = self.model.params.numpy() - #converged_current, converged_f, converged_g, converged_x = convergences - converged_current = convergences[0] - 
self.model.model_vars.convergence_update(converged_current, features_updated) - num_converged = np.sum(converged_current) - if num_converged != np.sum(converged_prev): - if featurewise and not batch_features: - batch_features = True - self.model.batch_features = batch_features - logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)" - logger.warning( - logger_pattern, - train_step, - np.sum(ll_current), - num_converged.astype("int32"), - np.sum(features_updated).astype("int32"), - *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] - ) - else: - logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) - train_step += 1 - if benchmark: - t1_epoch = time.time() - self.times.append(t1_epoch-t0_epoch) - self.converged.append(num_converged) + prev_params = self.model.params.numpy() + #converged_current, converged_f, converged_g, converged_x = convergences + converged_current = convergences[0] + self.model.model_vars.convergence_update(converged_current, features_updated) + num_converged = np.sum(converged_current) + if num_converged != np.sum(converged_prev): + if featurewise and not batch_features: + batch_features = True + self.model.batch_features = batch_features + logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)" + logger.warning( + logger_pattern, + train_step, + np.sum(ll_current), + num_converged.astype("int32"), + np.sum(features_updated).astype("int32"), + *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] + ) + else: + logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) + train_step += 1 + if benchmark: + t1_epoch = time.time() + self.times.append(t1_epoch-t0_epoch) + self.converged.append(num_converged) # Evaluate final params logger.warning("Final Evaluation run.") @@ -286,6 +278,10 @@ def generate(): for i, x in enumerate(current_results): results[i] += x + for i, x_batch_tuple in enumerate(dataset): + current_results = self.model(x_batch_tuple) + results = current_results if i == 0 else [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + self._log_likelihood = self.loss.norm_log_likelihood(results[0].numpy()) self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) From a3ca215ded4125389eeec696f0f3dca824b5e71e Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 20 Jan 2020 12:43:52 +0100 Subject: [PATCH 042/124] always call input_data property not _input_data --- batchglm/train/tf2/base_glm/estimator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index f8ee23f9..8b02a9da 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -38,7 +38,7 @@ def finalize(self, **kwargs): """ a_var, b_var = self.model.unpack_params([self.model.params, self.model.model_vars.a_var.get_shape()[0]]) - self.model = self.get_model_container(self._input_data) + self.model = self.get_model_container(self.input_data) self.model._a_var = a_var.numpy() self.model._b_var = b_var.numpy() self._loss = tf.reduce_sum(np.negative(self._log_likelihood) / self.input_data.num_observations).numpy() @@ -92,7 +92,7 @@ def generate(): We use max_obs to cut the observations with max_obs % batch_size = 0 to ensure consistent sizes of tensors. 
""" - fetch_size_factors = self._input_data.size_factors is not None and self.noise_model in ["nb", "norm"] + fetch_size_factors = self.input_data.size_factors is not None and self.noise_model in ["nb", "norm"] if full_model: max_obs = n_obs - (n_obs % batch_size) @@ -300,7 +300,7 @@ def getModelInput(self, x_batch_tuple: tuple, batch_features: bool, not_converge """ if batch_features: x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple - if isinstance(self._input_data.x, scipy.sparse.csr_matrix): + if isinstance(self.input_data.x, scipy.sparse.csr_matrix): not_converged_idx = np.where(not_converged)[0] feature_columns = tf.sparse.split( x_tensor, From 8407af18b8c80efa189a8702c9f6784dc6a57bfd Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 21 Jan 2020 02:30:45 +0100 Subject: [PATCH 043/124] log updated and converged in each step --- batchglm/train/tf2/base_glm/estimator.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 8b02a9da..cd7c3bf3 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -154,7 +154,6 @@ def generate(): results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] if is_batched or i == num_batches - 1: - if irls_algo or nr_algo: if irls_algo: update_func( @@ -254,7 +253,7 @@ def generate(): *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] ) else: - logger.warning('step %i: loss: %f', train_step, np.sum(ll_current)) + logger.warning('step %i: loss: %f converged %i, updated %i', train_step, np.sum(ll_current), num_converged.astype("int32"), np.sum(features_updated).astype("int32")) train_step += 1 if benchmark: t1_epoch = time.time() @@ -333,7 +332,6 @@ def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params ll_difference = np.abs(ll_prev - ll_current) / ll_prev ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f - total_converged |= epoch_ll_converged """ @@ -363,15 +361,15 @@ def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. For now it must not be below the threshold for the X step of the loc model. 
""" + if hasattr(optimizer_object, 'trusted_region_mode') and optimizer_object.trusted_region_mode: - converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.XTOL_BY_FEATURE_LOC + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC if hasattr(optimizer_object, 'tr_radius_b') and self._train_scale: - converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.XTOL_BY_FEATURE_SCALE + converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE epoch_tr_converged = not_converged_prev & converged_tr epoch_step_converged |= epoch_tr_converged total_converged |= epoch_step_converged - return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged def get_optimizer_object(self, optimizer: str, learning_rate): From c9ee70f9a7ece2eaab4d4e7656a21519b0306ed8 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 5 Feb 2020 16:36:30 +0100 Subject: [PATCH 044/124] bugfixes: full model for 2nd order eval --- batchglm/train/tf2/base_glm/estimator.py | 78 ++++++------- batchglm/train/tf2/base_glm/model.py | 2 +- batchglm/train/tf2/base_glm/optim.py | 142 +++++++++++------------ 3 files changed, 108 insertions(+), 114 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index cd7c3bf3..5f93fd54 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -145,8 +145,14 @@ def generate(): not_converged = ~ self.model.model_vars.converged ll_prev = ll_current.copy() results = None - for i, x_batch_tuple in enumerate(dataset): - x_batch = self.getModelInput(x_batch_tuple, batch_features, not_converged) + x_batches = [] + for x_batch_tuple in dataset: + if batch_features: + x_batches.append(self.getModelInput(x_batch_tuple, not_converged)) + else: + x_batches.append(x_batch_tuple) + + for i, x_batch in enumerate(x_batches): current_results = self.model(x_batch) if is_batched or i == 0: results = current_results @@ -157,27 +163,25 @@ def generate(): if irls_algo or nr_algo: if irls_algo: update_func( - [x_batch, *results, False, n_obs], - True, - False, - batch_features, - ll_prev + inputs=[x_batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + compute_full_ll=is_batched ) if self._train_scale: update_func( - [x_batch, *results, False, n_obs], - False, - True, - batch_features, - ll_prev + inputs=[x_batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + compute_full_ll=is_batched ) else: update_func( - [x_batch, *results, False, n_obs], - True, - True, - batch_features, - ll_prev + inputs=[x_batches, *results], + batch_features=batch_features, + compute_full_ll=is_batched ) features_updated = self.model.model_vars.updated else: @@ -198,11 +202,11 @@ def generate(): # Update converged status converged_prev = converged_current.copy() - ll_current = self.loss.norm_neg_log_likelihood(results[0]).numpy() + ll_current = -results[0].numpy() / self.input_data.num_observations if batch_features: indices = tf.where(not_converged) - updated_lls = tf.scatter_nd(indices, ll_current, shape=ll_prev.shape) + updated_lls = tf.scatter_nd(indices, ll_current, shape=[n_features]) ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) if is_batched: @@ -292,28 +296,23 @@ def generate(): self.model.hessian.compute_b = self.model.compute_b self.model.batch_features = batch_features - def getModelInput(self, x_batch_tuple: tuple, 
batch_features: bool, not_converged): + def getModelInput(self, x_batch_tuple: tuple, not_converged): """ - Checks whether batch_features is true and returns a smaller x_batch tuple reduced - in feature space. Otherwise returns the x_batch. + Returns a smaller x_batch tuple reduced in feature space. """ - if batch_features: - x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple - if isinstance(self.input_data.x, scipy.sparse.csr_matrix): - not_converged_idx = np.where(not_converged)[0] - feature_columns = tf.sparse.split( - x_tensor, - num_split=self.model.model_vars.n_features, - axis=1) - feature_columns = [feature_columns[i] for i in not_converged_idx] - x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) - if not isinstance(x_tensor, tf.sparse.SparseTensor): - raise RuntimeError("x_tensor now dense!!!") - else: - x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) - x_batch = (x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) + x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple + if isinstance(self.input_data.x, scipy.sparse.csr_matrix): + not_converged_idx = np.where(not_converged)[0] + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.model.model_vars.n_features, + axis=1) + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + else: - x_batch = x_batch_tuple + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + x_batch = (x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) return x_batch @@ -393,7 +392,8 @@ def get_optimizer_object(self, optimizer: str, learning_rate): "dtype": self.dtype, "model": self.model, "name": optimizer, - "trusted_region_mode": tr_mode + "trusted_region_mode": tr_mode, + "n_obs": self.input_data.num_observations } if optimizer.startswith('irls'): optim_obj = IRLS(**init_dict) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index f28b221b..73c68658 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -85,8 +85,8 @@ def _call_parameters(self, inputs, keep_previous_params_copy=False): def calc_ll(self, inputs, keep_previous_params_copy=False): parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) - log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) + log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) def _return_jacobians(self, inputs): diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index ae50a2fa..211cc1df 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -11,11 +11,8 @@ class SecondOrderOptim(OptimizerBase, metaclass=abc.ABCMeta): Superclass for NR and IRLS """ - def _norm_log_likelihood(self, log_probs): - return tf.reduce_mean(log_probs, axis=0, name="log_likelihood") - def _norm_neg_log_likelihood(self, log_probs): - return - self._norm_log_likelihood(log_probs) + return - log_probs / self.n_obs def _resource_apply_dense(self, grad, handle, apply_state=None): @@ -38,14 +35,14 @@ def _create_slots(self, var_list): def _trust_region_ops( self, - x_batch, + x_batches, likelihood, proposed_vector, proposed_gain, compute_a, compute_b, batch_features, - ll_prev + compute_full_ll ): # Load hyper-parameters: assert pkg_constants.TRUST_REGION_ETA0 < 
pkg_constants.TRUST_REGION_ETA1, \ @@ -67,28 +64,44 @@ def _trust_region_ops( # Phase I: Perform a trial update. # Propose parameter update: + if compute_full_ll: + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] + if i == 0: + old_likelihood = log_likelihood + else: + old_likelihood += log_likelihood + else: + old_likelihood = likelihood + old_likelihood = self._norm_neg_log_likelihood(old_likelihood) self.model.params_copy.assign_sub(proposed_vector) # Phase II: Evaluate success of trial update and complete update cycle. # Include parameter updates only if update improves cost function: - new_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] - delta_f_actual = self._norm_neg_log_likelihood(likelihood) - self._norm_neg_log_likelihood(new_likelihood) + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] + if i == 0: + new_likelihood = log_likelihood + else: + new_likelihood += log_likelihood + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) - if batch_features: + delta_f_actual = old_likelihood - new_likelihood + if batch_features: + n_features = self.model.model_vars.n_features indices = tf.where(tf.logical_not(self.model.model_vars.converged)) - updated_lls = tf.scatter_nd(indices, delta_f_actual, shape=ll_prev.shape) - delta_f_actual = np.where(self.model.model_vars.converged, ll_prev, updated_lls.numpy()) + old_lls = tf.scatter_nd(self.model.model_vars.converged, old_likelihood, shape=tf.constant([n_features])) + delta_f_actual = tf.scatter_nd_update(old_lls, indices, delta_f_actual) update_var = tf.transpose(tf.scatter_nd( indices, tf.transpose(proposed_vector), - shape=(self.model.model_vars.n_features, proposed_vector.get_shape()[0]) + shape=(n_features, proposed_vector.get_shape()[0]) )) - gain_var = tf.transpose(tf.scatter_nd( indices, proposed_gain, - shape=([self.model.model_vars.n_features]))) + shape=([n_features]))) else: update_var = proposed_vector gain_var = proposed_gain @@ -102,7 +115,7 @@ def _trust_region_ops( params = tf.transpose(tf.scatter_nd( indices, tf.transpose(self.model.params_copy), - shape=(self.model.model_vars.n_features, self.model.params.get_shape()[0]) + shape=(n_features, self.model.params.get_shape()[0]) )) theta_new_tr = tf.add( @@ -110,9 +123,6 @@ def _trust_region_ops( tf.multiply(params, update_theta_numeric) ) - - #self.model.params.assign_(tf.multiply(params, update_theta_numeric)) - else: params = self.model.params_copy theta_new_tr = tf.add( @@ -147,42 +157,36 @@ def _trust_region_ops( radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) tr_radius.assign(radius_new) - def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str): - - self.model = model - self.gd = name in ['irls_gd', 'irls_gd_tr'] + def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str, n_obs: int): super(SecondOrderOptim, self).__init__(name) + self.model = model + self.gd = name in ['irls_gd', 'irls_gd_tr'] self._dtype = dtype + self.n_obs = tf.cast(n_obs, dtype=self._dtype) self.trusted_region_mode = trusted_region_mode - if trusted_region_mode: + if trusted_region_mode: + n_features = self.model.model_vars.n_features self.tr_radius = tf.Variable( - np.zeros(shape=[self.model.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, - dtype=self._dtype, 
trainable=False - ) + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False) if self.gd: self.tr_radius_b = tf.Variable( - np.zeros(shape=[self.model.model_vars.n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, - dtype=self._dtype, trainable=False - ) - - self.tr_ll_prev = tf.Variable(np.zeros(shape=[self.model.model_vars.n_features]), trainable=False) - self.tr_pred_gain = tf.Variable(np.zeros(shape=[self.model.model_vars.n_features]), trainable=False) - + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + dtype=self._dtype, trainable=False) else: - self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) @abc.abstractmethod def perform_parameter_update(self, inputs): pass - def _newton_type_update(self, lhs, rhs, psd): + def _newton_type_update(self, lhs, rhs, psd=False): new_rhs = tf.expand_dims(rhs, axis=-1) - res = tf.linalg.lstsq(lhs, new_rhs, fast=False) + res = tf.linalg.lstsq(lhs, new_rhs, fast=psd) delta_t = tf.squeeze(res, axis=-1) update_tensor = tf.transpose(delta_t) return update_tensor @@ -256,47 +260,45 @@ def _trust_region_newton_cost_gain( self, proposed_vector, neg_jac, - hessian_fim, - n_obs + hessian_fim ): pred_cost_gain = tf.add( tf.einsum( 'ni,in->n', neg_jac, proposed_vector - ) / n_obs, + ) / self.n_obs, 0.5 * tf.einsum( 'nix,xin->n', tf.einsum('inx,nij->njx', tf.expand_dims(proposed_vector, axis=-1), hessian_fim), tf.expand_dims(proposed_vector, axis=0) - ) / tf.square(n_obs) + ) / tf.square(self.n_obs) ) return pred_cost_gain class NR(SecondOrderOptim): - def _get_updates(self, lhs, rhs, psd, compute_a, compute_b): + def _get_updates(self, lhs, rhs, compute_a, compute_b): - update_raw = self._newton_type_update(lhs=lhs, rhs=rhs, psd=psd) + update_raw = self._newton_type_update(lhs=lhs, rhs=rhs) update = self._pad_updates(update_raw, compute_a, compute_b) return update_raw, update - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, prev_ll=None): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, compute_full_ll=False): - x_batch, log_probs, jacobians, hessians, psd, n_obs = inputs + x_batches, log_probs, jacobians, hessians = inputs if not (compute_a or compute_b): raise ValueError( "Nothing can be trained. 
Please make sure at least one of train_mu and train_r is set to True.") - update_raw, update = self._get_updates(hessians, jacobians, psd, compute_a, compute_b) + update_raw, update = self._get_updates(hessians, jacobians, compute_a, compute_b) if self.trusted_region_mode: - n_obs = tf.cast(n_obs, dtype=self._dtype) if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, @@ -310,8 +312,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch tr_pred_cost_gain = self._trust_region_newton_cost_gain( proposed_vector=tr_proposed_vector, neg_jac=jacobians, - hessian_fim=hessians, - n_obs=n_obs + hessian_fim=hessians ) tr_proposed_vector_pad = self._pad_updates( @@ -321,14 +322,14 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) self._trust_region_ops( - x_batch=x_batch, + x_batches=x_batches, likelihood=log_probs, proposed_vector=tr_proposed_vector_pad, proposed_gain=tr_pred_cost_gain, compute_a=compute_a, compute_b=compute_b, batch_features=batch_features, - ll_prev=prev_ll + compute_full_ll=compute_full_ll ) else: @@ -352,7 +353,6 @@ def _calc_proposed_vector_and_pred_cost_gain( self, update_x, radius_container, - n_obs, gd, neg_jac_x, fim_x=None @@ -363,8 +363,6 @@ def _calc_proposed_vector_and_pred_cost_gain( :param radius_container: tf.tensor ? x ? TODO - :param n_obs: ? TODO - Number of observations in current batch. :param gd: boolean If True, the proposed vector and predicted cost gain are calculated by linear functions related to IRLS_GD(_TR) optimizer. @@ -382,7 +380,7 @@ def _calc_proposed_vector_and_pred_cost_gain( proposed_vector_x = self._trust_region_update( update_raw=update_x, radius_container=radius_container, - n_obs=n_obs if gd else None + n_obs=self.n_obs if gd else None ) # here, functions have different number of arguments, thus # must be written out @@ -395,8 +393,7 @@ def _calc_proposed_vector_and_pred_cost_gain( pred_cost_gain_x = self._trust_region_newton_cost_gain( proposed_vector=proposed_vector_x, neg_jac=neg_jac_x, - hessian_fim=fim_x, - n_obs=n_obs + hessian_fim=fim_x ) return proposed_vector_x, pred_cost_gain_x @@ -412,9 +409,9 @@ def _trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, prev_ll=None): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, compute_full_ll=False): - x_batch, log_probs, jac_a, jac_b, fim_a, fim_b, psd, n_obs = inputs + x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs if not (compute_a or compute_b): raise ValueError( "Nothing can be trained. 
Please make sure at least one of train_mu and train_r is set to True.") @@ -437,8 +434,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch else: update_b = self._newton_type_update( lhs=fim_b, - rhs=jac_b, - psd=False + rhs=jac_b ) if not self.trusted_region_mode: @@ -470,8 +466,6 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch self.model.params.assign_sub(update_var) else: - - n_obs = tf.cast(n_obs, dtype=self._dtype) # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu if compute_b: if compute_a: @@ -482,10 +476,10 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch else: radius_container = self.tr_radius tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( - update_b, radius_container, n_obs, self.gd, jac_b, fim_b) + update_b, radius_container, self.gd, jac_b, fim_b) tr_proposed_vector_a, tr_pred_cost_gain_a = self._calc_proposed_vector_and_pred_cost_gain( - update_a, radius_container, n_obs, False, jac_a, fim_a) + update_a, radius_container, False, jac_a, fim_a) tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) @@ -498,7 +492,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch mask=tf.logical_not(self.model.model_vars.converged)) tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( - update_b, radius_container, n_obs, self.gd, jac_b, fim_b) + update_b, radius_container, self.gd, jac_b, fim_b) # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw # and tr_pred_cost_gain @@ -514,7 +508,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch # here train_r is False AND train_mu is true, so the output of the function can directly be applied to # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False tr_update_raw, tr_pred_cost_gain = self._calc_proposed_vector_and_pred_cost_gain( - update_a, radius_container, n_obs, False, jac_a, fim_a) + update_a, radius_container, False, jac_a, fim_a) # perform update tr_update = self._pad_updates( @@ -524,12 +518,12 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) self._trust_region_ops( - x_batch, - log_probs, - tr_update, - tr_pred_cost_gain, - compute_a, - compute_b, - batch_features, - prev_ll + x_batches=x_batches, + likelihood=log_probs, + proposed_vector=tr_update, + proposed_gain=tr_pred_cost_gain, + compute_a=compute_a, + compute_b=compute_b, + batch_features=batch_features, + compute_full_ll=compute_full_ll ) From e0cbb102547a4981f3c80172cd345c5a23c7e1d8 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 5 Feb 2020 20:28:22 +0100 Subject: [PATCH 045/124] bugfixes: ll now summed up over obs + tr update --- batchglm/train/tf2/base_glm/estimator.py | 23 ++++------ batchglm/train/tf2/base_glm/optim.py | 58 +++++++++++++++--------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 5f93fd54..bcff47e2 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -167,7 +167,7 @@ def generate(): compute_a=True, compute_b=False, batch_features=batch_features, - compute_full_ll=is_batched + is_batched=is_batched ) if self._train_scale: 
update_func( @@ -175,13 +175,13 @@ def generate(): compute_a=False, compute_b=True, batch_features=batch_features, - compute_full_ll=is_batched + is_batched=is_batched ) else: update_func( inputs=[x_batches, *results], batch_features=batch_features, - compute_full_ll=is_batched + is_batched=is_batched ) features_updated = self.model.model_vars.updated else: @@ -272,21 +272,14 @@ def generate(): self.model.setMethod('nr_tr') self.model.hessian.compute_b = True - first_batch = True - for x_batch_tuple in dataset: - current_results = self.model(x_batch_tuple) - if first_batch: - results = list(current_results) - first_batch = False - else: - for i, x in enumerate(current_results): - results[i] += x - for i, x_batch_tuple in enumerate(dataset): current_results = self.model(x_batch_tuple) - results = current_results if i == 0 else [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + if i == 0: + results = current_results + else: + results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] - self._log_likelihood = self.loss.norm_log_likelihood(results[0].numpy()) + self._log_likelihood = results[0].numpy() / self.input_data.num_observations self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) # TODO: maybe report fisher inf here. But concatenation only works if !intercept_scale diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 211cc1df..00229d7d 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -36,13 +36,13 @@ def _create_slots(self, var_list): def _trust_region_ops( self, x_batches, - likelihood, + log_probs, proposed_vector, proposed_gain, compute_a, compute_b, batch_features, - compute_full_ll + is_batched ): # Load hyper-parameters: assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ @@ -64,20 +64,25 @@ def _trust_region_ops( # Phase I: Perform a trial update. # Propose parameter update: - if compute_full_ll: + """ + Current likelihood refers to the likelihood that has been calculated in the last model call. + We are always evaluating on the full model, so if we train on the batched model (is_batched), + current likelihood needs to be calculated on the full model using the same model state as + used in the last model call: + """ + current_likelihood = log_probs + if is_batched: for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] - if i == 0: - old_likelihood = log_likelihood - else: - old_likelihood += log_likelihood - else: - old_likelihood = likelihood - old_likelihood = self._norm_neg_log_likelihood(old_likelihood) + current_likelihood = log_likelihood if i == 0 else tf.math.add(current_likelihood, log_likelihood) + + current_likelihood = self._norm_neg_log_likelihood(current_likelihood) + """ + The new likelihood is calculated on the full model now, after updating the parameters using + the proposed vector: + """ self.model.params_copy.assign_sub(proposed_vector) - # Phase II: Evaluate success of trial update and complete update cycle. 
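The hunk above normalizes the per-feature log-likelihood by the number of observations and then judges the proposed trust-region step against the full model. A minimal NumPy sketch of that trial-update cycle, with illustrative names only; eta0, t1, t2 and the upper bound stand in for the corresponding pkg_constants values:

import numpy as np

def norm_neg_log_likelihood(log_probs, n_obs):
    # per-feature negative log-likelihood, normalized by the number of observations
    return -log_probs / n_obs

def trial_update(nll_ref, nll_new, radius, eta0=0.0, t1=0.5, t2=1.5, upper=1e5):
    # accept the proposed step only where it actually lowered the objective,
    # then shrink (t1) or expand (t2) the per-feature trust-region radius
    delta_actual = nll_ref - nll_new
    accept = delta_actual > eta0
    radius = np.where(accept, np.minimum(radius * t2, upper), radius * t1)
    return accept, radius

# toy usage with three features
n_obs = 100.0
ll_before = np.array([-120.0, -80.0, -60.0])   # summed log-likelihoods before the step
ll_after = np.array([-110.0, -85.0, -60.0])    # summed log-likelihoods after the step
accept, radius = trial_update(
    norm_neg_log_likelihood(ll_before, n_obs),
    norm_neg_log_likelihood(ll_after, n_obs),
    np.full(3, 100.0))
# accept -> [ True False False], radius -> [150.  50.  50.]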
- # Include parameter updates only if update improves cost function: for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] if i == 0: @@ -86,13 +91,22 @@ def _trust_region_ops( new_likelihood += log_likelihood new_likelihood = self._norm_neg_log_likelihood(new_likelihood) - delta_f_actual = old_likelihood - new_likelihood + """ + delta_f_actual shows the difference between the log likelihoods before and after the proposed + update of parameters. It is > 0 if the new likelihood is greater than the old. + """ + delta_f_actual = current_likelihood - new_likelihood + """ + If we use feature batching, the individual vector indices need to be spread out to the full + feature space by adding columns corresponding to positions of converged (non calculated) + features. + """ if batch_features: n_features = self.model.model_vars.n_features indices = tf.where(tf.logical_not(self.model.model_vars.converged)) - old_lls = tf.scatter_nd(self.model.model_vars.converged, old_likelihood, shape=tf.constant([n_features])) - delta_f_actual = tf.scatter_nd_update(old_lls, indices, delta_f_actual) + + delta_f_actual = tf.scatter_nd(indices, delta_f_actual, shape=(n_features,)) update_var = tf.transpose(tf.scatter_nd( indices, tf.transpose(proposed_vector), @@ -101,7 +115,7 @@ def _trust_region_ops( gain_var = tf.transpose(tf.scatter_nd( indices, proposed_gain, - shape=([n_features]))) + shape=(n_features,))) else: update_var = proposed_vector gain_var = proposed_gain @@ -288,7 +302,7 @@ def _get_updates(self, lhs, rhs, compute_a, compute_b): return update_raw, update - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, compute_full_ll=False): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jacobians, hessians = inputs if not (compute_a or compute_b): @@ -323,13 +337,13 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch self._trust_region_ops( x_batches=x_batches, - likelihood=log_probs, + log_probs=log_probs, proposed_vector=tr_proposed_vector_pad, proposed_gain=tr_pred_cost_gain, compute_a=compute_a, compute_b=compute_b, batch_features=batch_features, - compute_full_ll=compute_full_ll + is_batched=is_batched ) else: @@ -409,7 +423,7 @@ def _trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, compute_full_ll=False): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs if not (compute_a or compute_b): @@ -519,11 +533,11 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch self._trust_region_ops( x_batches=x_batches, - likelihood=log_probs, + log_probs=log_probs, proposed_vector=tr_update, proposed_gain=tr_pred_cost_gain, compute_a=compute_a, compute_b=compute_b, batch_features=batch_features, - compute_full_ll=compute_full_ll + is_batched=is_batched ) From 81b523a1e09b288b4cbb07066ca58dd8a8d22ffd Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 5 Feb 2020 21:21:40 +0100 Subject: [PATCH 046/124] bugfix: always recalculate ll if only compute_b --- batchglm/train/tf2/base_glm/optim.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py 
b/batchglm/train/tf2/base_glm/optim.py index 00229d7d..b0849a29 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -68,10 +68,13 @@ def _trust_region_ops( Current likelihood refers to the likelihood that has been calculated in the last model call. We are always evaluating on the full model, so if we train on the batched model (is_batched), current likelihood needs to be calculated on the full model using the same model state as - used in the last model call: + used in the last model call. Moreover, if this update is conducted separately for loc + (compute_a) and scale (compute_b), current likelihood always needs to be recalculated when + updating the scale params since the location params changed in the location update before. + This is only true if the location params are updated before the scale params however! """ current_likelihood = log_probs - if is_batched: + if is_batched or compute_b and not compute_a: for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] current_likelihood = log_likelihood if i == 0 else tf.math.add(current_likelihood, log_likelihood) From a3df1b0f8e6b1b0a69ef73c6a4b965af552b3153 Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 6 Feb 2020 13:15:21 +0100 Subject: [PATCH 047/124] bugfix: final ll calc, prefetch, map + last batch --- batchglm/train/tf2/base_glm/estimator.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index bcff47e2..4eff1a34 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -1,5 +1,6 @@ import abc import logging +import time import numpy as np import scipy import tensorflow as tf @@ -7,7 +8,7 @@ from .external import TFEstimator, _EstimatorGLM from .optim import NR, IRLS from .external import pkg_constants -import time + logger = logging.getLogger("batchglm") @@ -95,7 +96,7 @@ def generate(): fetch_size_factors = self.input_data.size_factors is not None and self.noise_model in ["nb", "norm"] if full_model: - max_obs = n_obs - (n_obs % batch_size) + max_obs = n_obs # - (n_obs % batch_size) obs_pool = np.arange(max_obs) else: max_obs = n_obs @@ -114,9 +115,11 @@ def generate(): dtp = self.dtype output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 - dataset = tf.data.Dataset.from_generator(generator=generate, output_types=output_types) + dataset = tf.data.Dataset.from_generator(generator=generate, output_types=output_types).prefetch(1) if sparse: - dataset = dataset.map(lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf)) + dataset = dataset.map( + lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) + ).cache() # Set all to convergence status = False, this is needed if multiple @@ -135,7 +138,7 @@ def generate(): batch_features = False train_step = 0 - num_batches = n_obs // batch_size + num_batches = (n_obs + batch_size - 1) // batch_size # integer ceil division ceil(a/b)=(a+b-1)//b while convergence_decision(converged_current, train_step): @@ -279,7 +282,7 @@ def generate(): else: results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] - self._log_likelihood = results[0].numpy() / self.input_data.num_observations + self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) # TODO: 
maybe report fisher inf here. But concatenation only works if !intercept_scale @@ -325,7 +328,6 @@ def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f total_converged |= epoch_ll_converged - """ Now getting convergence based on change in gradient below threshold: """ From 40543a395e37ce27e4ec2b995d1e2673c91d2f2a Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 6 Feb 2020 14:50:25 +0100 Subject: [PATCH 048/124] style changes --- batchglm/train/tf2/base_glm/estimator.py | 109 +++++++++++------------ 1 file changed, 53 insertions(+), 56 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4eff1a34..ca19a21b 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -9,9 +9,9 @@ from .optim import NR, IRLS from .external import pkg_constants - logger = logging.getLogger("batchglm") + class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): """ Estimator for Generalized Linear Models (GLMs). @@ -31,18 +31,18 @@ def initialize(self, **kwargs): def finalize(self, **kwargs): """ - Evaluate all tensors that need to be exported from session and save these as class attributes - and close session. - + Evaluate all tensors that need to be exported from session, + save these as class attributes and close session. Changes .model entry from tf-based EstimatorGraph to numpy based Model instance and transfers relevant attributes. """ - - a_var, b_var = self.model.unpack_params([self.model.params, self.model.model_vars.a_var.get_shape()[0]]) + a_var, b_var = self.model.unpack_params( + [self.model.params, self.model.model_vars.a_var.get_shape()[0]]) self.model = self.get_model_container(self.input_data) self.model._a_var = a_var.numpy() self.model._b_var = b_var.numpy() - self._loss = tf.reduce_sum(np.negative(self._log_likelihood) / self.input_data.num_observations).numpy() + self._loss = tf.reduce_sum( + tf.negative(self._log_likelihood) / self.input_data.num_observations).numpy() def __init__( self, @@ -68,7 +68,8 @@ def _train( conv_all = lambda x, y: not np.all(x) conv_step = lambda x, y: not np.all(x) and y < stopping_criteria - assert convergence_criteria in ["step", "all_converged"], ("Unrecognized convergence criteria %s", convergence_criteria) + assert convergence_criteria in ["step", "all_converged"], \ + ("Unrecognized convergence criteria %s", convergence_criteria) convergence_decision = conv_step if convergence_criteria == "step" else conv_all n_obs = self.input_data.num_observations @@ -76,52 +77,44 @@ def _train( if batch_size > n_obs: batch_size = n_obs if not self._initialized: - raise RuntimeError("Cannot train the model: \ - Estimator not initialized. Did you forget to call estimator.initialize() ?") + raise RuntimeError("Cannot train the model: Estimator not initialized. \ + Did you forget to call estimator.initialize() ?") if autograd and optim_algo.lower() in ['nr', 'nr_tr']: - logger.warning("Automatic differentiation is currently not supported for hessians. \ - Falling back to closed form. Only Jacobians are calculated using autograd.") + logger.warning( + "Automatic differentiation is currently not supported for hessians. Falling back \ + to closed form. 
Only Jacobians are calculated using autograd.") self.noise_model = noise_model sparse = isinstance(self.input_data.x, scipy.sparse.csr_matrix) full_model = not is_batched def generate(): - """ - Generator for the full model. - We use max_obs to cut the observations with max_obs % batch_size = 0 to ensure consistent - sizes of tensors. - """ - fetch_size_factors = self.input_data.size_factors is not None and self.noise_model in ["nb", "norm"] - - if full_model: - max_obs = n_obs # - (n_obs % batch_size) - obs_pool = np.arange(max_obs) - else: - max_obs = n_obs - obs_pool = np.random.permutation(n_obs) - for x in range(0, max_obs, batch_size): - idx = obs_pool[x: x + batch_size] # numpy automatically returns only id:id+n_obs if out of range + fetch_size_factors = self.input_data.size_factors is not None \ + and self.noise_model in ["nb", "norm"] + obs_pool = np.arange(n_obs) if full_model else np.random.permutation(n_obs) + + for start_id in range(0, n_obs, batch_size): + # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: + idx = obs_pool[start_id: start_id + batch_size] - x = self.input_data.fetch_x_sparse(idx) if sparse else self.input_data.fetch_x_dense(idx) + counts = self.input_data.fetch_x_sparse(idx) if sparse \ + else self.input_data.fetch_x_dense(idx) dloc = self.input_data.fetch_design_loc(idx) dscale = self.input_data.fetch_design_scale(idx) size_factors = self.input_data.fetch_size_factors(idx) if fetch_size_factors else 1 - yield x, dloc, dscale, size_factors - return + yield counts, dloc, dscale, size_factors dtp = self.dtype output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 - dataset = tf.data.Dataset.from_generator(generator=generate, output_types=output_types).prefetch(1) + dataset = tf.data.Dataset.from_generator( + generator=generate, output_types=output_types).prefetch(1) if sparse: dataset = dataset.map( lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) ).cache() - - # Set all to convergence status = False, this is needed if multiple # training strategies are run: converged_current = np.zeros(n_features, dtype=np.bool) @@ -132,13 +125,16 @@ def generate(): irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] - update_func = optimizer_object.perform_parameter_update if irls_algo or nr_algo else optimizer_object.apply_gradients + update_func = optimizer_object.perform_parameter_update \ + if irls_algo or nr_algo else optimizer_object.apply_gradients prev_params = self.model.params.numpy() batch_features = False train_step = 0 - num_batches = (n_obs + batch_size - 1) // batch_size # integer ceil division ceil(a/b)=(a+b-1)//b + # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b + # We need this for cases where n_obs mod batch_size != 0 + num_batches = (n_obs + batch_size - 1) // batch_size while convergence_decision(converged_current, train_step): @@ -205,7 +201,7 @@ def generate(): # Update converged status converged_prev = converged_current.copy() - ll_current = -results[0].numpy() / self.input_data.num_observations + ll_current = -results[0].numpy() / n_obs if batch_features: indices = tf.where(not_converged) @@ -242,25 +238,24 @@ def generate(): ) prev_params = self.model.params.numpy() - #converged_current, converged_f, converged_g, converged_x = convergences + # converged_current, converged_f, converged_g, converged_x = convergences converged_current = convergences[0] 
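Several quantities in this loop (log-likelihoods, gradients, parameter updates) are computed only for the features that have not converged yet and are then spread back into full-length arrays before bookkeeping. A small self-contained illustration of that scatter pattern, with toy shapes and arbitrary values:

import numpy as np
import tensorflow as tf

n_features = 6
not_converged = np.array([True, False, True, True, False, True])
# one value per not-yet-converged feature, e.g. a per-feature log-likelihood:
reduced_values = tf.constant([1.0, 2.0, 3.0, 4.0])

indices = tf.where(not_converged)          # shape (4, 1): positions of the active features
full_values = tf.scatter_nd(indices, reduced_values, shape=(n_features,))
print(full_values.numpy())                 # [1. 0. 2. 3. 0. 4.]
# converged positions receive a zero placeholder, which the caller then replaces
# with the previous values via np.where / tf.where, as done above for ll_current.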
self.model.model_vars.convergence_update(converged_current, features_updated) num_converged = np.sum(converged_current) - if num_converged != np.sum(converged_prev): + loss = np.sum(ll_current) + num_updated = np.sum(features_updated) + log_output = f"Step: {train_step} loss: {loss}, "\ + f"converged {num_converged}, updated {num_updated}" + if num_converged == np.sum(converged_prev): + logger.warning(log_output) + else: if featurewise and not batch_features: batch_features = True self.model.batch_features = batch_features - logger_pattern = "Step: %i loss: %f, converged %i, updated %i, (logs: %i, grad: %i, x_step: %i)" - logger.warning( - logger_pattern, - train_step, - np.sum(ll_current), - num_converged.astype("int32"), - np.sum(features_updated).astype("int32"), - *[np.sum(convergence_vals) for convergence_vals in convergences[1:]] - ) - else: - logger.warning('step %i: loss: %f converged %i, updated %i', train_step, np.sum(ll_current), num_converged.astype("int32"), np.sum(features_updated).astype("int32")) + sums = [np.sum(convergence_vals) for convergence_vals in convergences[1:]] + log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ + f"x_step: {sums[2]}" + logger.warning(log_output) train_step += 1 if benchmark: t1_epoch = time.time() @@ -283,9 +278,9 @@ def generate(): results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] self._log_likelihood = results[0].numpy() - self._jacobian = tf.reduce_sum(tf.abs(results[1] / self.input_data.num_observations), axis=1) + self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) - # TODO: maybe report fisher inf here. But concatenation only works if !intercept_scale + # TODO: maybe report fisher inf here in the future. self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._hessian = -results[2].numpy() @@ -356,10 +351,12 @@ def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params For now it must not be below the threshold for the X step of the loc model. """ - if hasattr(optimizer_object, 'trusted_region_mode') and optimizer_object.trusted_region_mode: + if hasattr(optimizer_object, 'trusted_region_mode') \ + and optimizer_object.trusted_region_mode: converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC if hasattr(optimizer_object, 'tr_radius_b') and self._train_scale: - converged_tr &= optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + converged_tr &= \ + optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE epoch_tr_converged = not_converged_prev & converged_tr epoch_step_converged |= epoch_tr_converged @@ -439,15 +436,15 @@ def calc_x_step(self, prev_params, features_updated): def get_norm_converged(model: str, prev_params): if model == 'loc': idx_train = self.model.model_vars.idx_train_loc - XTOL = pkg_constants.XTOL_BY_FEATURE_LOC + xtol = pkg_constants.XTOL_BY_FEATURE_LOC elif model == 'scale': idx_train = self.model.model_vars.idx_train_scale - XTOL = pkg_constants.XTOL_BY_FEATURE_SCALE + xtol = pkg_constants.XTOL_BY_FEATURE_SCALE else: assert False, "Supply either 'loc' or 'scale'!" x_step = self.model.params.numpy() - prev_params x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) - return x_norm < XTOL + return x_norm < xtol """ We use a trick here: First we set both the loc and scale convergence to True. 
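The calc_x_step helper above declares a feature converged in parameter space once the norm of its coefficient change, restricted to the trained rows, drops below a model-specific tolerance. A hedged NumPy sketch of that criterion with toy values and an illustrative tolerance:

import numpy as np

def x_step_converged(curr_params, prev_params, idx_train, xtol):
    # norm of the per-feature coefficient change over the trained parameter rows
    step = curr_params - prev_params                            # (n_params, n_features)
    x_norm = np.sqrt(np.sum(np.square(step[idx_train, :]), axis=0))
    return x_norm < xtol

prev = np.zeros((4, 3))
curr = prev + np.array([1e-10, 1e-3, 1e-6])                     # broadcast over the 4 rows
print(x_step_converged(curr, prev, idx_train=[0, 1], xtol=1e-8))
# -> [ True False False]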
From 2308e090ba8a4d3ede0f02f072a14bcc630af29b Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 10 Feb 2020 12:08:01 +0100 Subject: [PATCH 049/124] added scale specific constants --- batchglm/pkg_constants.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index a0c4ed2e..d73932d4 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -11,6 +11,7 @@ # Trust region hyper parameters: TRUST_REGION_RADIUS_INIT = 100. +TRUST_REGION_RADIUS_INIT_SCALE = 1. TRUST_REGION_ETA0 = 0. TRUST_REGION_ETA1 = 0.25 TRUST_REGION_ETA2 = 0.25 @@ -18,10 +19,11 @@ TRUST_REGION_T2 = 1.5 # Allow expansion if not shrinking. TRUST_REGION_UPPER_BOUND = 1e5 -TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 1 +TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 0.8 +TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.2 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-10 +LLTOL_BY_FEATURE = 1e-16 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 From 7096fa68dd1a7e94563aa9451c02af6a63c9a9e1 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 10 Feb 2020 12:09:00 +0100 Subject: [PATCH 050/124] added ll by step container for benchmarking --- batchglm/train/tf2/base_glm/estimator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index ca19a21b..c41f1e1f 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -26,6 +26,7 @@ def initialize(self, **kwargs): self.values = [] self.times = [] self.converged = [] + self.lls = [] self._initialized = True self.model = None @@ -207,7 +208,8 @@ def generate(): indices = tf.where(not_converged) updated_lls = tf.scatter_nd(indices, ll_current, shape=[n_features]) ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) - + if benchmark: + self.lls.append(ll_current) if is_batched: jac_normalization = batch_size else: From 43d6ab74c4cec4375a5edf439b9dc888b4cbb0b5 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 10 Feb 2020 12:09:50 +0100 Subject: [PATCH 051/124] rm T1 and T2 + proposed gain checks for TR update --- batchglm/train/tf2/base_glm/optim.py | 32 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index b0849a29..d7333f75 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -45,21 +45,22 @@ def _trust_region_ops( is_batched ): # Load hyper-parameters: - assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ - "eta0 must be smaller than eta1" - assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ - "eta1 must be smaller than or equal to eta2" - assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" - assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" + #assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ + # "eta0 must be smaller than eta1" + #assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ + # "eta1 must be smaller than or equal to eta2" + #assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" + #assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" # Set trust region hyper-parameters eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) eta1 = 
tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) if self.gd and compute_b: t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) else: t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=self._dtype) - t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) # Phase I: Perform a trial update. @@ -122,7 +123,7 @@ def _trust_region_ops( else: update_var = proposed_vector gain_var = proposed_gain - delta_f_ratio = tf.divide(delta_f_actual, gain_var) + #delta_f_ratio = tf.divide(delta_f_actual, gain_var) # Compute parameter updates.g update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model.model_vars.converged)) @@ -147,9 +148,15 @@ def _trust_region_ops( tf.multiply(params, update_theta_numeric) # new values ) self.model.params.assign(theta_new_tr) - self.model.model_vars.updated = update_theta.numpy() + if compute_b and not compute_a: + self.model.model_vars.updated &= update_theta.numpy() + else: + self.model.model_vars.updated = update_theta.numpy() # Update trusted region accordingly: + decrease_radius = delta_f_actual <= eta0 + increase_radius = delta_f_actual > eta0 + """ decrease_radius = tf.logical_or( delta_f_actual <= eta0, tf.logical_and(delta_f_ratio <= eta1, tf.logical_not(self.model.model_vars.converged)) @@ -158,6 +165,7 @@ def _trust_region_ops( delta_f_actual > eta0, tf.logical_and(delta_f_ratio > eta2, tf.logical_not(self.model.model_vars.converged)) ) + """ keep_radius = tf.logical_and(tf.logical_not(decrease_radius), tf.logical_not(increase_radius)) radius_update = tf.add_n([ @@ -166,7 +174,7 @@ def _trust_region_ops( tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) ]) - if self.gd and compute_b and not compute_a: + if compute_b and not compute_a: tr_radius = self.tr_radius_b else: tr_radius = self.tr_radius @@ -191,7 +199,7 @@ def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf. 
dtype=self._dtype, trainable=False) if self.gd: self.tr_radius_b = tf.Variable( - np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, dtype=self._dtype, trainable=False) else: self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) @@ -261,7 +269,7 @@ def _trust_region_update( update_norm = tf.multiply(update_raw, update_magnitude_inv) # the following switch is for irls_gd_tr (linear instead of newton) if n_obs is not None: - update_magnitude /= n_obs + update_magnitude = update_magnitude / n_obs * radius_container update_scale = tf.minimum( radius_container, update_magnitude From 539bb5bb22c30816c8d6aa90d46ee2e0c05cdb26 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 11 Feb 2020 18:06:53 +0100 Subject: [PATCH 052/124] integrated featurewise batching in dataset gen --- batchglm/train/tf2/base_glm/estimator.py | 106 ++++++++++++----------- 1 file changed, 55 insertions(+), 51 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index c41f1e1f..cb4eef07 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -91,11 +91,9 @@ def _train( full_model = not is_batched def generate(): - fetch_size_factors = self.input_data.size_factors is not None \ and self.noise_model in ["nb", "norm"] obs_pool = np.arange(n_obs) if full_model else np.random.permutation(n_obs) - for start_id in range(0, n_obs, batch_size): # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: idx = obs_pool[start_id: start_id + batch_size] @@ -105,17 +103,20 @@ def generate(): dloc = self.input_data.fetch_design_loc(idx) dscale = self.input_data.fetch_design_scale(idx) size_factors = self.input_data.fetch_size_factors(idx) if fetch_size_factors else 1 - yield counts, dloc, dscale, size_factors dtp = self.dtype output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 + # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b + # We need this for cases where n_obs mod batch_size != 0 + num_batches = (n_obs + batch_size - 1) // batch_size dataset = tf.data.Dataset.from_generator( - generator=generate, output_types=output_types).prefetch(1) + generator=generate, output_types=output_types) if sparse: dataset = dataset.map( lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) - ).cache() + ) + batch_features = False # Set all to convergence status = False, this is needed if multiple # training strategies are run: converged_current = np.zeros(n_features, dtype=np.bool) @@ -131,29 +132,43 @@ def generate(): prev_params = self.model.params.numpy() - batch_features = False train_step = 0 - # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b - # We need this for cases where n_obs mod batch_size != 0 - num_batches = (n_obs + batch_size - 1) // batch_size - while convergence_decision(converged_current, train_step): + not_converged = ~ self.model.model_vars.converged + + def featurewise_batch(x_tensor, dloc, dscale, size_factors): + if not batch_features: + return x_tensor, dloc, dscale, size_factors + if isinstance(x_tensor, tf.SparseTensor): + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.model.model_vars.n_features, + axis=1) + not_converged_idx = np.where(not_converged)[0] + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, 
sp_inputs=feature_columns) + + else: + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + return x_tensor, dloc, dscale, size_factors + + def new_epoch_set(): + return dataset.take(num_batches).map(featurewise_batch).cache().prefetch(1) + epoch_set = new_epoch_set() + num_converged = 0 + num_converged_prev = 0 + need_new_epoch_set = False + while convergence_decision(converged_current, train_step): if benchmark: t0_epoch = time.time() - not_converged = ~ self.model.model_vars.converged ll_prev = ll_current.copy() results = None - x_batches = [] - for x_batch_tuple in dataset: - if batch_features: - x_batches.append(self.getModelInput(x_batch_tuple, not_converged)) - else: - x_batches.append(x_batch_tuple) - - for i, x_batch in enumerate(x_batches): - current_results = self.model(x_batch) + if need_new_epoch_set: + epoch_set = new_epoch_set() + for i, x_batch in enumerate(epoch_set): + current_results = self.model(x_batch, keep_previous_params_copy=not need_new_epoch_set) if is_batched or i == 0: results = current_results else: @@ -162,8 +177,9 @@ def generate(): if is_batched or i == num_batches - 1: if irls_algo or nr_algo: if irls_algo: + batches = x_batch if is_batched else epoch_set update_func( - inputs=[x_batches, *results], + inputs=[batches, *results], compute_a=True, compute_b=False, batch_features=batch_features, @@ -171,7 +187,7 @@ def generate(): ) if self._train_scale: update_func( - inputs=[x_batches, *results], + inputs=[batches, *results], compute_a=False, compute_b=True, batch_features=batch_features, @@ -179,7 +195,7 @@ def generate(): ) else: update_func( - inputs=[x_batches, *results], + inputs=[batches, *results], batch_features=batch_features, is_batched=is_batched ) @@ -248,12 +264,19 @@ def generate(): num_updated = np.sum(features_updated) log_output = f"Step: {train_step} loss: {loss}, "\ f"converged {num_converged}, updated {num_updated}" - if num_converged == np.sum(converged_prev): + num_converged_prev = np.sum(converged_prev) + need_new_epoch_set = False + if num_converged == num_converged_prev: logger.warning(log_output) else: - if featurewise and not batch_features: - batch_features = True - self.model.batch_features = batch_features + if featurewise: + if not batch_features: + batch_features = True + self.model.batch_features = batch_features + + if num_converged - num_converged_prev >= pkg_constants.FEATUREWISE_THRESHOLD: + need_new_epoch_set = True + not_converged = ~self.model.model_vars.converged sums = [np.sum(convergence_vals) for convergence_vals in convergences[1:]] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ f"x_step: {sums[2]}" @@ -267,13 +290,13 @@ def generate(): # Evaluate final params logger.warning("Final Evaluation run.") self.model.batch_features = False - + batch_features = False # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv self.model.setMethod('nr_tr') self.model.hessian.compute_b = True - - for i, x_batch_tuple in enumerate(dataset): - current_results = self.model(x_batch_tuple) + final_set = new_epoch_set() + for i, x_batch_tuple in enumerate(final_set): + current_results = self.model(x_batch_tuple, keep_previous_params_copy=False) if i == 0: results = current_results else: @@ -288,26 +311,7 @@ def generate(): self.model.hessian.compute_b = self.model.compute_b self.model.batch_features = batch_features - - def getModelInput(self, x_batch_tuple: tuple, not_converged): - """ - Returns a smaller x_batch tuple reduced in feature space. 
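Both the featurewise_batch map added above and the removed getModelInput helper reduce the count matrix to the columns of features that are still being trained. Dense tensors can use tf.boolean_mask directly, while sparse tensors need a split/concat workaround. A self-contained sketch of that column selection; the helper name is illustrative, not library API:

import numpy as np
import tensorflow as tf

def keep_unconverged_columns(x_tensor, not_converged, n_features):
    # select the feature columns that still need training
    if isinstance(x_tensor, tf.sparse.SparseTensor):
        # tf.boolean_mask does not support SparseTensor: split per column, keep, re-concat
        columns = tf.sparse.split(x_tensor, num_split=n_features, axis=1)
        kept = [columns[i] for i in np.where(not_converged)[0]]
        return tf.sparse.concat(axis=1, sp_inputs=kept)
    return tf.boolean_mask(x_tensor, mask=not_converged, axis=1)

x_dense = tf.constant(np.arange(12.0).reshape(4, 3))
mask = np.array([True, False, True])
print(keep_unconverged_columns(x_dense, mask, n_features=3).shape)   # (4, 2)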
- """ - x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor = x_batch_tuple - if isinstance(self.input_data.x, scipy.sparse.csr_matrix): - not_converged_idx = np.where(not_converged)[0] - feature_columns = tf.sparse.split( - x_tensor, - num_split=self.model.model_vars.n_features, - axis=1) - feature_columns = [feature_columns[i] for i in not_converged_idx] - x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) - - else: - x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) - x_batch = (x_tensor, design_loc_tensor, design_scale_tensor, size_factors_tensor) - - return x_batch + batch_features = True def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params, jac_normalization, grad_numpy, features_updated, optimizer_object): From 8ffcbd4571b1021341cfe1b3d53ab7b437c756e4 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 11 Feb 2020 18:08:55 +0100 Subject: [PATCH 053/124] forward keep_previous_params_copy everywhere --- batchglm/train/tf2/base_glm/model.py | 32 +++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 73c68658..01e648df 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -50,7 +50,7 @@ def __init__( self.hessian = hessian self.fim = fim self.use_gradient_tape = use_gradient_tape - self.params_copy = None + self.params_copy = self.params self.batch_features = False self.setMethod(optimizer) @@ -67,7 +67,7 @@ def setMethod(self, optimizer): elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr']: self._calc = self._calc_fim - def _call_parameters(self, inputs, keep_previous_params_copy=False): + def _call_parameters(self, inputs, keep_previous_params_copy=True): if not keep_previous_params_copy: if self.batch_features: self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, @@ -75,6 +75,7 @@ def _call_parameters(self, inputs, keep_previous_params_copy=False): axis=1), trainable=True) else: self.params_copy = self.params + design_loc, design_scale, size_factors = inputs a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) eta_loc = self.linear_loc([a_var, design_loc, self.model_vars.constraints_loc, size_factors]) @@ -89,10 +90,10 @@ def calc_ll(self, inputs, keep_previous_params_copy=False): log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) - def _return_jacobians(self, inputs): - return self._calc_jacobians(inputs)[-2:] + def _return_jacobians(self, inputs, keep_previous_params_copy=True): + return self._calc_jacobians(inputs, keep_previous_params_copy)[-2:] - def _calc_jacobians(self, inputs, concat=True, transpose=True): + def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_params_copy=True): """ calculates jacobian. 
@@ -108,7 +109,7 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True): """ with tf.GradientTape(persistent=True) as g: - log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs) + log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs, keep_previous_params_copy) if self.use_gradient_tape: @@ -153,9 +154,12 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True): return loc, scale, log_probs, tf.negative(jacobians) return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) - def _calc_hessians(self, inputs): + def _calc_hessians(self, inputs, keep_previous_params_copy=True): # with tf.GradientTape(persistent=True) as g2: - loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, transpose=False) + loc, scale, log_probs, jacobians = self._calc_jacobians( + inputs, + keep_previous_params_copy=keep_previous_params_copy, + transpose=False) ''' autograd not yet working. TODO: Search error in the following code: @@ -186,13 +190,17 @@ def _calc_hessians(self, inputs): hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) return log_probs, jacobians, hessians - def _calc_fim(self, inputs): - loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians(inputs, concat=False, transpose=False) + def _calc_fim(self, inputs, keep_previous_params_copy=True): + loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians( + inputs, + concat=False, + transpose=False, + keep_previous_params_copy=keep_previous_params_copy) fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) return log_probs, jac_a, jac_b, fim_a, fim_b - def call(self, inputs, training=False, mask=None): - return self._calc(inputs) + def call(self, inputs, keep_previous_params_copy=True): + return self._calc(inputs, keep_previous_params_copy) class LossGLM(LossBase): From 3dc6eea446f494353727335be303e81d77146fe9 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 11 Feb 2020 18:10:40 +0100 Subject: [PATCH 054/124] rm keep_previous_params_copy form model call --- batchglm/train/tf2/base_glm/optim.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index d7333f75..3fc5102d 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -77,7 +77,7 @@ def _trust_region_ops( current_likelihood = log_probs if is_batched or compute_b and not compute_a: for i, x_batch in enumerate(x_batches): - log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] + log_likelihood = self.model.calc_ll([*x_batch])[0] current_likelihood = log_likelihood if i == 0 else tf.math.add(current_likelihood, log_likelihood) current_likelihood = self._norm_neg_log_likelihood(current_likelihood) @@ -88,7 +88,7 @@ def _trust_region_ops( """ self.model.params_copy.assign_sub(proposed_vector) for i, x_batch in enumerate(x_batches): - log_likelihood = self.model.calc_ll([*x_batch], keep_previous_params_copy=True)[0] + log_likelihood = self.model.calc_ll([*x_batch])[0] if i == 0: new_likelihood = log_likelihood else: @@ -100,7 +100,6 @@ def _trust_region_ops( update of parameters. It is > 0 if the new likelihood is greater than the old. 
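The comment block above, together with the earlier fix that always recalculates the likelihood when only compute_b is set, comes down to one rule: once the loc parameters have been updated, the reference value used to judge the scale half-step must be recomputed. A runnable toy sketch of that two-block update pattern; a simple quadratic stands in for the negative log-likelihood:

def nll(loc, scale):
    # toy stand-in for the full-model negative log-likelihood
    return (loc - 2.0) ** 2 + (scale - 0.5) ** 2

loc, scale, lr = 0.0, 0.0, 0.4

ref = nll(loc, scale)                        # reference before the loc half-step
loc_new = loc - lr * 2.0 * (loc - 2.0)       # gradient step on loc only
if nll(loc_new, scale) < ref:                # accept only if the objective improved
    loc = loc_new

ref = nll(loc, scale)                        # refresh: loc changed, the old reference is stale
scale_new = scale - lr * 2.0 * (scale - 0.5)
if nll(loc, scale_new) < ref:
    scale = scale_new

print(loc, scale)                            # approximately 1.6 0.4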
""" delta_f_actual = current_likelihood - new_likelihood - """ If we use feature batching, the individual vector indices need to be spread out to the full feature space by adding columns corresponding to positions of converged (non calculated) @@ -449,7 +448,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch update_a = self._newton_type_update( lhs=fim_a, rhs=jac_a, - psd=True + psd=False ) if compute_b: From ed47e933448d27e83e3462255722f2614404bb8c Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 11 Feb 2020 18:11:24 +0100 Subject: [PATCH 055/124] FEATUREWISE_THRESHOLD for featurewise step control --- batchglm/pkg_constants.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index d73932d4..d3147b25 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -17,13 +17,13 @@ TRUST_REGION_ETA2 = 0.25 TRUST_REGION_T1 = 0.5 # Fast collapse to avoid trailing. TRUST_REGION_T2 = 1.5 # Allow expansion if not shrinking. -TRUST_REGION_UPPER_BOUND = 1e5 +TRUST_REGION_UPPER_BOUND = 1e40 -TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 0.8 -TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.2 +TRUST_REGIONT_T1_IRLS_GD_TR_SCALE = 0.5 +TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-16 +LLTOL_BY_FEATURE = 1e-12 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 @@ -32,6 +32,8 @@ TRTOL_BY_FEATURE_LOC = 1e-12 TRTOL_BY_FEATURE_SCALE = 1e-12 +FEATUREWISE_THRESHOLD = 1 # the minimal number of features to converge before next featurewise batch + try: import tensorflow as tf From 3d734916a71dc979623b32796afdddf81bb45ae4 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 11 Feb 2020 20:22:35 +0100 Subject: [PATCH 056/124] bugfix: wrong default parameter in calc_ll --- batchglm/train/tf2/base_glm/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 01e648df..48a0d516 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -84,7 +84,7 @@ def _call_parameters(self, inputs, keep_previous_params_copy=True): scale = self.linker_scale(eta_scale) return eta_loc, eta_scale, loc, scale, a_var, b_var - def calc_ll(self, inputs, keep_previous_params_copy=False): + def calc_ll(self, inputs, keep_previous_params_copy=True): parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) log_probs = tf.reduce_sum(log_probs, axis=0) From fba55c987aacf27d4efd394ac0b3ac97a32ea987 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 12 Feb 2020 11:33:06 +0100 Subject: [PATCH 057/124] bugfix: calc of conv diff to last features batch --- batchglm/train/tf2/base_glm/estimator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index cb4eef07..a8ee2214 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -159,6 +159,7 @@ def new_epoch_set(): num_converged = 0 num_converged_prev = 0 need_new_epoch_set = False + n_conv_last_featurewise_batch = 0 while convergence_decision(converged_current, train_step): if benchmark: t0_epoch = time.time() @@ -273,9 +274,10 @@ def new_epoch_set(): if not batch_features: batch_features = True self.model.batch_features = 
batch_features - - if num_converged - num_converged_prev >= pkg_constants.FEATUREWISE_THRESHOLD: + conv_diff = num_converged - n_conv_last_featurewise_batch + if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: need_new_epoch_set = True + n_conv_last_featurewise_batch = num_converged not_converged = ~self.model.model_vars.converged sums = [np.sum(convergence_vals) for convergence_vals in convergences[1:]] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ From 2cc1e8bc72d7d0f8bc144786e2fb64f80b988ee7 Mon Sep 17 00:00:00 2001 From: picciama Date: Sat, 7 Mar 2020 15:35:23 +0100 Subject: [PATCH 058/124] bugfix: loss increases when featurewise --- batchglm/train/tf2/base_glm/estimator.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index a8ee2214..bf9c63fc 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -222,9 +222,10 @@ def new_epoch_set(): ll_current = -results[0].numpy() / n_obs if batch_features: - indices = tf.where(not_converged) - updated_lls = tf.scatter_nd(indices, ll_current, shape=[n_features]) - ll_current = np.where(features_updated, updated_lls.numpy(), ll_prev) + indices = np.where(not_converged)[0] + updated_lls = tf.scatter_nd( + np.expand_dims(indices, 1), ll_current, shape=[n_features]) + ll_current = np.where(not_converged, updated_lls.numpy(), ll_prev) if benchmark: self.lls.append(ll_current) if is_batched: From b925504a5f0968d4597c927dfdace952e4869432 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 9 Mar 2020 17:49:50 +0100 Subject: [PATCH 059/124] workaround for numeric instabilities --- batchglm/train/tf2/base_glm/optim.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 3fc5102d..fe2927a6 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -99,7 +99,8 @@ def _trust_region_ops( delta_f_actual shows the difference between the log likelihoods before and after the proposed update of parameters. It is > 0 if the new likelihood is greater than the old. 
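The conv_diff check introduced above rebuilds the feature-reduced dataset only after enough additional features have converged since the last rebuild, so the dataset-construction cost is not paid on every step. A small stateful sketch of that gating logic; FEATUREWISE_THRESHOLD stands in for the pkg_constants value:

import numpy as np

FEATUREWISE_THRESHOLD = 1   # stand-in for pkg_constants.FEATUREWISE_THRESHOLD

class FeaturewiseGate:
    """Decides when the reduced dataset should be rebuilt (illustrative sketch)."""

    def __init__(self):
        self.n_conv_last_rebuild = 0

    def need_rebuild(self, converged_mask):
        num_converged = int(np.sum(converged_mask))
        if num_converged - self.n_conv_last_rebuild >= FEATUREWISE_THRESHOLD:
            self.n_conv_last_rebuild = num_converged
            return True
        return False

gate = FeaturewiseGate()
print(gate.need_rebuild(np.array([True, False, False])))   # True: one new feature converged
print(gate.need_rebuild(np.array([True, False, False])))   # False: nothing changed since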
""" - delta_f_actual = current_likelihood - new_likelihood + delta_f_actual = tf.math.subtract(current_likelihood, new_likelihood) + """ If we use feature batching, the individual vector indices need to be spread out to the full feature space by adding columns corresponding to positions of converged (non calculated) @@ -148,7 +149,7 @@ def _trust_region_ops( ) self.model.params.assign(theta_new_tr) if compute_b and not compute_a: - self.model.model_vars.updated &= update_theta.numpy() + self.model.model_vars.updated |= update_theta.numpy() else: self.model.model_vars.updated = update_theta.numpy() @@ -268,7 +269,7 @@ def _trust_region_update( update_norm = tf.multiply(update_raw, update_magnitude_inv) # the following switch is for irls_gd_tr (linear instead of newton) if n_obs is not None: - update_magnitude = update_magnitude / n_obs * radius_container + update_magnitude = update_magnitude / n_obs #* radius_container update_scale = tf.minimum( radius_container, update_magnitude From 128e04ca8211dac7a6005a4de5de1a7cf7f02e9b Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 9 Mar 2020 17:51:24 +0100 Subject: [PATCH 060/124] bugfix: always keeps param_copy if using ADAM --- batchglm/train/tf2/base_glm/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 48a0d516..09559529 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -91,7 +91,7 @@ def calc_ll(self, inputs, keep_previous_params_copy=True): return (log_probs, *parameters[2:]) def _return_jacobians(self, inputs, keep_previous_params_copy=True): - return self._calc_jacobians(inputs, keep_previous_params_copy)[-2:] + return self._calc_jacobians(inputs, True, keep_previous_params_copy)[-2:] def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_params_copy=True): """ From 475c93ceb91e347998cd12b2e8a297253d9d622c Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 9 Mar 2020 19:13:04 +0100 Subject: [PATCH 061/124] bugfix: ADAM still keeping params copy --- batchglm/train/tf2/base_glm/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 09559529..e0a8a64b 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -91,7 +91,7 @@ def calc_ll(self, inputs, keep_previous_params_copy=True): return (log_probs, *parameters[2:]) def _return_jacobians(self, inputs, keep_previous_params_copy=True): - return self._calc_jacobians(inputs, True, keep_previous_params_copy)[-2:] + return self._calc_jacobians(inputs, keep_previous_params_copy=keep_previous_params_copy)[-2:] def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_params_copy=True): """ From e1fa6ed80cdae98e0f9ae4d099f5ba8d3d9a6346 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 9 Mar 2020 21:07:12 +0100 Subject: [PATCH 062/124] workaround: boolean mask in each run (issue #21) --- batchglm/train/tf2/base_glm/model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index e0a8a64b..6e12f952 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -68,13 +68,13 @@ def setMethod(self, optimizer): self._calc = self._calc_fim def _call_parameters(self, inputs, keep_previous_params_copy=True): - if not 
keep_previous_params_copy: - if self.batch_features: - self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, - mask=tf.logical_not(self.model_vars.converged), - axis=1), trainable=True) - else: - self.params_copy = self.params + #if not keep_previous_params_copy: + if self.batch_features: + self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, + mask=tf.logical_not(self.model_vars.converged), + axis=1), trainable=True) + else: + self.params_copy = self.params design_loc, design_scale, size_factors = inputs a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) From 0d65ab01aca726ab6d71859d6bb9466feb5b3d38 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 11 Mar 2020 08:59:58 +0100 Subject: [PATCH 063/124] bugfix: wrong parameter updates --- batchglm/train/tf2/base_glm/estimator.py | 40 ++++++++++++++++++------ batchglm/train/tf2/base_glm/model.py | 14 ++++----- batchglm/train/tf2/base_glm/optim.py | 34 ++++++++++++++------ 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index bf9c63fc..d6ce5c00 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -66,7 +66,6 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): - conv_all = lambda x, y: not np.all(x) conv_step = lambda x, y: not np.all(x) and y < stopping_criteria assert convergence_criteria in ["step", "all_converged"], \ @@ -130,7 +129,7 @@ def generate(): update_func = optimizer_object.perform_parameter_update \ if irls_algo or nr_algo else optimizer_object.apply_gradients - prev_params = self.model.params.numpy() + prev_params = self.model.params_copy.numpy() train_step = 0 @@ -202,6 +201,7 @@ def new_epoch_set(): ) features_updated = self.model.model_vars.updated else: + """ if batch_features: indices = tf.where(not_converged) update_var = tf.transpose(tf.scatter_nd( @@ -210,8 +210,9 @@ def new_epoch_set(): shape=(n_features, results[1].get_shape()[0]) )) else: - update_var = results[1] - update_func([(update_var, self.model.params)]) + """ + update_var = results[1] + update_func([(update_var, self.model.params_copy)]) features_updated = not_converged if benchmark: @@ -246,18 +247,29 @@ def new_epoch_set(): shape=(n_features, self.model.params.get_shape()[0]) ) grad_numpy = grad_numpy.numpy() + curr_params = self.model.params_copy + if batch_features: + curr_params = tf.transpose( + tf.scatter_nd( + tf.where(not_converged), + tf.transpose(curr_params), + shape=(self.model.params.shape[1], self.model.params.shape[0]) + ) + ) + curr_params = curr_params.numpy() convergences = self.calculate_convergence( converged_prev, ll_prev, ll_current, prev_params, + curr_params, jac_normalization, grad_numpy, features_updated, optimizer_object ) + prev_params = curr_params - prev_params = self.model.params.numpy() # converged_current, converged_f, converged_g, converged_x = convergences converged_current = convergences[0] self.model.model_vars.convergence_update(converged_current, features_updated) @@ -279,6 +291,16 @@ def new_epoch_set(): if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: need_new_epoch_set = True n_conv_last_featurewise_batch = num_converged + scattered_update_tensor = tf.transpose( + tf.scatter_nd( + tf.where(not_converged), + tf.transpose(self.model.params_copy), + shape=(self.model.params.shape[1], self.model.params.shape[0]) + ) + ) + self.model.params.assign( + tf.where(not_converged, 
scattered_update_tensor, self.model.params) + ) not_converged = ~self.model.model_vars.converged sums = [np.sum(convergence_vals) for convergence_vals in convergences[1:]] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ @@ -316,7 +338,7 @@ def new_epoch_set(): self.model.batch_features = batch_features batch_features = True - def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params, + def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params, curr_params, jac_normalization, grad_numpy, features_updated, optimizer_object): """ Wrapper method to perform all necessary convergence checks. @@ -352,7 +374,7 @@ def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params Now getting convergence based on change of coefficients below threshold: """ - x_step_converged = self.calc_x_step(prev_params, features_updated) + x_step_converged = self.calc_x_step(prev_params, curr_params, features_updated) epoch_step_converged = not_converged_prev & x_step_converged """ @@ -440,7 +462,7 @@ def get_init_from_model(init_a, init_b, input_data, init_model): def get_model_container(self, input_data): pass - def calc_x_step(self, prev_params, features_updated): + def calc_x_step(self, prev_params, curr_params, features_updated): def get_norm_converged(model: str, prev_params): if model == 'loc': @@ -451,7 +473,7 @@ def get_norm_converged(model: str, prev_params): xtol = pkg_constants.XTOL_BY_FEATURE_SCALE else: assert False, "Supply either 'loc' or 'scale'!" - x_step = self.model.params.numpy() - prev_params + x_step = curr_params - prev_params x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) return x_norm < xtol diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 6e12f952..e0a8a64b 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -68,13 +68,13 @@ def setMethod(self, optimizer): self._calc = self._calc_fim def _call_parameters(self, inputs, keep_previous_params_copy=True): - #if not keep_previous_params_copy: - if self.batch_features: - self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, - mask=tf.logical_not(self.model_vars.converged), - axis=1), trainable=True) - else: - self.params_copy = self.params + if not keep_previous_params_copy: + if self.batch_features: + self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, + mask=tf.logical_not(self.model_vars.converged), + axis=1), trainable=True) + else: + self.params_copy = self.params design_loc, design_scale, size_factors = inputs a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index fe2927a6..68effa2e 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -86,6 +86,7 @@ def _trust_region_ops( The new likelihood is calculated on the full model now, after updating the parameters using the proposed vector: """ + original_params_copy = tf.identity(self.model.params_copy) self.model.params_copy.assign_sub(proposed_vector) for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch])[0] @@ -106,6 +107,7 @@ def _trust_region_ops( feature space by adding columns corresponding to positions of converged (non calculated) features. 
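A minimal standalone sketch of this scatter pattern, using hypothetical toy tensors rather than the library call sites:

import numpy as np
import tensorflow as tf

not_converged = np.array([True, False, True, False])  # 4 features, 2 still active
reduced = tf.constant([[1.0, 2.0]])                   # one parameter row, values for active features only
full = tf.transpose(tf.scatter_nd(
    tf.where(not_converged),                          # positions of the active features
    tf.transpose(reduced),                            # scatter along the feature axis
    shape=(4, 1)))                                    # -> [[1., 0., 2., 0.]]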
""" + """ if batch_features: n_features = self.model.model_vars.n_features indices = tf.where(tf.logical_not(self.model.model_vars.converged)) @@ -124,11 +126,15 @@ def _trust_region_ops( update_var = proposed_vector gain_var = proposed_gain #delta_f_ratio = tf.divide(delta_f_actual, gain_var) - + """ # Compute parameter updates.g - update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model.model_vars.converged)) - update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) - keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric + #update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model.model_vars.converged)) + update_theta = delta_f_actual > eta0 + self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) + + #update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) + #keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric + """ if batch_features: params = tf.transpose(tf.scatter_nd( indices, @@ -140,7 +146,6 @@ def _trust_region_ops( tf.multiply(self.model.params, keep_theta_numeric), tf.multiply(params, update_theta_numeric) ) - else: params = self.model.params_copy theta_new_tr = tf.add( @@ -148,14 +153,25 @@ def _trust_region_ops( tf.multiply(params, update_theta_numeric) # new values ) self.model.params.assign(theta_new_tr) + """ + decrease_radius = tf.math.logical_not(update_theta) + increase_radius = update_theta + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + decrease_radius = tf.scatter_nd(indices, decrease_radius, shape=(n_features,)) + increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) + update_theta = increase_radius + if compute_b and not compute_a: - self.model.model_vars.updated |= update_theta.numpy() + self.model.model_vars.updated &= update_theta.numpy() else: self.model.model_vars.updated = update_theta.numpy() # Update trusted region accordingly: - decrease_radius = delta_f_actual <= eta0 - increase_radius = delta_f_actual > eta0 + + #decrease_radius = delta_f_actual <= eta0 + #increase_radius = delta_f_actual > eta0 """ decrease_radius = tf.logical_or( delta_f_actual <= eta0, @@ -269,7 +285,7 @@ def _trust_region_update( update_norm = tf.multiply(update_raw, update_magnitude_inv) # the following switch is for irls_gd_tr (linear instead of newton) if n_obs is not None: - update_magnitude = update_magnitude / n_obs #* radius_container + update_magnitude = update_magnitude / n_obs * radius_container update_scale = tf.minimum( radius_container, update_magnitude From 6791c4e44b18f6d378f0adb2814eaeaf581bfa20 Mon Sep 17 00:00:00 2001 From: picciama Date: Sat, 14 Mar 2020 14:22:19 +0100 Subject: [PATCH 064/124] default batchsize raised to 5000 --- batchglm/train/tf2/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index d6ce5c00..82a25dc7 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -57,7 +57,7 @@ def _train( self, noise_model: str, is_batched: bool = True, - batch_size: int = 1000, + batch_size: int = 5000, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), convergence_criteria: str = "step", stopping_criteria: int = 1000, From eb87404c5748bd483765a8e4dc982a8f021aea42 Mon Sep 
17 00:00:00 2001 From: picciama Date: Thu, 19 Mar 2020 08:27:51 +0100 Subject: [PATCH 065/124] stricter likelihood criterion --- batchglm/pkg_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index d3147b25..52c2b50a 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -23,7 +23,7 @@ TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-12 +LLTOL_BY_FEATURE = 1e-14 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 From 6df19e26c290db2fa62701cf2b9fae18adbf174b Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 19 Mar 2020 10:45:57 +0100 Subject: [PATCH 066/124] even stricter likelihood criterion --- batchglm/pkg_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 52c2b50a..a98ccf7a 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -23,7 +23,7 @@ TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-14 +LLTOL_BY_FEATURE = 1e-16 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 From c5c6a5ebad5011ac4b8dcd3bd0d8d32e72804ef3 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 20 Mar 2020 13:00:34 +0100 Subject: [PATCH 067/124] increased default batch size to 5000 --- batchglm/train/tf2/glm_nb/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index c7618cff..3109bb23 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -88,7 +88,7 @@ def __init__( def train( self, use_batching: bool = True, - batch_size: int = 500, + batch_size: int = 5000, optim_algo: str = "adam", learning_rate: float = 1e-2, convergence_criteria: str = "step", From af5e2355e1ac007e9668636f72e4d5d0175800c8 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 20 Mar 2020 13:05:13 +0100 Subject: [PATCH 068/124] removed n_features argument from likelihood call --- batchglm/train/tf2/base_glm/layers.py | 4 +--- batchglm/train/tf2/base_glm/model.py | 2 +- batchglm/train/tf2/glm_beta/layers.py | 2 +- batchglm/train/tf2/glm_nb/layers.py | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py index b09b2642..4c22da02 100644 --- a/batchglm/train/tf2/base_glm/layers.py +++ b/batchglm/train/tf2/base_glm/layers.py @@ -234,7 +234,7 @@ def __init__(self, dtype): self.ll_dtype = dtype @abc.abstractmethod - def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + def _ll(self, eta_loc, eta_scale, loc, scale, x): """ Does the actual likelihood calculation. Depends on the given noise model and needs to be implemented in the inheriting layer. @@ -249,8 +249,6 @@ def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): the variance values for each individual distribution, encoded in data space. :param x: tf.Tensor the input data - :param n_features - number of features. :return tf.Tensor the log-likelihoods of each individual data point. 
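For orientation, a minimal standalone sketch of the negative binomial log-likelihood with the reduced signature, assuming a log link so that eta_loc = log(loc) and eta_scale = log(scale); nb_ll and its argument shapes are illustrative, not the library API:

import tensorflow as tf

def nb_ll(eta_loc, eta_scale, loc, scale, x):
    # loc is the mean, scale the dispersion; all tensors of shape (observations, features)
    log_r_plus_mu = tf.math.log(scale + loc)
    return (tf.math.lgamma(scale + x)
            - tf.math.lgamma(x + 1.0)
            - tf.math.lgamma(scale)
            + tf.multiply(x, eta_loc - log_r_plus_mu)
            + tf.multiply(scale, eta_scale - log_r_plus_mu))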
diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index e0a8a64b..7e0d9ad1 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -86,7 +86,7 @@ def _call_parameters(self, inputs, keep_previous_params_copy=True): def calc_ll(self, inputs, keep_previous_params_copy=True): parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) - log_probs = self.likelihood([*parameters[:-2], inputs[0], np.sum(self.model_vars.updated)]) + log_probs = self.likelihood([*parameters[:-2], inputs[0]]) log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) diff --git a/batchglm/train/tf2/glm_beta/layers.py b/batchglm/train/tf2/glm_beta/layers.py index 2eae4735..eb84bcb6 100644 --- a/batchglm/train/tf2/glm_beta/layers.py +++ b/batchglm/train/tf2/glm_beta/layers.py @@ -35,7 +35,7 @@ def _inv_linker(self, scale: tf.Tensor): class Likelihood(LikelihoodGLM, ProcessModel): - def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + def _ll(self, eta_loc, eta_scale, loc, scale, x): if isinstance(x, tf.SparseTensor): one_minus_x = -tf.sparse.add(x, -tf.ones_like(loc)) diff --git a/batchglm/train/tf2/glm_nb/layers.py b/batchglm/train/tf2/glm_nb/layers.py index b180c9eb..4aff0436 100644 --- a/batchglm/train/tf2/glm_nb/layers.py +++ b/batchglm/train/tf2/glm_nb/layers.py @@ -36,7 +36,7 @@ def _inv_linker(self, scale: tf.Tensor): class Likelihood(LikelihoodGLM, ProcessModel): - def _ll(self, eta_loc, eta_scale, loc, scale, x, n_features): + def _ll(self, eta_loc, eta_scale, loc, scale, x): # Log-likelihood: log_r_plus_mu = tf.math.log(scale + loc) From 58ce37587ad2d4891465b3b24d1796ae2e5799b3 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 20 Mar 2020 14:00:42 +0100 Subject: [PATCH 069/124] fallback to non featurewise if 1st order optim --- batchglm/train/tf2/base_glm/estimator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 82a25dc7..b3c65d46 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -125,6 +125,10 @@ def generate(): irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] + if featurewise and not (irls_algo or nr_algo): + featurewise = False + logger.warning("WARNING: 'Featurewise batching' is only available for 2nd order " + "optimizers IRLS and NR. 
Fallback to full featurespace fitting.") update_func = optimizer_object.perform_parameter_update \ if irls_algo or nr_algo else optimizer_object.apply_gradients From 30c7d8afba2e4f2b441fb0cfbda0c700a8f182ba Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 23 Mar 2020 18:15:05 +0100 Subject: [PATCH 070/124] added convergence and generator outside train --- batchglm/train/tf2/base_glm/convergence.py | 164 +++++++++++++++++++++ batchglm/train/tf2/base_glm/generator.py | 78 ++++++++++ 2 files changed, 242 insertions(+) create mode 100644 batchglm/train/tf2/base_glm/convergence.py create mode 100644 batchglm/train/tf2/base_glm/generator.py diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py new file mode 100644 index 00000000..6dd6ad0e --- /dev/null +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -0,0 +1,164 @@ +import numpy as np +import tensorflow as tf +from .external import pkg_constants + + +class ConvergenceCalculator: + """Wrapper object containing all necessary methods to calculate convergences based on change + in likelihood, gradient and parameters.""" + + def __init__(self, estimator, last_ll: np.ndarray): + self.estimator = estimator + self.current_converged = estimator.model.model_vars.converged + self.current_params = estimator.model.params_copy + self.current_ll = last_ll + self.previous_number_converged = 0 + + def calculate_convergence(self, results, jac_normalization, optimizer_object, batch_features): + """Calculates convergence based on change in likelihood, gradient and parameters.""" + + features_updated = self.estimator.model.model_vars.features_updated + total_converged = self.estimator.model.model_vars.converged + not_converged_prev = ~ self.current_converged + n_features = self.estimator.input_data.n_features + + ########################################################### + # FIRST PART: Retrieve and manipulate ll, grads and params. + #### + if self.estimator.irls_algo: + grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) + elif self.estimator.nr_algo: + grad_numpy = tf.abs(results[1]) + else: + grad_numpy = tf.abs(tf.transpose(results[1])) + new_ll = tf.negative(tf.divide(results[0], self.estimator.input_data.num_observations)) + new_params = self.estimator.model.params_copy + + if batch_features: + # map columns of ll to full feature space + indices = np.where(not_converged_prev)[0] + updated_lls = tf.scatter_nd( + np.expand_dims(indices, 1), new_ll, shape=[n_features]) + # fill the added columns with previous ll + new_ll = np.where(not_converged_prev, updated_lls.numpy(), self.current_ll) + + # fill added columns with the gradients from previous runs. + indices = tf.where(not_converged_prev) + grad_numpy = tf.scatter_nd( + indices, + grad_numpy, + shape=(n_features, self.estimator.model.params.get_shape()[0]) + ) + # TODO: added columns are zero here, does that matter? + + # map columns of params to full feature space + new_params = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(new_params), + shape=(self.estimator.model.params.shape[1], self.estimator.model.params.shape[0]) + ).numpy() + ) + # TODO: added columns are zero here, does that matter? + else: + new_ll = new_ll.numpy() + + ########################################################### + # SECOND PART: Calculate ll convergence. + #### + + # Get all converged features due to change in ll < LLTOL_BY_FEATURE + # IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! 
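# Illustration, taking LLTOL_BY_FEATURE = 1e-12 as an example value: for a feature with
# previous normalized ll 1.5 and new ll 1.5 * (1 - 1e-13), the relative change is 1e-13,
# which is below the tolerance, so the feature counts as ll-converged provided it was
# actually updated in this step.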
+ ll_difference = np.abs(self.current_ll - new_ll) / self.current_ll + ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated + epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f + + total_converged |= epoch_ll_converged + + ########################################################### + # THIRD PART: calculate grad convergence. + #### + grad_loc = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_loc], axis=1) + grad_norm_loc = grad_loc / jac_normalization + grad_scale = np.sum(grad_numpy[:, self.estimator.model.model_vars.idx_train_scale], axis=1) + grad_norm_scale = grad_scale / jac_normalization + + grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC + grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE + + grad_converged = grad_norm_loc_converged & grad_norm_scale_converged & features_updated + epoch_grad_converged = not_converged_prev & grad_converged # formerly known as converged_g + + total_converged |= grad_converged + + ########################################################### + # Fourth PART: calculate parameter step convergence. + #### + x_step_converged = self.calc_x_step(self.current_params, new_params, features_updated) + epoch_step_converged = not_converged_prev & x_step_converged + + # In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. + # For now it must not be below the threshold for the X step of the loc model. + if hasattr(optimizer_object, 'trusted_region_mode') \ + and optimizer_object.trusted_region_mode: + converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC + if hasattr(optimizer_object, 'tr_radius_b') and self.estimator.train_scale: + converged_tr &= \ + optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + epoch_tr_converged = not_converged_prev & converged_tr + epoch_step_converged |= epoch_tr_converged + + total_converged |= epoch_step_converged + + ########################################################### + # FINAL PART: exchange the current with the new containers. + #### + self.previous_number_converged = np.sum(self.current_converged) + self.current_converged = total_converged.copy() + self.current_params = new_params + self.current_ll = new_ll + + return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged + + def calc_x_step(self, prev_params, curr_params, features_updated): + """Calculates convergence based on the difference in parameters before and + after the update.""" + def get_norm_converged(model: str, prev_params): + if model == 'loc': + idx_train = self.estimator.model.model_vars.idx_train_loc + xtol = pkg_constants.XTOL_BY_FEATURE_LOC + elif model == 'scale': + idx_train = self.estimator.model.model_vars.idx_train_scale + xtol = pkg_constants.XTOL_BY_FEATURE_SCALE + else: + assert False, "Supply either 'loc' or 'scale'!" + x_step = curr_params - prev_params + x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) + return x_norm < xtol + + # We use a trick here: First we set both the loc and scale convergence to True. + # It is not necessary to use an array with length = number of features, since bitwise + # AND also works with a single boolean. + loc_conv = np.bool_(True) + scale_conv = np.bool_(True) + + # Now we check which models need to be trained. E.g. if you are using quick_scale = True, + # self._train_scale will be False and so the above single True value will be used. 
+ if self.estimator.train_loc: + loc_conv = get_norm_converged('loc', prev_params) + if self.estimator.train_scale: + scale_conv = get_norm_converged('scale', prev_params) + + # Finally, we check that only features updated in this epoch can evaluate to True. + # This is only a problem for 2nd order optims with trusted region mode, since it might + # occur, that a feature isn't updated, so the x_step is zero although not yet converged. + return loc_conv & scale_conv & features_updated + + def getLoss(self): + return np.sum(self.current_ll) + + def getNumberConverged(self): + return np.sum(self.current_converged) + + def getPreviousNumberConverged(self): + return self.previous_number_converged diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py new file mode 100644 index 00000000..40a00c68 --- /dev/null +++ b/batchglm/train/tf2/base_glm/generator.py @@ -0,0 +1,78 @@ +import numpy as np +from scipy.sparse import csr_matrix +import tensorflow as tf + + +class DataGenerator: + """Wrapper Object to generate an iterable TensorFlow Dataset from given input data.""" + + def __init__( + self, + estimator, + noise_model: str, + is_batched_model: bool, + batch_size: int + ): + self.estimator = estimator + self.noise_model = noise_model + self.is_batched_model = is_batched_model + self.batch_size = batch_size + self.sparse = isinstance(estimator.input_data.x, csr_matrix) + self.n_obs = estimator.input_data.num_observations + # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b + # We need this for cases where n_obs mod batch_size != 0 + self.num_batches = (self.n_obs + batch_size - 1) // batch_size + dtp = estimator.dtype + output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if self.sparse else (dtp,) * 4 + self.dataset = tf.data.Dataset.from_generator( + generator=self._generate, output_types=output_types) + if self.sparse: + self.dataset = self.dataset.map( + lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) + ) + + def _generate(self): + """ + Generates `(counts, design_loc, design_scale, size_factors)` tuples of `self.input_data`. + The number of observations in each such data batch is given by `self.batch size`. + If `self.is_batched_model`, the method uses a random permutation of `input_data` each time + it is called. 
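For example, with n_obs = 10500 and batch_size = 5000 this yields three batches of 5000, 5000 and 500 observations per epoch.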
+ """ + input_data = self.estimator.input_data + fetch_size_factors = input_data.size_factors is not None \ + and self.noise_model in ["nb", "norm"] + obs_pool = np.random.permutation(self.n_obs) \ + if self.is_batched_model else np.arange(self.n_obs) + for start_id in range(0, self.n_obs, self.batch_size): + # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: + idx = obs_pool[start_id: start_id + self.batch_size] + counts = input_data.fetch_x_sparse(idx) if self.sparse \ + else input_data.fetch_x_dense(idx) + dloc = input_data.fetch_design_loc(idx) + dscale = input_data.fetch_design_scale(idx) + size_factors = input_data.fetch_size_factors(idx) if fetch_size_factors else 1 + yield counts, dloc, dscale, size_factors + + def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): + """Takes an element of a dataset, performs featurewise batching + and returns the reduced element.""" + not_converged = np.negative(self.estimator.model.model_vars.total_converged) + if self.sparse: + feature_columns = tf.sparse.split( + x_tensor, + num_split=self.estimator.model_vars.n_features, + axis=1) + not_converged_idx = np.where(not_converged)[0] + feature_columns = [feature_columns[i] for i in not_converged_idx] + x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + + else: + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + return x_tensor, dloc, dscale, size_factors + + def new_epoch_set(self, batch_features: bool = False): + """Returns an iterable TensorFlow Dataset of the input data.""" + dataset_to_return = self.dataset.take(self.num_batches) + if batch_features: + return dataset_to_return.map(self._featurewise_batch).cache().prefetch(1) + return self.dataset.take(self.num_batches).cache().prefetch(1) From 498540dab1479f5672cba66f8b2acda0f1a14e50 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 24 Mar 2020 16:00:01 +0100 Subject: [PATCH 071/124] weaker ll_tol --- batchglm/pkg_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index a98ccf7a..d3147b25 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -23,7 +23,7 @@ TRUST_REGIONT_T2_IRLS_GD_TR_SCALE = 1.5 # Convergence hyper-parameters: -LLTOL_BY_FEATURE = 1e-16 +LLTOL_BY_FEATURE = 1e-12 XTOL_BY_FEATURE_LOC = 1e-8 XTOL_BY_FEATURE_SCALE = 1e-6 GTOL_BY_FEATURE_LOC = 1e-8 From fa969a42809e3b9a228cd4642e9af6887fb01264 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 24 Mar 2020 16:01:19 +0100 Subject: [PATCH 072/124] bugfixes, separate loc and scale update --- batchglm/train/tf2/base_glm/convergence.py | 128 ++++--- batchglm/train/tf2/base_glm/estimator.py | 409 ++++++--------------- batchglm/train/tf2/base_glm/generator.py | 2 +- batchglm/train/tf2/base_glm/model.py | 2 +- batchglm/train/tf2/base_glm/optim.py | 16 +- batchglm/train/tf2/base_glm/vars.py | 4 + 6 files changed, 205 insertions(+), 356 deletions(-) diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py index 6dd6ad0e..c0df5608 100644 --- a/batchglm/train/tf2/base_glm/convergence.py +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -9,18 +9,24 @@ class ConvergenceCalculator: def __init__(self, estimator, last_ll: np.ndarray): self.estimator = estimator - self.current_converged = estimator.model.model_vars.converged - self.current_params = estimator.model.params_copy - self.current_ll = last_ll + self.last_params = estimator.model.params_copy.numpy() + self.last_ll = 
last_ll self.previous_number_converged = 0 + self.calc_separated = self.estimator.irls_algo and self.estimator._train_scale + def calculate_convergence(self, results, jac_normalization, optimizer_object, batch_features): """Calculates convergence based on change in likelihood, gradient and parameters.""" - features_updated = self.estimator.model.model_vars.features_updated - total_converged = self.estimator.model.model_vars.converged - not_converged_prev = ~ self.current_converged - n_features = self.estimator.input_data.n_features + features_updated = self.estimator.model.model_vars.updated + converged_a = self.estimator.model.model_vars.converged + not_converged_a = ~ converged_a + if self.calc_separated: + features_updated_b = self.estimator.model.model_vars.updated_b + converged_b = self.estimator.model.model_vars.converged_b + not_converged_b = ~ converged_b + + n_features = self.estimator.input_data.num_features ########################################################### # FIRST PART: Retrieve and manipulate ll, grads and params. @@ -36,14 +42,13 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba if batch_features: # map columns of ll to full feature space - indices = np.where(not_converged_prev)[0] - updated_lls = tf.scatter_nd( - np.expand_dims(indices, 1), new_ll, shape=[n_features]) + not_conv = not_converged_a if not self.calc_separated else ~self.estimator.model.model_vars.total_converged + indices = tf.where(not_conv) + updated_lls = tf.scatter_nd(indices, new_ll, shape=[n_features]) # fill the added columns with previous ll - new_ll = np.where(not_converged_prev, updated_lls.numpy(), self.current_ll) + new_ll = tf.where(not_conv, updated_lls, self.last_ll) # fill added columns with the gradients from previous runs. - indices = tf.where(not_converged_prev) grad_numpy = tf.scatter_nd( indices, grad_numpy, @@ -57,11 +62,13 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba indices, tf.transpose(new_params), shape=(self.estimator.model.params.shape[1], self.estimator.model.params.shape[0]) - ).numpy() + ) ) # TODO: added columns are zero here, does that matter? - else: - new_ll = new_ll.numpy() + + grad_numpy = grad_numpy.numpy() + new_params = new_params.numpy() + new_ll = new_ll.numpy() ########################################################### # SECOND PART: Calculate ll convergence. @@ -69,11 +76,18 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba # Get all converged features due to change in ll < LLTOL_BY_FEATURE # IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! 
- ll_difference = np.abs(self.current_ll - new_ll) / self.current_ll - ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated - epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f + ll_difference = np.abs(self.last_ll - new_ll) / self.last_ll + # print('ll_diff: ', ll_difference[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) + ll_converged = ll_difference < pkg_constants.LLTOL_BY_FEATURE - total_converged |= epoch_ll_converged + ll_converged_a = ll_converged & features_updated + epoch_ll_converged_a = not_converged_a & ll_converged_a # formerly known as converged_f + + if self.calc_separated: + ll_converged_b = ll_converged & features_updated_b + epoch_ll_converged_b = not_converged_b & ll_converged_b # formerly known as converged_f ########################################################### # THIRD PART: calculate grad convergence. @@ -85,42 +99,66 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE + if self.calc_separated: + grad_converged_a = grad_norm_loc_converged & features_updated + grad_converged_b = grad_norm_scale_converged & features_updated_b + epoch_grad_converged_b = not_converged_b & grad_converged_b # formerly known as converged_g - grad_converged = grad_norm_loc_converged & grad_norm_scale_converged & features_updated - epoch_grad_converged = not_converged_prev & grad_converged # formerly known as converged_g - - total_converged |= grad_converged + else: + grad_converged_a = grad_norm_loc_converged & grad_norm_scale_converged & features_updated + epoch_grad_converged_a = not_converged_a & grad_converged_a # formerly known as converged_g + # print('grad: ', grad_norm_loc[0], grad_norm_scale[0]) ########################################################### # Fourth PART: calculate parameter step convergence. #### - x_step_converged = self.calc_x_step(self.current_params, new_params, features_updated) - epoch_step_converged = not_converged_prev & x_step_converged + x_step_a, x_step_b = self.calc_x_step(self.last_params, new_params) + if self.calc_separated: + x_step_converged_a = x_step_a & features_updated + x_step_converged_b = x_step_b & features_updated_b + epoch_step_converged_b = not_converged_b & x_step_converged_b + + else: + x_step_converged_a = x_step_a & x_step_b & features_updated + epoch_step_converged_a = not_converged_a & x_step_converged_a + # print('x_step: ', x_step_converged_a[0], x_step_converged_b[0]) # In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. # For now it must not be below the threshold for the X step of the loc model. 
if hasattr(optimizer_object, 'trusted_region_mode') \ and optimizer_object.trusted_region_mode: converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC - if hasattr(optimizer_object, 'tr_radius_b') and self.estimator.train_scale: - converged_tr &= \ + if hasattr(optimizer_object, 'tr_radius_b') and self.estimator._train_scale: + converged_tr_b = \ optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE - epoch_tr_converged = not_converged_prev & converged_tr - epoch_step_converged |= epoch_tr_converged - - total_converged |= epoch_step_converged - + epoch_tr_converged_b = not_converged_b & converged_tr_b + epoch_step_converged_b |= epoch_tr_converged_b + epoch_tr_converged = not_converged_a & converged_tr + epoch_step_converged_a |= epoch_tr_converged + # print('tr: ', epoch_tr_converged[0], epoch_tr_converged_b[0]) + # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) + # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) ########################################################### # FINAL PART: exchange the current with the new containers. #### - self.previous_number_converged = np.sum(self.current_converged) - self.current_converged = total_converged.copy() - self.current_params = new_params - self.current_ll = new_ll - - return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged + self.previous_number_converged = np.sum(self.estimator.model.model_vars.total_converged) + self.last_params = new_params + self.last_ll = new_ll + converged_a = np.logical_or.reduce((converged_a, epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a)) + if self.calc_separated: + converged_b = np.logical_or.reduce((converged_b, epoch_ll_converged_b, epoch_grad_converged_b, epoch_step_converged_b)) + self.estimator.model.model_vars.total_converged = converged_a & converged_b + self.estimator.model.model_vars.converged_b = converged_b + epoch_ll_converged_a |= epoch_ll_converged_b + epoch_grad_converged_a |= epoch_grad_converged_b + epoch_step_converged_a |= epoch_step_converged_b + else: + self.estimator.model.model_vars.total_converged = converged_a + self.estimator.model.model_vars.converged = converged_a + # print(self.estimator.model.model_vars.total_converged[0]) + return epoch_ll_converged_a, epoch_grad_converged_a, epoch_step_converged_a - def calc_x_step(self, prev_params, curr_params, features_updated): + def calc_x_step(self, prev_params, curr_params): """Calculates convergence based on the difference in parameters before and after the update.""" def get_norm_converged(model: str, prev_params): @@ -134,6 +172,7 @@ def get_norm_converged(model: str, prev_params): assert False, "Supply either 'loc' or 'scale'!" x_step = curr_params - prev_params x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) + # print('x_norm: ', x_norm[0]) return x_norm < xtol # We use a trick here: First we set both the loc and scale convergence to True. @@ -144,21 +183,18 @@ def get_norm_converged(model: str, prev_params): # Now we check which models need to be trained. E.g. if you are using quick_scale = True, # self._train_scale will be False and so the above single True value will be used. 
- if self.estimator.train_loc: + if self.estimator._train_loc: loc_conv = get_norm_converged('loc', prev_params) - if self.estimator.train_scale: + if self.estimator._train_scale: scale_conv = get_norm_converged('scale', prev_params) # Finally, we check that only features updated in this epoch can evaluate to True. # This is only a problem for 2nd order optims with trusted region mode, since it might # occur, that a feature isn't updated, so the x_step is zero although not yet converged. - return loc_conv & scale_conv & features_updated + return loc_conv, scale_conv def getLoss(self): - return np.sum(self.current_ll) - - def getNumberConverged(self): - return np.sum(self.current_converged) + return np.sum(self.last_ll) def getPreviousNumberConverged(self): return self.previous_number_converged diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index b3c65d46..6b04103e 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -2,7 +2,8 @@ import logging import time import numpy as np -import scipy +from .generator import DataGenerator +from .convergence import ConvergenceCalculator import tensorflow as tf from .model import GLM from .external import TFEstimator, _EstimatorGLM @@ -66,14 +67,15 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): - conv_all = lambda x, y: not np.all(x) - conv_step = lambda x, y: not np.all(x) and y < stopping_criteria + n_obs = self.input_data.num_observations + n_features = self.input_data.num_features + conv_all = lambda x, y: x < n_features + conv_step = lambda x, y: x < n_features and y < stopping_criteria assert convergence_criteria in ["step", "all_converged"], \ ("Unrecognized convergence criteria %s", convergence_criteria) convergence_decision = conv_step if convergence_criteria == "step" else conv_all - n_obs = self.input_data.num_observations - n_features = self.input_data.num_features + if batch_size > n_obs: batch_size = n_obs if not self._initialized: @@ -86,235 +88,106 @@ def _train( to closed form. 
Only Jacobians are calculated using autograd.") self.noise_model = noise_model - sparse = isinstance(self.input_data.x, scipy.sparse.csr_matrix) - full_model = not is_batched - - def generate(): - fetch_size_factors = self.input_data.size_factors is not None \ - and self.noise_model in ["nb", "norm"] - obs_pool = np.arange(n_obs) if full_model else np.random.permutation(n_obs) - for start_id in range(0, n_obs, batch_size): - # numpy ignores ids > len(obs_pool) so no out of bounds check needed here: - idx = obs_pool[start_id: start_id + batch_size] - - counts = self.input_data.fetch_x_sparse(idx) if sparse \ - else self.input_data.fetch_x_dense(idx) - dloc = self.input_data.fetch_design_loc(idx) - dscale = self.input_data.fetch_design_scale(idx) - size_factors = self.input_data.fetch_size_factors(idx) if fetch_size_factors else 1 - yield counts, dloc, dscale, size_factors - - dtp = self.dtype - output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if sparse else (dtp,) * 4 - # integer ceil division with arithmetic trick: ceil(a/b)=(a+b-1)//b - # We need this for cases where n_obs mod batch_size != 0 - num_batches = (n_obs + batch_size - 1) // batch_size - dataset = tf.data.Dataset.from_generator( - generator=generate, output_types=output_types) - if sparse: - dataset = dataset.map( - lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) - ) - batch_features = False - # Set all to convergence status = False, this is needed if multiple - # training strategies are run: - converged_current = np.zeros(n_features, dtype=np.bool) - # fill with lowest possible number: - ll_current = np.nextafter(np.inf, np.zeros(n_features), dtype=self.dtype) + batch_features = False - irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] - nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] - if featurewise and not (irls_algo or nr_algo): + self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] + self.nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] + if featurewise and not (self.irls_algo or self.nr_algo): featurewise = False logger.warning("WARNING: 'Featurewise batching' is only available for 2nd order " "optimizers IRLS and NR. 
Fallback to full featurespace fitting.") update_func = optimizer_object.perform_parameter_update \ - if irls_algo or nr_algo else optimizer_object.apply_gradients - - prev_params = self.model.params_copy.numpy() + if self.irls_algo or self.nr_algo else optimizer_object.apply_gradients train_step = 0 - not_converged = ~ self.model.model_vars.converged - - def featurewise_batch(x_tensor, dloc, dscale, size_factors): - if not batch_features: - return x_tensor, dloc, dscale, size_factors - if isinstance(x_tensor, tf.SparseTensor): - feature_columns = tf.sparse.split( - x_tensor, - num_split=self.model.model_vars.n_features, - axis=1) - not_converged_idx = np.where(not_converged)[0] - feature_columns = [feature_columns[i] for i in not_converged_idx] - x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) + # create a tensorflow dataset using the DataGenerator + datagenerator = DataGenerator(self, noise_model, is_batched, batch_size) + epoch_set = datagenerator.new_epoch_set() + for i, x_batch in enumerate(epoch_set): + current_results = self.model(x_batch) + if is_batched or i == 0: + results = current_results else: - x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) - return x_tensor, dloc, dscale, size_factors - - def new_epoch_set(): - return dataset.take(num_batches).map(featurewise_batch).cache().prefetch(1) + results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + # create a ConvergenceCalculator Object + conv_calc = ConvergenceCalculator(self, tf.negative(tf.divide(results[0], n_obs)).numpy()) - epoch_set = new_epoch_set() num_converged = 0 num_converged_prev = 0 need_new_epoch_set = False n_conv_last_featurewise_batch = 0 - while convergence_decision(converged_current, train_step): + + while convergence_decision(num_converged, train_step): if benchmark: t0_epoch = time.time() - ll_prev = ll_current.copy() - results = None if need_new_epoch_set: - epoch_set = new_epoch_set() + epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) + for i, x_batch in enumerate(epoch_set): + current_results = self.model(x_batch, keep_previous_params_copy=not need_new_epoch_set) + if is_batched or i == 0: + results = current_results + else: + results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + + self.update_params(x_batch if is_batched else epoch_set, results, batch_features, is_batched, update_func) + # converged_current, converged_f, converged_g, converged_x = convergences + for i, x_batch in enumerate(epoch_set): - current_results = self.model(x_batch, keep_previous_params_copy=not need_new_epoch_set) + current_results = self.model(x_batch) if is_batched or i == 0: results = current_results else: results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] - if is_batched or i == num_batches - 1: - if irls_algo or nr_algo: - if irls_algo: - batches = x_batch if is_batched else epoch_set - update_func( - inputs=[batches, *results], - compute_a=True, - compute_b=False, - batch_features=batch_features, - is_batched=is_batched - ) - if self._train_scale: - update_func( - inputs=[batches, *results], - compute_a=False, - compute_b=True, - batch_features=batch_features, - is_batched=is_batched - ) - else: - update_func( - inputs=[batches, *results], - batch_features=batch_features, - is_batched=is_batched - ) - features_updated = self.model.model_vars.updated - else: - """ - if batch_features: - indices = tf.where(not_converged) - update_var = tf.transpose(tf.scatter_nd( - indices, - 
tf.transpose(results[1]), - shape=(n_features, results[1].get_shape()[0]) - )) - else: - """ - update_var = results[1] - update_func([(update_var, self.model.params_copy)]) - features_updated = not_converged - - if benchmark: - self.values.append(self.model.trainable_variables[0].numpy().copy()) - - # Update converged status - converged_prev = converged_current.copy() - ll_current = -results[0].numpy() / n_obs - - if batch_features: - indices = np.where(not_converged)[0] - updated_lls = tf.scatter_nd( - np.expand_dims(indices, 1), ll_current, shape=[n_features]) - ll_current = np.where(not_converged, updated_lls.numpy(), ll_prev) - if benchmark: - self.lls.append(ll_current) - if is_batched: - jac_normalization = batch_size - else: - jac_normalization = n_obs - if irls_algo: - grad_numpy = tf.abs(tf.concat((results[1], results[2]), axis=1)) - elif nr_algo: - grad_numpy = tf.abs(results[1]) - else: - grad_numpy = tf.abs(tf.transpose(results[1])) - if batch_features: - indices = tf.where(not_converged) - grad_numpy = tf.scatter_nd( - indices, - grad_numpy, - shape=(n_features, self.model.params.get_shape()[0]) - ) - grad_numpy = grad_numpy.numpy() - curr_params = self.model.params_copy - if batch_features: - curr_params = tf.transpose( - tf.scatter_nd( - tf.where(not_converged), - tf.transpose(curr_params), - shape=(self.model.params.shape[1], self.model.params.shape[0]) - ) + convergences = conv_calc.calculate_convergence( + results=results, + jac_normalization=batch_size if is_batched else n_obs, + optimizer_object=optimizer_object, + batch_features=batch_features + ) + + num_converged = np.sum(self.model.model_vars.total_converged) + loss = conv_calc.getLoss() + if self.irls_algo and self._train_scale: + num_updated = np.sum(np.logical_or(self.model.model_vars.updated, self.model.model_vars.updated_b)) + else: + num_updated = np.sum(self.model.model_vars.updated) + log_output = f"Step: {train_step} loss: {loss}, "\ + f"converged {num_converged}, updated {num_updated}" + num_converged_prev = conv_calc.getPreviousNumberConverged() + + if num_converged == num_converged_prev: + need_new_epoch_set = False + logger.warning(log_output) + else: + if featurewise: + if not batch_features: + batch_features = True + self.model.batch_features = batch_features + conv_diff = num_converged - n_conv_last_featurewise_batch + if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: + need_new_epoch_set = True + n_conv_last_featurewise_batch = num_converged + self.model.params.assign( + tf.where(self.model.model_vars.total_converged, self.model.params, conv_calc.last_params) ) - curr_params = curr_params.numpy() - convergences = self.calculate_convergence( - converged_prev, - ll_prev, - ll_current, - prev_params, - curr_params, - jac_normalization, - grad_numpy, - features_updated, - optimizer_object - ) - prev_params = curr_params - - # converged_current, converged_f, converged_g, converged_x = convergences - converged_current = convergences[0] - self.model.model_vars.convergence_update(converged_current, features_updated) - num_converged = np.sum(converged_current) - loss = np.sum(ll_current) - num_updated = np.sum(features_updated) - log_output = f"Step: {train_step} loss: {loss}, "\ - f"converged {num_converged}, updated {num_updated}" - num_converged_prev = np.sum(converged_prev) - need_new_epoch_set = False - if num_converged == num_converged_prev: - logger.warning(log_output) - else: - if featurewise: - if not batch_features: - batch_features = True - self.model.batch_features = batch_features - conv_diff 
= num_converged - n_conv_last_featurewise_batch - if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: - need_new_epoch_set = True - n_conv_last_featurewise_batch = num_converged - scattered_update_tensor = tf.transpose( - tf.scatter_nd( - tf.where(not_converged), - tf.transpose(self.model.params_copy), - shape=(self.model.params.shape[1], self.model.params.shape[0]) - ) - ) - self.model.params.assign( - tf.where(not_converged, scattered_update_tensor, self.model.params) - ) - not_converged = ~self.model.model_vars.converged - sums = [np.sum(convergence_vals) for convergence_vals in convergences[1:]] - log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ - f"x_step: {sums[2]}" - logger.warning(log_output) - train_step += 1 - if benchmark: - t1_epoch = time.time() - self.times.append(t1_epoch-t0_epoch) - self.converged.append(num_converged) + + sums = [np.sum(convergence_vals) for convergence_vals in convergences] + log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ + f"x_step: {sums[2]}" + logger.warning(log_output) + train_step += 1 + if benchmark: + t1_epoch = time.time() + self.times.append(t1_epoch-t0_epoch) + self.converged.append(num_converged) + self.values.append(self.model.trainable_variables[0].numpy().copy()) + self.lls.append(conv_calc.last_ll) # Evaluate final params logger.warning("Final Evaluation run.") @@ -323,7 +196,7 @@ def new_epoch_set(): # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv self.model.setMethod('nr_tr') self.model.hessian.compute_b = True - final_set = new_epoch_set() + final_set = datagenerator.new_epoch_set() for i, x_batch_tuple in enumerate(final_set): current_results = self.model(x_batch_tuple, keep_previous_params_copy=False) if i == 0: @@ -342,61 +215,36 @@ def new_epoch_set(): self.model.batch_features = batch_features batch_features = True - def calculate_convergence(self, converged_prev, ll_prev, ll_current, prev_params, curr_params, - jac_normalization, grad_numpy, features_updated, optimizer_object): - """ - Wrapper method to perform all necessary convergence checks. - """ - - total_converged = converged_prev.copy() - not_converged_prev = ~ converged_prev - """ - Get all converged features due to change in ll < LLTOL_BY_FEATURE - IMPORTANT: we need to ensure they have also been updated, otherwise ll_prev = ll_current! 
- """ - ll_difference = np.abs(ll_prev - ll_current) / ll_prev - ll_converged = (ll_difference < pkg_constants.LLTOL_BY_FEATURE) & features_updated - epoch_ll_converged = not_converged_prev & ll_converged # formerly known as converged_f - total_converged |= epoch_ll_converged - """ - Now getting convergence based on change in gradient below threshold: - """ - grad_loc = np.sum(grad_numpy[:, self.model.model_vars.idx_train_loc], axis=1) - grad_norm_loc = grad_loc / jac_normalization - grad_scale = np.sum(grad_numpy[:, self.model.model_vars.idx_train_scale], axis=1) - grad_norm_scale = grad_scale / jac_normalization - - grad_norm_loc_converged = grad_norm_loc < pkg_constants.GTOL_BY_FEATURE_LOC - grad_norm_scale_converged = grad_norm_scale < pkg_constants.GTOL_BY_FEATURE_SCALE - - grad_converged = grad_norm_loc_converged & grad_norm_scale_converged & features_updated - epoch_grad_converged = not_converged_prev & grad_converged # formerly known as converged_g - - total_converged |= grad_converged - - """ - Now getting convergence based on change of coefficients below threshold: - """ - - x_step_converged = self.calc_x_step(prev_params, curr_params, features_updated) - epoch_step_converged = not_converged_prev & x_step_converged - - """ - In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. - For now it must not be below the threshold for the X step of the loc model. - """ - - if hasattr(optimizer_object, 'trusted_region_mode') \ - and optimizer_object.trusted_region_mode: - converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC - if hasattr(optimizer_object, 'tr_radius_b') and self._train_scale: - converged_tr &= \ - optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE - epoch_tr_converged = not_converged_prev & converged_tr - epoch_step_converged |= epoch_tr_converged + def update_params(self, batches, results, batch_features, is_batched, update_func): + if self.irls_algo or self.nr_algo: + if self.irls_algo: + + update_func( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=is_batched + ) + if self._train_scale: + update_func( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=is_batched + ) + else: + update_func( + inputs=[batches, *results], + batch_features=batch_features, + is_batched=is_batched + ) + else: - total_converged |= epoch_step_converged - return total_converged, epoch_ll_converged, epoch_grad_converged, epoch_step_converged + update_var = results[1] + update_func([(update_var, self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged def get_optimizer_object(self, optimizer: str, learning_rate): """ @@ -465,42 +313,3 @@ def get_init_from_model(init_a, init_b, input_data, init_model): @abc.abstractmethod def get_model_container(self, input_data): pass - - def calc_x_step(self, prev_params, curr_params, features_updated): - - def get_norm_converged(model: str, prev_params): - if model == 'loc': - idx_train = self.model.model_vars.idx_train_loc - xtol = pkg_constants.XTOL_BY_FEATURE_LOC - elif model == 'scale': - idx_train = self.model.model_vars.idx_train_scale - xtol = pkg_constants.XTOL_BY_FEATURE_SCALE - else: - assert False, "Supply either 'loc' or 'scale'!" 
- x_step = curr_params - prev_params - x_norm = np.sqrt(np.sum(np.square(x_step[idx_train, :]), axis=0)) - return x_norm < xtol - - """ - We use a trick here: First we set both the loc and scale convergence to True. - It is not necessary to use an array with length = number of features, since bitwise - AND also works with a single boolean. - """ - loc_conv = np.bool_(True) - scale_conv = np.bool_(True) - - """ - Now we check which models need to be trained. E.g. if you are using quick_scale = True, - self._train_scale will be False and so the above single True value will be used. - """ - if self._train_loc: - loc_conv = get_norm_converged('loc', prev_params) - if self._train_scale: - scale_conv = get_norm_converged('scale', prev_params) - - """ - Finally, we check that only features updated in this epoch can evaluate to True. - This is only a problem for 2nd order optims with trusted region mode, since it might occur, - that a feature isn't updated, so the x_step is zero although not yet converged. - """ - return loc_conv & scale_conv & features_updated diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py index 40a00c68..c3ae70e8 100644 --- a/batchglm/train/tf2/base_glm/generator.py +++ b/batchglm/train/tf2/base_glm/generator.py @@ -56,7 +56,7 @@ def _generate(self): def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): """Takes an element of a dataset, performs featurewise batching and returns the reduced element.""" - not_converged = np.negative(self.estimator.model.model_vars.total_converged) + not_converged = ~self.estimator.model.model_vars.total_converged if self.sparse: feature_columns = tf.sparse.split( x_tensor, diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 7e0d9ad1..b72887d5 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -71,7 +71,7 @@ def _call_parameters(self, inputs, keep_previous_params_copy=True): if not keep_previous_params_copy: if self.batch_features: self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, - mask=tf.logical_not(self.model_vars.converged), + mask=tf.logical_not(self.model_vars.total_converged), axis=1), trainable=True) else: self.params_copy = self.params diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 68effa2e..a1c7a88f 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -158,13 +158,13 @@ def _trust_region_ops( increase_radius = update_theta if batch_features: n_features = self.model.model_vars.n_features - indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) decrease_radius = tf.scatter_nd(indices, decrease_radius, shape=(n_features,)) increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) update_theta = increase_radius if compute_b and not compute_a: - self.model.model_vars.updated &= update_theta.numpy() + self.model.model_vars.updated_b = update_theta.numpy() else: self.model.model_vars.updated = update_theta.numpy() @@ -343,7 +343,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.converged)) + mask=tf.logical_not(self.model.model_vars.total_converged)) else: radius_container = self.tr_radius tr_proposed_vector = 
self._trust_region_update( @@ -375,7 +375,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch else: if batch_features: - indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) update_var = tf.transpose( tf.scatter_nd( indices, @@ -494,7 +494,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) if batch_features: - indices = tf.where(tf.logical_not(self.model.model_vars.converged)) + indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) update_var = tf.transpose( tf.scatter_nd( indices, @@ -513,7 +513,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.converged)) + mask=tf.logical_not(self.model.model_vars.total_converged)) else: radius_container = self.tr_radius tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( @@ -530,7 +530,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=radius_container, - mask=tf.logical_not(self.model.model_vars.converged)) + mask=tf.logical_not(self.model.model_vars.total_converged)) tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( update_b, radius_container, self.gd, jac_b, fim_b) @@ -543,7 +543,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.converged)) + mask=tf.logical_not(self.model.model_vars.total_converged)) else: radius_container = self.tr_radius # here train_r is False AND train_mu is true, so the output of the function can directly be applied to diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py index 4b0debca..9365f67c 100644 --- a/batchglm/train/tf2/base_glm/vars.py +++ b/batchglm/train/tf2/base_glm/vars.py @@ -70,7 +70,11 @@ def __init__( # Properties to follow gene-wise convergence. self.updated = np.repeat(a=True, repeats=self.params.shape[1]) # Initialise to is updated. + self.updated_b = np.repeat(a=True, repeats=self.params.shape[1]) # Initialise to is updated. self.converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + self.converged_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. + + self.total_converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. self.dtype = dtype self.n_features = self.params.shape[1] From 54a52509adbd2a26cc0fa12819ae7e0a43759348 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 24 Mar 2020 22:08:28 +0100 Subject: [PATCH 073/124] documentation for train provided in code. 
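For orientation before the documented train loop below: the per-feature convergence bookkeeping that the previous commit moves into the ConvergenceCalculator combines a likelihood criterion, a gradient-norm criterion and a parameter-step criterion, each gated by whether a feature was actually updated in the step. A compact sketch of that aggregation (names and gating simplified for illustration; the real code also keeps the per-criterion flags for logging and takes its thresholds from pkg_constants):

def aggregate_convergence(total_converged, ll_conv, grad_conv, step_conv, features_updated):
    # all arguments are boolean arrays of shape (n_features,)
    # a feature can only newly converge if it was not converged before and was updated this step
    newly_converged = ~total_converged & (ll_conv | grad_conv | step_conv) & features_updated
    return total_converged | newly_converged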
--- batchglm/train/tf2/base_glm/estimator.py | 140 ++++++++++++++--------- 1 file changed, 87 insertions(+), 53 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 6b04103e..d6c40248 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -22,6 +22,8 @@ class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): _train_scale: bool _initialized: bool = False noise_model: str + irls_algo: bool = False + nr_algo: bool = False def initialize(self, **kwargs): self.values = [] @@ -57,7 +59,7 @@ def __init__( def _train( self, noise_model: str, - is_batched: bool = True, + is_batched: bool = False, batch_size: int = 5000, optimizer_object: tf.keras.optimizers.Optimizer = tf.keras.optimizers.Adam(), convergence_criteria: str = "step", @@ -67,17 +69,22 @@ def _train( benchmark: bool = False, optim_algo: str = "adam" ): + # define some useful shortcuts here n_obs = self.input_data.num_observations n_features = self.input_data.num_features - conv_all = lambda x, y: x < n_features - conv_step = lambda x, y: x < n_features and y < stopping_criteria + # set necessary attributes + self.noise_model = noise_model + self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] + self.nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] + + ################################################ + # INIT Step 1: Consistency Checks + #### + assert not is_batched, "The TF2 backend does not yet support updates on individual" \ + "batches. Use full data updates instead." assert convergence_criteria in ["step", "all_converged"], \ ("Unrecognized convergence criteria %s", convergence_criteria) - convergence_decision = conv_step if convergence_criteria == "step" else conv_all - - if batch_size > n_obs: - batch_size = n_obs if not self._initialized: raise RuntimeError("Cannot train the model: Estimator not initialized. \ Did you forget to call estimator.initialize() ?") @@ -87,63 +94,82 @@ def _train( "Automatic differentiation is currently not supported for hessians. Falling back \ to closed form. Only Jacobians are calculated using autograd.") - self.noise_model = noise_model - - batch_features = False - - self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] - self.nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] if featurewise and not (self.irls_algo or self.nr_algo): featurewise = False logger.warning("WARNING: 'Featurewise batching' is only available for 2nd order " "optimizers IRLS and NR. Fallback to full featurespace fitting.") + if batch_size > n_obs: + batch_size = n_obs + ################################################ + # INIT Step 2: Intialise training loop. + # update_func = optimizer_object.perform_parameter_update \ if self.irls_algo or self.nr_algo else optimizer_object.apply_gradients - train_step = 0 - # create a tensorflow dataset using the DataGenerator datagenerator = DataGenerator(self, noise_model, is_batched, batch_size) epoch_set = datagenerator.new_epoch_set() + # first model call to initialise prior to first update. 
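The loop that follows, and the analogous loops later in _train, accumulate full-data results by summing the per-batch model outputs elementwise. The idiom in isolation (a sketch only; results[k] ends up as the sum over all batches of the k-th model output, i.e. log-likelihood, jacobian blocks and hessian/FIM blocks):

results = None
for x_batch in epoch_set:
    batch_results = self.model(x_batch)
    results = list(batch_results) if results is None else \
        [tf.math.add(acc, new) for acc, new in zip(results, batch_results)]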
for i, x_batch in enumerate(epoch_set): - current_results = self.model(x_batch) - if is_batched or i == 0: - results = current_results + if i == 0: + results = self.model(x_batch) else: - results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] - # create a ConvergenceCalculator Object + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + + # create ConvergenceCalculator to check for new convergences. conv_calc = ConvergenceCalculator(self, tf.negative(tf.divide(results[0], n_obs)).numpy()) + # termination decision for training loop + def convergence_decision(num_converged, train_step): + not_done_fitting = num_converged < n_features + if convergence_criteria == "step": + not_done_fitting &= train_step < stopping_criteria + return not_done_fitting + + # condition variables neede during while loop + batch_features = False + train_step = 0 num_converged = 0 num_converged_prev = 0 need_new_epoch_set = False n_conv_last_featurewise_batch = 0 + ################################################ + # Training Loop: Model Fitting happens here. + #### + while convergence_decision(num_converged, train_step): if benchmark: t0_epoch = time.time() + ############################################ + # 1. recalculate, only done if featurewise if need_new_epoch_set: + # this is executed only if a new feature converged in the last train step and + # using featurewise. epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) for i, x_batch in enumerate(epoch_set): - current_results = self.model(x_batch, keep_previous_params_copy=not need_new_epoch_set) - if is_batched or i == 0: - results = current_results + if i == 0: + # need new params_copy in model due to new convergence in first model call + # in order to resume calculation in smaller feature space. + results = self.model(x_batch, keep_previous_params_copy=False) else: - results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] - self.update_params(x_batch if is_batched else epoch_set, results, batch_features, is_batched, update_func) - # converged_current, converged_f, converged_g, converged_x = convergences + ############################################ + # 2. Update the parameters + self.update_params(epoch_set, results, batch_features, update_func) + ############################################ + # 3. calculate new ll, jacs, hessian/fim for i, x_batch in enumerate(epoch_set): - current_results = self.model(x_batch) - if is_batched or i == 0: - results = current_results - else: - results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + results = self.model(x_batch) if i == 0 \ + else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + ############################################ + # 4. 
check for any new convergences convergences = conv_calc.calculate_convergence( results=results, jac_normalization=batch_size if is_batched else n_obs, @@ -154,13 +180,16 @@ def _train( num_converged = np.sum(self.model.model_vars.total_converged) loss = conv_calc.getLoss() if self.irls_algo and self._train_scale: - num_updated = np.sum(np.logical_or(self.model.model_vars.updated, self.model.model_vars.updated_b)) + num_updated = np.sum( + np.logical_or(self.model.model_vars.updated, self.model.model_vars.updated_b)) else: num_updated = np.sum(self.model.model_vars.updated) log_output = f"Step: {train_step} loss: {loss}, "\ f"converged {num_converged}, updated {num_updated}" num_converged_prev = conv_calc.getPreviousNumberConverged() + ############################################ + # 5. report any new convergences if num_converged == num_converged_prev: need_new_epoch_set = False logger.warning(log_output) @@ -170,18 +199,24 @@ def _train( batch_features = True self.model.batch_features = batch_features conv_diff = num_converged - n_conv_last_featurewise_batch + + # Update params if number of new convergences since last + # featurewise batch is reached again. if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: need_new_epoch_set = True n_conv_last_featurewise_batch = num_converged self.model.params.assign( - tf.where(self.model.model_vars.total_converged, self.model.params, conv_calc.last_params) - ) + tf.where( + self.model.model_vars.total_converged, + self.model.params, conv_calc.last_params)) sums = [np.sum(convergence_vals) for convergence_vals in convergences] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ f"x_step: {sums[2]}" logger.warning(log_output) + train_step += 1 + # store some useful stuff for benchmarking purposes. if benchmark: t1_epoch = time.time() self.times.append(t1_epoch-t0_epoch) @@ -189,21 +224,25 @@ def _train( self.values.append(self.model.trainable_variables[0].numpy().copy()) self.lls.append(conv_calc.last_ll) - # Evaluate final params + ################################################ + # Final model call on the full feature space. + #### logger.warning("Final Evaluation run.") self.model.batch_features = False batch_features = False # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv self.model.setMethod('nr_tr') - self.model.hessian.compute_b = True + self.model.hessian.compute_b = True # since self._train_scale could be False. + + # need new set here with full feature space. final_set = datagenerator.new_epoch_set() - for i, x_batch_tuple in enumerate(final_set): - current_results = self.model(x_batch_tuple, keep_previous_params_copy=False) + for i, x_batch in enumerate(final_set): if i == 0: - results = current_results + results = self.model(x_batch, keep_previous_params_copy=False) else: - results = [tf.math.add(results[i], x) for i, x in enumerate(current_results)] + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + # store all the final results in this estimator instance. 
self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) @@ -211,20 +250,20 @@ def _train( self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._hessian = -results[2].numpy() - self.model.hessian.compute_b = self.model.compute_b + self.model.hessian.compute_b = self.model.compute_b # reset if self._train_scale == False self.model.batch_features = batch_features - batch_features = True - def update_params(self, batches, results, batch_features, is_batched, update_func): + def update_params(self, batches, results, batch_features, update_func): + """Wrapper method to conduct updates based on different optimizers/conditions.""" if self.irls_algo or self.nr_algo: if self.irls_algo: - + # separate loc and scale update if using IRLS. update_func( inputs=[batches, *results], compute_a=True, compute_b=False, batch_features=batch_features, - is_batched=is_batched + is_batched=False ) if self._train_scale: update_func( @@ -232,27 +271,22 @@ def update_params(self, batches, results, batch_features, is_batched, update_fun compute_a=False, compute_b=True, batch_features=batch_features, - is_batched=is_batched + is_batched=False ) else: update_func( inputs=[batches, *results], batch_features=batch_features, - is_batched=is_batched + is_batched=False ) else: - update_var = results[1] update_func([(update_var, self.model.params_copy)]) self.model.model_vars.updated = ~self.model.model_vars.converged def get_optimizer_object(self, optimizer: str, learning_rate): - """ - Creates an optimizer object based on the given optimizer string. - """ - + """ Creates an optimizer object based on the given optimizer string.""" optimizer = optimizer.lower() - if optimizer == "gd": optim_obj = tf.keras.optimizers.SGD(learning_rate=learning_rate) elif optimizer == "adam": From 64186afe31951da2a59ccd8695d1009c588bf34d Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 25 Mar 2020 19:14:22 +0100 Subject: [PATCH 074/124] reworked spare featurewise batching method --- batchglm/train/tf2/base_glm/generator.py | 47 +++++++++++++++++++----- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py index c3ae70e8..84fc1e1e 100644 --- a/batchglm/train/tf2/base_glm/generator.py +++ b/batchglm/train/tf2/base_glm/generator.py @@ -1,7 +1,7 @@ import numpy as np from scipy.sparse import csr_matrix import tensorflow as tf - +import time class DataGenerator: """Wrapper Object to generate an iterable TensorFlow Dataset from given input data.""" @@ -26,10 +26,6 @@ def __init__( output_types = ((tf.int64, dtp, tf.int64), *(dtp,) * 3) if self.sparse else (dtp,) * 4 self.dataset = tf.data.Dataset.from_generator( generator=self._generate, output_types=output_types) - if self.sparse: - self.dataset = self.dataset.map( - lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) - ) def _generate(self): """ @@ -53,10 +49,32 @@ def _generate(self): size_factors = input_data.fetch_size_factors(idx) if fetch_size_factors else 1 yield counts, dloc, dscale, size_factors + def _featurewise_batch_sparse(self, ivs_tuple, dloc, dscale, size_factors): + + not_converged = ~self.estimator.model.model_vars.total_converged + + not_converged_numeric = tf.cast(not_converged, dtype=tf.int64) + col_idx_map = tf.cumsum(not_converged_numeric, exclusive=True) + ivs_tuple[0].set_shape([None, 2]) + mask = tf.gather(not_converged, ivs_tuple[0][:, 1]) + new_indices = tf.gather_nd(ivs_tuple[0], 
tf.where(mask)) + row_idx, col_idx = tf.split(new_indices, num_or_size_splits=2, axis=1, num=2) + + new_indices = tf.concat([row_idx, tf.gather(col_idx_map, col_idx)], 1) + new_values = tf.boolean_mask(ivs_tuple[1], mask) + n_features = col_idx_map[-1] + if not_converged[-1]: + n_features += 1 + + x_tensor = (new_indices, new_values, (ivs_tuple[2][0], n_features)) + return x_tensor, dloc, dscale, size_factors + def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): """Takes an element of a dataset, performs featurewise batching and returns the reduced element.""" + not_converged = ~self.estimator.model.model_vars.total_converged + """ if self.sparse: feature_columns = tf.sparse.split( x_tensor, @@ -65,14 +83,23 @@ def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): not_converged_idx = np.where(not_converged)[0] feature_columns = [feature_columns[i] for i in not_converged_idx] x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) - else: - x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + """ + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) return x_tensor, dloc, dscale, size_factors def new_epoch_set(self, batch_features: bool = False): """Returns an iterable TensorFlow Dataset of the input data.""" dataset_to_return = self.dataset.take(self.num_batches) - if batch_features: - return dataset_to_return.map(self._featurewise_batch).cache().prefetch(1) - return self.dataset.take(self.num_batches).cache().prefetch(1) + + if self.sparse: + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch_sparse) + dataset_to_return = dataset_to_return.map( + lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) + ) + else: + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) + + return dataset_to_return.cache().prefetch(1) From 14fa78bd5b9f0c24bc1f49b2d788eec25d23dbc3 Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 26 Mar 2020 00:00:58 +0100 Subject: [PATCH 075/124] stepwsie featurewise batching implemented --- batchglm/pkg_constants.py | 2 +- batchglm/train/tf2/base_glm/convergence.py | 7 +++---- batchglm/train/tf2/base_glm/estimator.py | 6 ++++-- batchglm/train/tf2/base_glm/generator.py | 4 ++-- batchglm/train/tf2/base_glm/model.py | 6 +++--- batchglm/train/tf2/base_glm/optim.py | 14 +++++++------- batchglm/train/tf2/base_glm/vars.py | 2 +- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index d3147b25..8a1200a5 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -32,7 +32,7 @@ TRTOL_BY_FEATURE_LOC = 1e-12 TRTOL_BY_FEATURE_SCALE = 1e-12 -FEATUREWISE_THRESHOLD = 1 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch try: import tensorflow as tf diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py index c0df5608..77734bd6 100644 --- a/batchglm/train/tf2/base_glm/convergence.py +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -14,7 +14,6 @@ def __init__(self, estimator, last_ll: np.ndarray): self.previous_number_converged = 0 self.calc_separated = self.estimator.irls_algo and self.estimator._train_scale - def calculate_convergence(self, results, jac_normalization, optimizer_object, batch_features): """Calculates convergence based on change in likelihood, 
gradient and parameters.""" @@ -42,11 +41,11 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba if batch_features: # map columns of ll to full feature space - not_conv = not_converged_a if not self.calc_separated else ~self.estimator.model.model_vars.total_converged - indices = tf.where(not_conv) + remaining_features = self.estimator.model.model_vars.remaining_features + indices = tf.where(remaining_features) updated_lls = tf.scatter_nd(indices, new_ll, shape=[n_features]) # fill the added columns with previous ll - new_ll = tf.where(not_conv, updated_lls, self.last_ll) + new_ll = tf.where(remaining_features, updated_lls, self.last_ll) # fill added columns with the gradients from previous runs. grad_numpy = tf.scatter_nd( diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index d6c40248..08da88d7 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -207,8 +207,10 @@ def convergence_decision(num_converged, train_step): n_conv_last_featurewise_batch = num_converged self.model.params.assign( tf.where( - self.model.model_vars.total_converged, - self.model.params, conv_calc.last_params)) + self.model.model_vars.remaining_features, + conv_calc.last_params, self.model.params)) + self.model.model_vars.remaining_features = \ + ~self.model.model_vars.total_converged sums = [np.sum(convergence_vals) for convergence_vals in convergences] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py index 84fc1e1e..97349423 100644 --- a/batchglm/train/tf2/base_glm/generator.py +++ b/batchglm/train/tf2/base_glm/generator.py @@ -51,7 +51,7 @@ def _generate(self): def _featurewise_batch_sparse(self, ivs_tuple, dloc, dscale, size_factors): - not_converged = ~self.estimator.model.model_vars.total_converged + not_converged = self.estimator.model.model_vars.remaining_features not_converged_numeric = tf.cast(not_converged, dtype=tf.int64) col_idx_map = tf.cumsum(not_converged_numeric, exclusive=True) @@ -73,7 +73,7 @@ def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): """Takes an element of a dataset, performs featurewise batching and returns the reduced element.""" - not_converged = ~self.estimator.model.model_vars.total_converged + not_converged = self.estimator.model.model_vars.remaining_features """ if self.sparse: feature_columns = tf.sparse.split( diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index b72887d5..a8683395 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -70,9 +70,9 @@ def setMethod(self, optimizer): def _call_parameters(self, inputs, keep_previous_params_copy=True): if not keep_previous_params_copy: if self.batch_features: - self.params_copy = tf.Variable(tf.boolean_mask(tensor=self.params, - mask=tf.logical_not(self.model_vars.total_converged), - axis=1), trainable=True) + self.params_copy = tf.Variable( + tf.boolean_mask(tensor=self.params, mask=self.model_vars.remaining_features, axis=1), + trainable=True) else: self.params_copy = self.params diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index a1c7a88f..38aa7310 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -158,7 +158,7 @@ def _trust_region_ops( increase_radius = update_theta if batch_features: n_features = 
self.model.model_vars.n_features - indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) + indices = tf.where(self.model.model_vars.remaining_features) decrease_radius = tf.scatter_nd(indices, decrease_radius, shape=(n_features,)) increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) update_theta = increase_radius @@ -343,7 +343,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.total_converged)) + mask=self.model.model_vars.remaining_features) else: radius_container = self.tr_radius tr_proposed_vector = self._trust_region_update( @@ -375,7 +375,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch else: if batch_features: - indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) + indices = tf.where(self.model.model_vars.remaining_features) update_var = tf.transpose( tf.scatter_nd( indices, @@ -494,7 +494,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) if batch_features: - indices = tf.where(tf.logical_not(self.model.model_vars.total_converged)) + indices = tf.where(self.model.model_vars.remaining_features) update_var = tf.transpose( tf.scatter_nd( indices, @@ -513,7 +513,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.total_converged)) + mask=self.model.model_vars.remaining_features) else: radius_container = self.tr_radius tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( @@ -530,7 +530,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=radius_container, - mask=tf.logical_not(self.model.model_vars.total_converged)) + mask=self.model.model_vars.remaining_features) tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( update_b, radius_container, self.gd, jac_b, fim_b) @@ -543,7 +543,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, - mask=tf.logical_not(self.model.model_vars.total_converged)) + mask=self.model.model_vars.remaining_features) else: radius_container = self.tr_radius # here train_r is False AND train_mu is true, so the output of the function can directly be applied to diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py index 9365f67c..f3d5bcce 100644 --- a/batchglm/train/tf2/base_glm/vars.py +++ b/batchglm/train/tf2/base_glm/vars.py @@ -75,7 +75,7 @@ def __init__( self.converged_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. self.total_converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. 
- + self.remaining_features = np.repeat(a=True, repeats=self.params.shape[1]) self.dtype = dtype self.n_features = self.params.shape[1] self.idx_train_loc = np.arange(0, init_a.shape[0]) From 30699bf0654e38752448253b14c749631700dc99 Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 26 Mar 2020 10:36:53 +0100 Subject: [PATCH 076/124] featurewise_threshold set to 100 --- batchglm/pkg_constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 8a1200a5..5fde5c2e 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -32,7 +32,7 @@ TRTOL_BY_FEATURE_LOC = 1e-12 TRTOL_BY_FEATURE_SCALE = 1e-12 -FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_THRESHOLD = 100 # the minimal number of features to converge before next featurewise batch try: import tensorflow as tf From 0508ef7f004ddb75417f0db8c94c2aceca4c684d Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 26 Mar 2020 10:37:13 +0100 Subject: [PATCH 077/124] bugfix: new epoch set but threhsold not reached --- batchglm/train/tf2/base_glm/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 08da88d7..4ce331a3 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -147,6 +147,7 @@ def convergence_decision(num_converged, train_step): ############################################ # 1. recalculate, only done if featurewise if need_new_epoch_set: + need_new_epoch_set = False # this is executed only if a new feature converged in the last train step and # using featurewise. epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) @@ -191,7 +192,6 @@ def convergence_decision(num_converged, train_step): ############################################ # 5. report any new convergences if num_converged == num_converged_prev: - need_new_epoch_set = False logger.warning(log_output) else: if featurewise: From 2b35bacba1670445c0b023ec95b63bc3d25c040d Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 27 Mar 2020 23:39:17 +0100 Subject: [PATCH 078/124] documentation/cleanup new methods for featurewise --- batchglm/train/tf2/base_glm/model.py | 80 ++++++++++++++++++---------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index a8683395..b28f321d 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -51,12 +51,15 @@ def __init__( self.fim = fim self.use_gradient_tape = use_gradient_tape self.params_copy = self.params - self.batch_features = False self.setMethod(optimizer) - def setMethod(self, optimizer): - + def setMethod(self, optimizer: str): + """ + Determines which function is executed to calculate and return the desired outputs when + calling the model. The internal function is chosen based on the given optimizer. It will + through an AssertionError if the optimizer is not understood. 
+ """ optimizer = optimizer.lower() if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: self._calc = self._return_jacobians @@ -66,16 +69,34 @@ def setMethod(self, optimizer): elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr']: self._calc = self._calc_fim + else: + assert False, ("Unrecognized optimizer: %s", optimizer) - def _call_parameters(self, inputs, keep_previous_params_copy=True): - if not keep_previous_params_copy: - if self.batch_features: - self.params_copy = tf.Variable( - tf.boolean_mask(tensor=self.params, mask=self.model_vars.remaining_features, axis=1), - trainable=True) - else: - self.params_copy = self.params + def featurewise_batch(self): + """ + Applies a boolean mask over the feature dimension of the parameter matrix by removing + some feature columns (e.g. to exclude converged parameters) determined by the + `remaining_features` vector in `model_vars`. This method must be called after each + featurewise batch event to ensure the feature dimension of the input tensors matches the + feature dimension of `params_copy` in the following model call. + """ + self.params_copy = tf.Variable( + tf.boolean_mask(tensor=self.params, mask=self.model_vars.remaining_features, axis=1), + trainable=True) + def apply_featurewise_updates(self, full_params_copy: tf.Tensor): + """ + Applies featurewise updates stored in `params_copy` on `params`. Feature columns in + `params` corresponding to remaining feature columns in `params_copy` are overwritten with + the new values while the others (corresponding to features with converged parameters) are + retained. This method must be called after each featurewise batch event to ensure that the + updates stored in `params_copy` aren't lost when deriving a new `params_copy` from `params` + in the following model calls using `featurewise_batch()`. + """ + self.params.assign( + tf.where(self.model_vars.remaining_features, full_params_copy, self.params)) + + def _call_parameters(self, inputs): design_loc, design_scale, size_factors = inputs a_var, b_var = self.unpack_params([self.params_copy, self.model_vars.a_var.get_shape()[0]]) eta_loc = self.linear_loc([a_var, design_loc, self.model_vars.constraints_loc, size_factors]) @@ -84,16 +105,20 @@ def _call_parameters(self, inputs, keep_previous_params_copy=True): scale = self.linker_scale(eta_scale) return eta_loc, eta_scale, loc, scale, a_var, b_var - def calc_ll(self, inputs, keep_previous_params_copy=True): - parameters = self._call_parameters(inputs[1:], keep_previous_params_copy) + def calc_ll(self, inputs): + """ + Calculates the log probabilities, summed up per feature column and returns it together with + loc, scale, a_var, and b_var (forwarding results from `_call_parameters`). + """ + parameters = self._call_parameters(inputs[1:]) log_probs = self.likelihood([*parameters[:-2], inputs[0]]) log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) - def _return_jacobians(self, inputs, keep_previous_params_copy=True): - return self._calc_jacobians(inputs, keep_previous_params_copy=keep_previous_params_copy)[-2:] + def _return_jacobians(self, inputs): + return self._calc_jacobians(inputs)[-2:] - def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_params_copy=True): + def _calc_jacobians(self, inputs, concat=True, transpose=True): """ calculates jacobian. 
@@ -109,7 +134,7 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_par """ with tf.GradientTape(persistent=True) as g: - log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs, keep_previous_params_copy) + log_probs, loc, scale, a_var, b_var = self.calc_ll(inputs) if self.use_gradient_tape: @@ -154,12 +179,9 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True, keep_previous_par return loc, scale, log_probs, tf.negative(jacobians) return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) - def _calc_hessians(self, inputs, keep_previous_params_copy=True): + def _calc_hessians(self, inputs): # with tf.GradientTape(persistent=True) as g2: - loc, scale, log_probs, jacobians = self._calc_jacobians( - inputs, - keep_previous_params_copy=keep_previous_params_copy, - transpose=False) + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, transpose=False) ''' autograd not yet working. TODO: Search error in the following code: @@ -190,17 +212,21 @@ def _calc_hessians(self, inputs, keep_previous_params_copy=True): hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) return log_probs, jacobians, hessians - def _calc_fim(self, inputs, keep_previous_params_copy=True): + def _calc_fim(self, inputs): loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians( inputs, concat=False, - transpose=False, - keep_previous_params_copy=keep_previous_params_copy) + transpose=False) fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) return log_probs, jac_a, jac_b, fim_a, fim_b - def call(self, inputs, keep_previous_params_copy=True): - return self._calc(inputs, keep_previous_params_copy) + def call(self, inputs): + """ + Wrapper method to call this model. Depending on the desired calculations specified by the + `optimizer` arg to `__init__`, it will forward the call to the necessary function to perform + the right calculations and return all the results. + """ + return self._calc(inputs) class LossGLM(LossBase): From 55190f42fb85027248545a29d8e8e10180a6ca80 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 27 Mar 2020 23:41:32 +0100 Subject: [PATCH 079/124] implemented featurewise without 2nd model call --- batchglm/train/tf2/base_glm/estimator.py | 70 ++++++++++++++++-------- 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4ce331a3..4d683cc6 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -151,13 +151,10 @@ def convergence_decision(num_converged, train_step): # this is executed only if a new feature converged in the last train step and # using featurewise. epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) - for i, x_batch in enumerate(epoch_set): - if i == 0: - # need new params_copy in model due to new convergence in first model call - # in order to resume calculation in smaller feature space. - results = self.model(x_batch, keep_previous_params_copy=False) - else: - results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + if pkg_constants.FEATUREWISE_RECALCULATE: + for i, x_batch in enumerate(epoch_set): + results = self.model(x_batch) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] ############################################ # 2. Update the parameters @@ -166,6 +163,7 @@ def convergence_decision(num_converged, train_step): ############################################ # 3. 
calculate new ll, jacs, hessian/fim for i, x_batch in enumerate(epoch_set): + # need new params_copy in model in case we use featurewise without recalculation results = self.model(x_batch) if i == 0 \ else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] @@ -205,12 +203,12 @@ def convergence_decision(num_converged, train_step): if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: need_new_epoch_set = True n_conv_last_featurewise_batch = num_converged - self.model.params.assign( - tf.where( - self.model.model_vars.remaining_features, - conv_calc.last_params, self.model.params)) + self.model.apply_featurewise_updates(conv_calc.last_params) + if not pkg_constants.FEATUREWISE_RECALCULATE: + results = self.mask_unconverged(results) self.model.model_vars.remaining_features = \ ~self.model.model_vars.total_converged + self.model.featurewise_batch() sums = [np.sum(convergence_vals) for convergence_vals in convergences] log_output = f"{log_output} logs: {sums[0]} grad: {sums[1]}, "\ @@ -230,30 +228,36 @@ def convergence_decision(num_converged, train_step): # Final model call on the full feature space. #### logger.warning("Final Evaluation run.") - self.model.batch_features = False - batch_features = False + if batch_features: + # need to update `model.params` if conv_diff wasn't reached in last train step + # as updates since the last featurewise batch are not yet applied in that case. + if np.any(self.model.model_vars.remaining_features): + self.model.apply_featurewise_updates(conv_calc.last_params) + # now make sure we use the full feature space for the last update + self.model.model_vars.remaining_features = np.ones(n_features, dtype=np.bool) + self.model.featurewise_batch() + + batch_features = False # reset in case train is run repeatedly # change to hessian mode since we still use hessian instead of FIM for self._fisher_inv - self.model.setMethod('nr_tr') + self.model.setMethod('nr_tr') # TODO: maybe stay with irls to compute fim in the future self.model.hessian.compute_b = True # since self._train_scale could be False. - # need new set here with full feature space. + # need new set here with full feature space + # TODO: only ineeded if batch_features, maybe put this in the above if switch later final_set = datagenerator.new_epoch_set() for i, x_batch in enumerate(final_set): - if i == 0: - results = self.model(x_batch, keep_previous_params_copy=False) - else: - results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + results = self.model(x_batch) if i == 0 else \ + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] # store all the final results in this estimator instance. self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) - # TODO: maybe report fisher inf here in the future. + # TODO: maybe report fisher inf here in the future instead of inverted hessian. 
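For context on the quantity stored just below: results[2] holds the negative Hessian accumulated over the full data set, so tf.linalg.inv(results[2]) is the usual observed-information approximation of the parameter covariance (hence the attribute name _fisher_inv), and negating it recovers the Hessian itself. If per-coefficient standard errors were wanted, they could be read off its diagonal; a hedged sketch, assuming results[2] has shape (n_features, n_coefs, n_coefs):

neg_hessian = results[2]                                  # batched over features
param_cov = tf.linalg.inv(neg_hessian)                    # approx. covariance per feature
std_errors = tf.sqrt(tf.linalg.diag_part(param_cov))      # shape (n_features, n_coefs)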
self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._hessian = -results[2].numpy() - self.model.hessian.compute_b = self.model.compute_b # reset if self._train_scale == False - self.model.batch_features = batch_features + self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale def update_params(self, batches, results, batch_features, update_func): """Wrapper method to conduct updates based on different optimizers/conditions.""" @@ -286,6 +290,28 @@ def update_params(self, batches, results, batch_features, update_func): update_func([(update_var, self.model.params_copy)]) self.model.model_vars.updated = ~self.model.model_vars.converged + def mask_unconverged(self, results): + + # the idx from unconverged features, thus features reamining in the curent results + idx = np.where(self.model.model_vars.remaining_features)[0] + # the new remaining_features in reduced feature space + mask = ~(self.model.model_vars.total_converged[idx]) + + ll = tf.boolean_mask(results[0], mask) + if self.irls_algo: + jac_a = tf.boolean_mask(results[1], mask) + jac_b = tf.boolean_mask(results[2], mask) + fim_a = tf.boolean_mask(results[3], mask) + fim_b = tf.boolean_mask(results[4], mask) + return ll, jac_a, jac_b, fim_a, fim_b + elif self.nr_algo: + jac = tf.boolean_mask(results[1], mask) + hessian = tf.boolean_mask(results[2], mask) + return ll, jac, hessian + else: + jac = tf.boolean_mask(results[1], mask, axis=1) + return ll, jac + def get_optimizer_object(self, optimizer: str, learning_rate): """ Creates an optimizer object based on the given optimizer string.""" optimizer = optimizer.lower() From c2fe15090b35feb5ca7f1479698407ce256271b2 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 27 Mar 2020 23:42:26 +0100 Subject: [PATCH 080/124] added boolean for recalc model call if featurewise --- batchglm/pkg_constants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 5fde5c2e..69cdbb97 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -32,7 +32,8 @@ TRTOL_BY_FEATURE_LOC = 1e-12 TRTOL_BY_FEATURE_SCALE = 1e-12 -FEATUREWISE_THRESHOLD = 100 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_THRESHOLD = 1 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step try: import tensorflow as tf From 493e20cbb4e8f674e8ec24442ed3404f78c431b8 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 31 Mar 2020 19:54:22 +0200 Subject: [PATCH 081/124] started implementing linesearch --- batchglm/pkg_constants.py | 2 +- batchglm/train/tf2/base_glm/__init__.py | 3 + batchglm/train/tf2/base_glm/estimator.py | 15 +- batchglm/train/tf2/base_glm/optim.py | 136 +++++-------- batchglm/train/tf2/glm_nb/estimator.py | 44 ++++- batchglm/train/tf2/glm_nb/external.py | 4 + batchglm/train/tf2/glm_nb/optim.py | 184 ++++++++++++++++++ .../train/tf2/glm_nb/training_strategies.py | 7 + 8 files changed, 297 insertions(+), 98 deletions(-) create mode 100644 batchglm/train/tf2/glm_nb/optim.py diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 69cdbb97..205d5bf6 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -32,7 +32,7 @@ TRTOL_BY_FEATURE_LOC = 1e-12 TRTOL_BY_FEATURE_SCALE = 1e-12 -FEATUREWISE_THRESHOLD = 1 # the minimal number of features to converge before next featurewise batch +FEATUREWISE_THRESHOLD = 10 # the 
minimal number of features to converge before next featurewise batch FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step try: diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py index f87c8915..08f4e7dd 100644 --- a/batchglm/train/tf2/base_glm/__init__.py +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -7,3 +7,6 @@ from .layers import LikelihoodGLM, UnpackParamsGLM from .layers_gradients import JacobianGLM, HessianGLM, FIMGLM from .optim import NR, IRLS + +from .generator import DataGenerator +from .convergence import ConvergenceCalculator diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4d683cc6..0ca7d8c0 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -24,6 +24,7 @@ class Estimator(TFEstimator, _EstimatorGLM, metaclass=abc.ABCMeta): noise_model: str irls_algo: bool = False nr_algo: bool = False + optimizer = None def initialize(self, **kwargs): self.values = [] @@ -33,6 +34,10 @@ def initialize(self, **kwargs): self._initialized = True self.model = None + def update(self, results, *args): + self.optimizer.apply_gradients([(results[1], self.model.params_copy)]) + self.model.model_vars.updated = ~self.model.model_vars.converged + def finalize(self, **kwargs): """ Evaluate all tensors that need to be exported from session, @@ -104,8 +109,6 @@ def _train( ################################################ # INIT Step 2: Intialise training loop. # - update_func = optimizer_object.perform_parameter_update \ - if self.irls_algo or self.nr_algo else optimizer_object.apply_gradients # create a tensorflow dataset using the DataGenerator datagenerator = DataGenerator(self, noise_model, is_batched, batch_size) @@ -158,7 +161,7 @@ def convergence_decision(num_converged, train_step): ############################################ # 2. Update the parameters - self.update_params(epoch_set, results, batch_features, update_func) + self.update(results, epoch_set, batch_features) ############################################ # 3. calculate new ll, jacs, hessian/fim @@ -197,10 +200,14 @@ def convergence_decision(num_converged, train_step): batch_features = True self.model.batch_features = batch_features conv_diff = num_converged - n_conv_last_featurewise_batch - + print(conv_diff) + if pkg_constants.FEATUREWISE_THRESHOLD < 1: + conv_diff /= n_features-n_conv_last_featurewise_batch + print(conv_diff) # Update params if number of new convergences since last # featurewise batch is reached again. 
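The check just below treats FEATUREWISE_THRESHOLD as an absolute count of newly converged features when it is >= 1, and as a fraction of the features that were still unconverged at the last featurewise batching event when it is < 1. The decision in isolation (a sketch with the debug print calls left out):

def trigger_featurewise_batch(num_converged, n_conv_last, n_features, threshold):
    conv_diff = num_converged - n_conv_last
    if threshold < 1:
        conv_diff /= (n_features - n_conv_last)
    return conv_diff >= threshold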
if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: + print(num_converged - n_conv_last_featurewise_batch, n_features-n_conv_last_featurewise_batch, conv_diff) need_new_epoch_set = True n_conv_last_featurewise_batch = num_converged self.model.apply_featurewise_updates(conv_calc.last_params) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 38aa7310..d2de2710 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -33,6 +33,11 @@ def _create_slots(self, var_list): self.add_slot(var_list[0], 'mu_r') + def gett1t2(self): + t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) + return t1, t2 + def _trust_region_ops( self, x_batches, @@ -45,22 +50,18 @@ def _trust_region_ops( is_batched ): # Load hyper-parameters: - #assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ + # assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ # "eta0 must be smaller than eta1" - #assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ + # assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ # "eta1 must be smaller than or equal to eta2" - #assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" - #assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" + # assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" + # assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" # Set trust region hyper-parameters eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) - eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) - eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) - if self.gd and compute_b: - t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) - t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) - else: - t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=self._dtype) - t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) + # eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) + # eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) + t1, t2 = self.gett1t2() + upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) # Phase I: Perform a trial update. @@ -203,7 +204,6 @@ def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf. super(SecondOrderOptim, self).__init__(name) self.model = model - self.gd = name in ['irls_gd', 'irls_gd_tr'] self._dtype = dtype self.n_obs = tf.cast(n_obs, dtype=self._dtype) self.trusted_region_mode = trusted_region_mode @@ -213,15 +213,12 @@ def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf. 
self.tr_radius = tf.Variable( np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, dtype=self._dtype, trainable=False) - if self.gd: - self.tr_radius_b = tf.Variable( - np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, - dtype=self._dtype, trainable=False) + else: self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) @abc.abstractmethod - def perform_parameter_update(self, inputs): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): pass def _newton_type_update(self, lhs, rhs, psd=False): @@ -265,8 +262,7 @@ def _pad_updates( def _trust_region_update( self, update_raw, - radius_container, - n_obs=None + radius_container ): update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) update_magnitude = tf.where( @@ -283,9 +279,9 @@ def _trust_region_update( y=tf.zeros_like(update_magnitude) ) update_norm = tf.multiply(update_raw, update_magnitude_inv) - # the following switch is for irls_gd_tr (linear instead of newton) - if n_obs is not None: - update_magnitude = update_magnitude / n_obs * radius_container + # the following method is for irls_gd_tr (linear instead of newton) + self.normalize_update_magnitude(update_magnitude) + update_scale = tf.minimum( radius_container, update_magnitude @@ -297,6 +293,9 @@ def _trust_region_update( return proposed_vector + def normalize_update_magnitude(self, update_magnitude): + return update_magnitude + def _trust_region_newton_cost_gain( self, proposed_vector, @@ -394,7 +393,6 @@ def _calc_proposed_vector_and_pred_cost_gain( self, update_x, radius_container, - gd, neg_jac_x, fim_x=None ): @@ -404,10 +402,6 @@ def _calc_proposed_vector_and_pred_cost_gain( :param radius_container: tf.tensor ? x ? TODO - :param gd: boolean - If True, the proposed vector and predicted cost gain are - calculated by linear functions related to IRLS_GD(_TR) optimizer. - If False, use newton functions for IRLS_TR optimizer instead. :param neg_jac_x: tf.Tensor coefficients x features ? 
TODO Upper (mu part) or lower (r part) of negative jacobian matrix :param fim_x @@ -420,35 +414,17 @@ def _calc_proposed_vector_and_pred_cost_gain( proposed_vector_x = self._trust_region_update( update_raw=update_x, - radius_container=radius_container, - n_obs=self.n_obs if gd else None + radius_container=radius_container + ) + + pred_cost_gain_x = self._trust_region_newton_cost_gain( + proposed_vector=proposed_vector_x, + neg_jac=neg_jac_x, + hessian_fim=fim_x ) - # here, functions have different number of arguments, thus - # must be written out - if gd: - pred_cost_gain_x = self._trust_region_linear_cost_gain( - proposed_vector=proposed_vector_x, - neg_jac=neg_jac_x - ) - else: - pred_cost_gain_x = self._trust_region_newton_cost_gain( - proposed_vector=proposed_vector_x, - neg_jac=neg_jac_x, - hessian_fim=fim_x - ) return proposed_vector_x, pred_cost_gain_x - def _trust_region_linear_cost_gain( - self, - proposed_vector, - neg_jac - ): - pred_cost_gain = tf.reduce_sum(tf.multiply( - proposed_vector, - tf.transpose(neg_jac) - ), axis=0) - return pred_cost_gain def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): @@ -469,14 +445,10 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) if compute_b: - if self.gd: - update_b = tf.transpose(jac_b) - - else: - update_b = self._newton_type_update( - lhs=fim_b, - rhs=jac_b - ) + update_b = self._newton_type_update( + lhs=fim_b, + rhs=jac_b + ) if not self.trusted_region_mode: if compute_a: @@ -504,60 +476,44 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) else: update_var = update - self.model.params.assign_sub(update_var) + self.model.params_copy.assign_sub(update_var) else: # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius + if compute_b: + tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( + update_b, radius_container, jac_b, fim_b) if compute_a: - if batch_features: - radius_container = tf.boolean_mask( - tensor=self.tr_radius, - mask=self.model.model_vars.remaining_features) - else: - radius_container = self.tr_radius - tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( - update_b, radius_container, self.gd, jac_b, fim_b) - tr_proposed_vector_a, tr_pred_cost_gain_a = self._calc_proposed_vector_and_pred_cost_gain( - update_a, radius_container, False, jac_a, fim_a) + update_a, radius_container, jac_a, fim_a) tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) - else: - radius_container = self.tr_radius_b if self.gd else self.tr_radius - if batch_features: - radius_container = tf.boolean_mask( - tensor=radius_container, - mask=self.model.model_vars.remaining_features) - - tr_proposed_vector_b, tr_pred_cost_gain_b = self._calc_proposed_vector_and_pred_cost_gain( - update_b, radius_container, self.gd, jac_b, fim_b) - # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw # and tr_pred_cost_gain tr_update_raw = tr_proposed_vector_b tr_pred_cost_gain = tr_pred_cost_gain_b else: - if batch_features: - radius_container = tf.boolean_mask( - tensor=self.tr_radius, - 
mask=self.model.model_vars.remaining_features) - else: - radius_container = self.tr_radius # here train_r is False AND train_mu is true, so the output of the function can directly be applied to # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False tr_update_raw, tr_pred_cost_gain = self._calc_proposed_vector_and_pred_cost_gain( - update_a, radius_container, False, jac_a, fim_a) + update_a, radius_container, jac_a, fim_a) - # perform update tr_update = self._pad_updates( update_raw=tr_update_raw, compute_a=compute_a, compute_b=compute_b ) + # perform update self._trust_region_ops( x_batches=x_batches, log_probs=log_probs, diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 3109bb23..69ff4478 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -1,16 +1,21 @@ import logging from typing import Union +import time # needed for train_irls_ls_tr benchmarking import numpy as np +import tensorflow as tf # needed for train_irls_ls_tr from .external import InputDataGLM, Model from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi - from .model import NBGLM, LossGLMNB from .vars import ModelVars from .processModel import ProcessModel from .external import Estimator as GLMEstimator from .training_strategies import TrainingStrategies +# needed for train_irls_ls_tr +from .external import DataGenerator, ConvergenceCalculator, pkg_constants +from .optim import IRLS_LS +logger = logging.getLogger("batchglm") class Estimator(GLMEstimator, ProcessModel): """ @@ -109,9 +114,11 @@ def train( else: self.model.setMethod(optim_algo) - self._loss = LossGLMNB() - optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + self.optimizer = optimizer_object + if optimizer_object.name in ['irls_gd_tr', 'irls_ar_tr']: + self.update = self.update_separated + self.epochs_until_b_update = 5 super(Estimator, self)._train( noise_model="nb", @@ -126,6 +133,37 @@ def train( optim_algo=optim_algo ) + def get_optimizer_object(self, optimizer, learning_rate): + optim = optimizer.lower() + if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar', 'irls_ar_tr']: + return IRLS_LS( + dtype=self.dtype, + trusted_region_mode=optim.endswith('tr'), + model=self.model, + name=optim, + n_obs=self.input_data.num_observations, + max_iter=20) + return super().get_optimizer_object(optimizer, learning_rate) + + def update_separated(self, results, batches, batch_features): + + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if self._train_scale and self.epochs_until_b_update == 0: + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=False + ) + self.epochs_until_b_update -= 1 + def get_model_container( self, input_data diff --git a/batchglm/train/tf2/glm_nb/external.py b/batchglm/train/tf2/glm_nb/external.py index d5c3a2e7..5d62ca66 100644 --- a/batchglm/train/tf2/glm_nb/external.py +++ b/batchglm/train/tf2/glm_nb/external.py @@ -16,3 +16,7 @@ from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM from batchglm.train.tf2.base_glm import LossGLM from batchglm.train.tf2.base_glm import Estimator + +# these are needed for nb specific irls_ls_tr training +from batchglm.train.tf2.base_glm import IRLS +from batchglm.train.tf2.base_glm import DataGenerator, 
ConvergenceCalculator diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py new file mode 100644 index 00000000..d0195672 --- /dev/null +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -0,0 +1,184 @@ +import tensorflow as tf +import numpy as np +from .external import IRLS, pkg_constants + +class IRLS_LS(IRLS): + + def __init__(self, dtype, trusted_region_mode, model, name, n_obs, max_iter): + + super(IRLS_LS, self).__init__( + dtype=dtype, + trusted_region_mode=trusted_region_mode, + model=model, + name=name, + n_obs=n_obs) + + self.max_iter = max_iter + + if name.startswith('irls_gd'): + self.update_b_func = self.update_b_gd + if trusted_region_mode: + n_features = self.model.model_vars.n_features + self.tr_radius_b = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, + dtype=self._dtype, trainable=False) + + elif name in ['irls_ar_tr', 'irls_ar']: + self.update_b_func = self.update_b_armijio + + def _trust_region_linear_cost_gain( + self, + proposed_vector, + neg_jac + ): + pred_cost_gain = tf.reduce_sum(tf.multiply( + proposed_vector, + tf.transpose(neg_jac) + ), axis=0) + return pred_cost_gain + + def normalize_update_magnitude(self, update_magnitude): + update_magnitude = update_magnitude / self.n_obs * self.tr_radius_b + + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + + assert compute_a ^ compute_b, \ + "IRLSLS computes either loc or scale model updates, not both nor none at the same time." + + if compute_a: + super(IRLS_LS, self).perform_parameter_update( + inputs, compute_a, compute_b, batch_features, is_batched) + else: + all_features_converged = False + i = 0 + while(not all_features_converged and i < self.max_iter): + all_features_converged = self.update_b_func(inputs, batch_features, is_batched) + + def gett1t2(self): + t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) + t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) + return t1, t2 + + def update_b_gd(self, inputs, batch_features, is_batched): + + x_batches, log_probs, _, jac_b, _, _ = inputs + + update_b = tf.transpose(jac_b) + if not self.trusted_region_mode: + update = self._pad_updates( + update_raw=update_b, + compute_a=False, + compute_b=True + ) + if batch_features: + indices = tf.where(self.model.model_vars.remaining_features) + update_var = tf.transpose( + tf.scatter_nd( + indices, + tf.transpose(update), + shape=(self.model.model_vars.n_features, update.get_shape()[0]) + ) + ) + else: + update_var = update + self.model.params.assign_sub(update_var) + + else: + if batch_features: + radius_container = tf.boolean_mask( + tensor=self.tr_radius_b, + mask=self.model.model_vars.remaining_features) + else: + radius_container = self.tr_radius_b + + tr_proposed_vector_b = self._trust_region_update( + update_raw=update_b, + radius_container=radius_container + ) + + tr_update_b = self._pad_updates( + update_raw=tr_proposed_vector_b, + compute_a=False, + compute_b=True + ) + + # perform update + self._trust_region_ops( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=tr_update_b, + proposed_gain=None, # TODO remove completely, not needed any longer + compute_a=False, + compute_b=True, + batch_features=batch_features, + is_batched=is_batched + ) + + return False + + def update_b_ar(self, inputs, batch_features, is_batched): + + raise NotImplementedError('Armijio line search not implemented yet.') + """ + 
x_batches = inputs[0] + proposed_vector = self._perform_trial_update() + self._check_and_apply_update(x_batches, proposed_vector, batch_features) + + return None + """ + + def _check_and_apply_update( + self, + x_batches, + proposed_vector, + batch_features, + ): + eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) + """ + Current likelihood refers to the likelihood that has been calculated in the last model call. + We are always evaluating on the full model, so if we train on the batched model (is_batched), + current likelihood needs to be calculated on the full model using the same model state as + used in the last model call. Moreover, if this update is conducted separately for loc + (compute_a) and scale (compute_b), current likelihood always needs to be recalculated when + updating the scale params since the location params changed in the location update before. + This is only true if the location params are updated before the scale params however! + """ + + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + if i == 0: + current_likelihood = log_likelihood + else: + current_likelihood = tf.math.add(current_likelihood, log_likelihood) + + current_likelihood = self._norm_neg_log_likelihood(current_likelihood) + + """ + The new likelihood is calculated on the full model now, after updating the parameters using + the proposed vector: + """ + original_params_copy = tf.identity(self.model.params_copy) + self.model.params_copy.assign_sub(proposed_vector) + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + if i == 0: + new_likelihood = log_likelihood + else: + new_likelihood += log_likelihood + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + + """ + delta_f_actual shows the difference between the log likelihoods before and after the proposed + update of parameters. It is > 0 if the new likelihood is greater than the old. 
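The acceptance rule described in this docstring — keep a proposed step only where the normalized negative log-likelihood actually drops by more than a threshold — can be written compactly on its own. A minimal NumPy sketch, assuming a hypothetical `neg_ll_fn` that maps a parameter matrix to a per-feature normalized negative log-likelihood and a hypothetical threshold `eta0`; this is an illustration, not the estimator's API:

    import numpy as np

    def accept_proposed_update(params, proposed_vector, neg_ll_fn, eta0=0.0):
        """Schematic trust-region style acceptance test, feature by feature."""
        neg_ll_old = neg_ll_fn(params)            # objective before the step
        candidate = params - proposed_vector      # trial parameters (as in assign_sub)
        neg_ll_new = neg_ll_fn(candidate)         # objective after the step
        delta_f_actual = neg_ll_old - neg_ll_new  # > 0 if the fit improved
        accept = delta_f_actual > eta0            # per-feature decision
        # keep the new parameters only for accepted features, roll the rest back
        return np.where(accept, candidate, params), accept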
+ """ + delta_f_actual = tf.math.subtract(current_likelihood, new_likelihood) + + update_theta = delta_f_actual > eta0 + self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) + + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + update_theta = tf.scatter_nd(indices, update_theta, shape=(n_features,)) + + self.model.model_vars.updated_b = update_theta.numpy() diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index 858ec8a6..030b3c8a 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -38,3 +38,10 @@ class TrainingStrategies(Enum): "optim_algo": "adam", }, ] + IRLS_LS = [ + { + "convergence_criteria": "all_converged", + "use_batching": False, + "optim_algo": "irls_ls_tr", + }, + ] From c5ef7d93193c8a6c95477db47c78f2a986f68c45 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 31 Mar 2020 21:27:20 +0200 Subject: [PATCH 082/124] bugfixes to get irls_gd_tr to work --- batchglm/train/tf2/base_glm/estimator.py | 3 --- batchglm/train/tf2/base_glm/optim.py | 17 ++++++++------ batchglm/train/tf2/glm_nb/estimator.py | 17 +++++++------- batchglm/train/tf2/glm_nb/optim.py | 30 ++++++++++++++++++++---- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 0ca7d8c0..4905fd71 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -200,14 +200,11 @@ def convergence_decision(num_converged, train_step): batch_features = True self.model.batch_features = batch_features conv_diff = num_converged - n_conv_last_featurewise_batch - print(conv_diff) if pkg_constants.FEATUREWISE_THRESHOLD < 1: conv_diff /= n_features-n_conv_last_featurewise_batch - print(conv_diff) # Update params if number of new convergences since last # featurewise batch is reached again. 
if conv_diff >= pkg_constants.FEATUREWISE_THRESHOLD: - print(num_converged - n_conv_last_featurewise_batch, n_features-n_conv_last_featurewise_batch, conv_diff) need_new_epoch_set = True n_conv_last_featurewise_batch = num_converged self.model.apply_featurewise_updates(conv_calc.last_params) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index d2de2710..c779f880 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -259,11 +259,8 @@ def _pad_updates( return netwon_type_update - def _trust_region_update( - self, - update_raw, - radius_container - ): + @staticmethod + def _calc_update_magnitudes(update_raw): update_magnitude_sq = tf.reduce_sum(tf.square(update_raw), axis=0) update_magnitude = tf.where( condition=update_magnitude_sq > 0, @@ -278,9 +275,15 @@ def _trust_region_update( ), y=tf.zeros_like(update_magnitude) ) + return update_magnitude, update_magnitude_inv + + def _trust_region_update( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = SecondOrderOptim._calc_update_magnitudes(update_raw) update_norm = tf.multiply(update_raw, update_magnitude_inv) - # the following method is for irls_gd_tr (linear instead of newton) - self.normalize_update_magnitude(update_magnitude) update_scale = tf.minimum( radius_container, diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 69ff4478..d47cced3 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -1,21 +1,18 @@ import logging from typing import Union -import time # needed for train_irls_ls_tr benchmarking import numpy as np -import tensorflow as tf # needed for train_irls_ls_tr from .external import InputDataGLM, Model from .external import closedform_nb_glm_logmu, closedform_nb_glm_logphi -from .model import NBGLM, LossGLMNB +from .model import NBGLM from .vars import ModelVars from .processModel import ProcessModel from .external import Estimator as GLMEstimator from .training_strategies import TrainingStrategies # needed for train_irls_ls_tr -from .external import DataGenerator, ConvergenceCalculator, pkg_constants from .optim import IRLS_LS -logger = logging.getLogger("batchglm") + class Estimator(GLMEstimator, ProcessModel): """ @@ -116,9 +113,9 @@ def train( optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) self.optimizer = optimizer_object - if optimizer_object.name in ['irls_gd_tr', 'irls_ar_tr']: + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr']: self.update = self.update_separated - self.epochs_until_b_update = 5 + self.epochs_until_b_update = 0 super(Estimator, self)._train( noise_model="nb", @@ -142,7 +139,7 @@ def get_optimizer_object(self, optimizer, learning_rate): model=self.model, name=optim, n_obs=self.input_data.num_observations, - max_iter=20) + max_iter=1) return super().get_optimizer_object(optimizer, learning_rate) def update_separated(self, results, batches, batch_features): @@ -162,7 +159,9 @@ def update_separated(self, results, batches, batch_features): batch_features=batch_features, is_batched=False ) - self.epochs_until_b_update -= 1 + self.epochs_until_b_update = 0 + else: + self.epochs_until_b_update -= 1 def get_model_container( self, diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index d0195672..5a8e74c7 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -37,9 +37,6 @@ def 
_trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def normalize_update_magnitude(self, update_magnitude): - update_magnitude = update_magnitude / self.n_obs * self.tr_radius_b - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): assert compute_a ^ compute_b, \ @@ -53,12 +50,34 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch i = 0 while(not all_features_converged and i < self.max_iter): all_features_converged = self.update_b_func(inputs, batch_features, is_batched) + i += 1 + print(i) def gett1t2(self): t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) t2 = tf.constant(pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE, dtype=self._dtype) return t1, t2 + def _trust_region_update_b( + self, + update_raw, + radius_container, + ): + update_magnitude, update_magnitude_inv = IRLS_LS._calc_update_magnitudes(update_raw) + update_norm = tf.multiply(update_raw, update_magnitude_inv) + + update_magnitude = update_magnitude / self.n_obs * radius_container + + update_scale = tf.minimum( + radius_container, + update_magnitude + ) + proposed_vector = tf.multiply( + update_norm, + update_scale + ) + + return proposed_vector def update_b_gd(self, inputs, batch_features, is_batched): x_batches, log_probs, _, jac_b, _, _ = inputs @@ -90,8 +109,9 @@ def update_b_gd(self, inputs, batch_features, is_batched): mask=self.model.model_vars.remaining_features) else: radius_container = self.tr_radius_b - - tr_proposed_vector_b = self._trust_region_update( + print(update_b.shape) + print(radius_container.shape) + tr_proposed_vector_b = self._trust_region_update_b( update_raw=update_b, radius_container=radius_container ) From f7f618481056cee941d339f62abab2589f24461c Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 12 Apr 2020 16:32:56 +0200 Subject: [PATCH 083/124] added irls_ar_tr to known optimizers --- batchglm/train/tf2/base_glm/estimator.py | 15 ++++++++++----- batchglm/train/tf2/base_glm/model.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4905fd71..9f244704 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -1,4 +1,5 @@ import abc +import sys import logging import time import numpy as np @@ -79,7 +80,7 @@ def _train( n_features = self.input_data.num_features # set necessary attributes self.noise_model = noise_model - self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr'] + self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr'] self.nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] ################################################ @@ -247,7 +248,7 @@ def convergence_decision(num_converged, train_step): self.model.hessian.compute_b = True # since self._train_scale could be False. # need new set here with full feature space - # TODO: only ineeded if batch_features, maybe put this in the above if switch later + # TODO: only needed if batch_features, maybe put this in the above if switch later final_set = datagenerator.new_epoch_set() for i, x_batch in enumerate(final_set): results = self.model(x_batch) if i == 0 else \ @@ -256,10 +257,14 @@ def convergence_decision(num_converged, train_step): # store all the final results in this estimator instance. 
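The hunk that follows replaces the unconditional `tf.linalg.inv` of the Hessian with a per-feature invertibility check based on the condition number, so a handful of degenerate features no longer aborts the final evaluation. The same pattern in isolation (NumPy only, hypothetical names):

    import sys
    import numpy as np

    def safe_batch_inverse(neg_hessians):
        """Invert a stack of per-feature matrices, skipping ill-conditioned ones
        instead of raising a LinAlgError; skipped entries stay zero."""
        inverses = np.zeros_like(neg_hessians)
        cond = np.linalg.cond(neg_hessians)  # condition number per feature
        ok = np.where(cond < 1.0 / sys.float_info.epsilon)[0]
        n_skipped = neg_hessians.shape[0] - len(ok)
        if n_skipped > 0:
            print(f"inverse not computed for {n_skipped} feature(s)")
        inverses[ok] = np.linalg.inv(neg_hessians[ok])
        return inverses

    # one well-conditioned and one exactly singular 2x2 matrix
    stack = np.stack([np.eye(2), np.ones((2, 2))])
    print(safe_batch_inverse(stack))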
self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) - + self._hessian = - results[2].numpy() # TODO: maybe report fisher inf here in the future instead of inverted hessian. - self._fisher_inv = tf.linalg.inv(results[2]).numpy() - self._hessian = -results[2].numpy() + fisher_inv = np.zeros_like(self._hessian) + invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0] + num_non_invertible = n_features - len(invertible) + if num_non_invertible > 0: + logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features.") + fisher_inv[invertible] = np.linalg.inv(- self._hessian[invertible]) self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index b28f321d..3c43b7b4 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -67,7 +67,7 @@ def setMethod(self, optimizer: str): elif optimizer in ['nr', 'nr_tr']: self._calc = self._calc_hessians - elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr']: + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr']: self._calc = self._calc_fim else: assert False, ("Unrecognized optimizer: %s", optimizer) From 0ae221524a44ec56906f30c3ef2f59e5fe057fed Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 12 Apr 2020 16:37:45 +0200 Subject: [PATCH 084/124] added irls_ar_tr to known optimizers --- batchglm/train/tf2/glm_nb/estimator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index d47cced3..e9bffb06 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -113,7 +113,7 @@ def train( optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) self.optimizer = optimizer_object - if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr']: + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr', 'irls_ar']: self.update = self.update_separated self.epochs_until_b_update = 0 @@ -139,7 +139,7 @@ def get_optimizer_object(self, optimizer, learning_rate): model=self.model, name=optim, n_obs=self.input_data.num_observations, - max_iter=1) + max_iter=10) return super().get_optimizer_object(optimizer, learning_rate) def update_separated(self, results, batches, batch_features): @@ -152,6 +152,7 @@ def update_separated(self, results, batches, batch_features): is_batched=False ) if self._train_scale and self.epochs_until_b_update == 0: + self.model.model_vars.updated_b = False self.optimizer.perform_parameter_update( inputs=[batches, *results], compute_a=False, From 1ededb8abe1d42ccff6a1debb1bfbbf6868cf6b9 Mon Sep 17 00:00:00 2001 From: picciama Date: Sun, 12 Apr 2020 16:38:04 +0200 Subject: [PATCH 085/124] implemented first version of update_b_ar --- batchglm/train/tf2/glm_nb/optim.py | 143 ++++++++++++++++++++++++----- 1 file changed, 121 insertions(+), 22 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 5a8e74c7..5c841553 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -17,14 +17,18 @@ def __init__(self, dtype, trusted_region_mode, model, name, n_obs, max_iter): if name.startswith('irls_gd'): self.update_b_func = self.update_b_gd - if trusted_region_mode: - n_features = self.model.model_vars.n_features - 
self.tr_radius_b = tf.Variable( - np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, - dtype=self._dtype, trainable=False) elif name in ['irls_ar_tr', 'irls_ar']: - self.update_b_func = self.update_b_armijio + self.update_b_func = self.update_b_ar + + else: + assert False, "Unrecognized method for optimization given." + + if trusted_region_mode: + n_features = self.model.model_vars.n_features + self.tr_radius_b = tf.Variable( + np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, + dtype=self._dtype, trainable=False) def _trust_region_linear_cost_gain( self, @@ -46,12 +50,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch super(IRLS_LS, self).perform_parameter_update( inputs, compute_a, compute_b, batch_features, is_batched) else: - all_features_converged = False - i = 0 - while(not all_features_converged and i < self.max_iter): - all_features_converged = self.update_b_func(inputs, batch_features, is_batched) - i += 1 - print(i) + self.update_b_func(inputs, batch_features, is_batched) def gett1t2(self): t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) @@ -109,8 +108,7 @@ def update_b_gd(self, inputs, batch_features, is_batched): mask=self.model.model_vars.remaining_features) else: radius_container = self.tr_radius_b - print(update_b.shape) - print(radius_container.shape) + tr_proposed_vector_b = self._trust_region_update_b( update_raw=update_b, radius_container=radius_container @@ -136,16 +134,117 @@ def update_b_gd(self, inputs, batch_features, is_batched): return False - def update_b_ar(self, inputs, batch_features, is_batched): + def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): + + + c1 = pkg_constants.TRUST_REGION_ETA1 + x_batches, log_probs, _, jac_b, _, _ = inputs + jac_b = tf.reshape(jac_b, [jac_b.shape[0]]) + #jac_b = tf.negative(jac_b) + direction = -tf.sign(jac_b) + derphi0 = jac_b / self.n_obs + alpha0 = tf.ones_like(jac_b) * pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE # self.tr_radius_b + original_params_b_copy = self.model.params_copy[-1] + print(direction[0].numpy(), jac_b[0].numpy()) + def phi(alpha): + multiplier = tf.multiply(alpha, direction) + new_scale_params = tf.add(original_params_b_copy, multiplier) + self.model.params_copy[-1].assign(new_scale_params) + new_likelihood = None + for i, x_batch in enumerate(x_batches): + log_likelihood = self.model.calc_ll([*x_batch])[0] + new_likelihood = log_likelihood if i == 0 else tf.math.add(new_likelihood, log_likelihood) + new_likelihood = self._norm_neg_log_likelihood(new_likelihood) + return new_likelihood + current_likelihood = self._norm_neg_log_likelihood(log_probs) + + new_likelihood = phi(alpha0) + #print(new_likelihood, current_likelihood) + beneficial = new_likelihood < current_likelihood + c1 * alpha0 * derphi0 + #print(beneficial) + + if tf.reduce_all(beneficial): # are all beneficial? 
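The `beneficial` mask above is the Armijo sufficient-decrease condition phi(alpha) < phi(0) + c1 * alpha * phi'(0). A minimal scalar backtracking version of the same idea, as a sketch only — the patch evaluates the condition per feature on TensorFlow tensors and chooses new trial steps by interpolation rather than plain halving:

    def backtracking_armijo(phi, phi0, derphi0, alpha0=1.0, c1=1e-3, shrink=0.5, max_iter=20):
        """Shrink the step size until the sufficient-decrease condition holds.
        phi(alpha) is the objective along the search direction, phi0 = phi(0),
        derphi0 = phi'(0) (negative for a descent direction)."""
        alpha = alpha0
        for _ in range(max_iter):
            if phi(alpha) < phi0 + c1 * alpha * derphi0:
                return alpha        # sufficient decrease reached
            alpha *= shrink         # otherwise shrink the step and retry
        return 0.0                  # no acceptable step found

    # toy example: minimize f(x) = x**2 from x = 1 along the direction -1
    f = lambda x: x ** 2
    print(backtracking_armijo(lambda a: f(1.0 - a), phi0=f(1.0), derphi0=-2.0))  # 1.0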
+ updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + # self.tr_radius_b.assign(alpha0) + return + + alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / (new_likelihood - current_likelihood - derphi0 * alpha0) + alpha1 = tf.where(beneficial, alpha0, alpha1) + new_likelihood2 = phi(alpha1) + #print(new_likelihood2, current_likelihood) + beneficial = new_likelihood2 < current_likelihood + c1 * alpha1 * derphi0 + #print(beneficial) + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + # self.tr_radius_b.assign(alpha1) + return + + for i in range(self.max_iter): + print(i) + factor = alpha0**2 * alpha1**2 * (alpha1-alpha0) + a = alpha0**2 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) - \ + alpha1**2 * (new_likelihood - current_likelihood - derphi0 * alpha0) + a = a / factor + + b = -alpha0**3 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) + \ + alpha1**3 * (new_likelihood - current_likelihood - derphi0 * alpha0) + b = b / factor + + alpha2 = (-b + tf.sqrt(tf.abs(tf.square(b) - 3 * a * derphi0))) / (3 * a) + alpha2 = tf.where(beneficial, alpha1, alpha2) + alpha2 = tf.clip_by_value(alpha2, clip_value_min=1e-12, clip_value_max=np.inf) + #print(alpha2) + if tf.reduce_all(alpha2 == 1e-12): + print('Minimum allowed step size reached for all features.') + self.model.model_vars.updated_b = np.zeros(self.model.model_vars.n_features, dtype=np.bool) + #print(alpha2) + new_likelihood3 = phi(alpha2) + #print(new_likelihood3, current_likelihood) + beneficial = new_likelihood3 < current_likelihood + c1 * alpha2 * derphi0 + #print(beneficial) + if tf.reduce_all(beneficial): + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated + # self.tr_radius_b.assign(alpha1) + return + + step_diff_greater_half_alpha1 = (alpha1 - alpha2) > alpha1 / 2 + ratio = (1 - alpha2/alpha1) < 0.96 + set_back = tf.logical_or(step_diff_greater_half_alpha1, ratio) + alpha2 = tf.where(set_back, alpha1 / 2, alpha2) + #if step_diff or ratio: + # alpha2 = alpha1 / 2 + + alpha0 = alpha1 + alpha1 = alpha2 + new_likelihood = new_likelihood2 + new_likelihood2 = new_likelihood3 + + # self.tr_radius_b.assign(alpha2) + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + updated = beneficial + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) + self.model.model_vars.updated_b = updated - raise NotImplementedError('Armijio line search not implemented yet.') - """ - x_batches = inputs[0] - proposed_vector = self._perform_trial_update() - self._check_and_apply_update(x_batches, proposed_vector, batch_features) - return None - """ def _check_and_apply_update( self, From d71ecfaca311e552f1a2ab3648cde94a799db6c4 Mon Sep 17 
00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:04:41 +0200 Subject: [PATCH 086/124] added WOLFE C1 and C2 --- batchglm/pkg_constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 205d5bf6..2204a6c2 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -35,6 +35,9 @@ FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step +WOLFE_C1 = 1e-3 +WOLFE_C2 = 0.99 + try: import tensorflow as tf From 28c6af3db07f9567883dd4094162c25b6e6aa58b Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:07:09 +0200 Subject: [PATCH 087/124] make compute_a and compute_b function args --- batchglm/train/tf2/base_glm/estimator.py | 20 +++++++---- .../train/tf2/base_glm/layers_gradients.py | 28 +++++++-------- batchglm/train/tf2/base_glm/model.py | 35 +++++++++++-------- 3 files changed, 46 insertions(+), 37 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 9f244704..84cc425d 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -73,15 +73,18 @@ def _train( autograd: bool = False, featurewise: bool = True, benchmark: bool = False, - optim_algo: str = "adam" + optim_algo: str = "adam", + b_update_freq = 0 ): # define some useful shortcuts here n_obs = self.input_data.num_observations n_features = self.input_data.num_features # set necessary attributes self.noise_model = noise_model - self.irls_algo = optim_algo.lower() in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr'] - self.nr_algo = optim_algo.lower() in ['nr', 'nr_tr'] + optim = optim_algo.lower() + self.irls_algo = optim in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar_tr'] + self.nr_algo = optim in ['nr', 'nr_tr'] + epochs_until_b_update = b_update_freq ################################################ # INIT Step 1: Consistency Checks @@ -116,11 +119,12 @@ def _train( epoch_set = datagenerator.new_epoch_set() # first model call to initialise prior to first update. + compute_b = epochs_until_b_update == 0 for i, x_batch in enumerate(epoch_set): if i == 0: - results = self.model(x_batch) + results = self.model(x_batch, compute_b=compute_b) else: - results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + results = [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] # create ConvergenceCalculator to check for new convergences. conv_calc = ConvergenceCalculator(self, tf.negative(tf.divide(results[0], n_obs)).numpy()) @@ -166,10 +170,11 @@ def convergence_decision(num_converged, train_step): ############################################ # 3. calculate new ll, jacs, hessian/fim + compute_b = epochs_until_b_update == 0 for i, x_batch in enumerate(epoch_set): # need new params_copy in model in case we use featurewise without recalculation - results = self.model(x_batch) if i == 0 \ - else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + results = self.model(x_batch, compute_b=compute_b) if i == 0 \ + else [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] ############################################ # 4. 
check for any new convergences @@ -221,6 +226,7 @@ def convergence_decision(num_converged, train_step): logger.warning(log_output) train_step += 1 + epochs_until_b_update = b_update_freq if compute_b else epochs_until_b_update - 1 # store some useful stuff for benchmarking purposes. if benchmark: t1_epoch = time.time() diff --git a/batchglm/train/tf2/base_glm/layers_gradients.py b/batchglm/train/tf2/base_glm/layers_gradients.py index 01b7dfb7..65df20b6 100644 --- a/batchglm/train/tf2/base_glm/layers_gradients.py +++ b/batchglm/train/tf2/base_glm/layers_gradients.py @@ -6,11 +6,9 @@ class Gradient(tf.keras.layers.Layer): """Superclass for Jacobians, Hessian, FIM""" - def __init__(self, model_vars, compute_a, compute_b, dtype): + def __init__(self, model_vars, dtype): super(Gradient, self).__init__() self.model_vars = model_vars - self.compute_a = compute_a - self.compute_b = compute_b self.grad_dtype = dtype @abc.abstractmethod @@ -40,7 +38,7 @@ class FIMGLM(Gradient): def call(self, inputs, **kwargs): return self._fim_analytic(*inputs) - def _fim_analytic(self, x, design_loc, design_scale, loc, scale, concat=False) -> tf.Tensor: + def _fim_analytic(self, x, design_loc, design_scale, loc, scale, concat=False, compute_a=True, compute_b=True) -> tf.Tensor: """ Compute the closed-form of the base_glm_all model fim by evalutating its terms grouped by observations. @@ -87,14 +85,14 @@ def _b_byobs(): # Here, the non-zero model-wise diagonal blocks are computed and returned # as a dictionary. The according score function vectors are also returned as a dictionary. - if self.compute_a and self.compute_b: + if compute_a and compute_b: fim_a = _a_byobs() fim_b = _b_byobs() - elif self.compute_a and not self.compute_b: + elif compute_a and not compute_b: fim_a = _a_byobs() fim_b = tf.zeros(fim_a.get_shape(), self.grad_dtype) - elif not self.compute_a and self.compute_b: + elif not compute_a and compute_b: fim_a = tf.zeros(fim_a.get_shape(), self.grad_dtype) fim_b = _b_byobs() else: @@ -157,7 +155,7 @@ class JacobianGLM(Gradient): def call(self, inputs, **kwargs): return self._jac_analytic(*inputs) - def _jac_analytic(self, x, design_loc, design_scale, loc, scale, concat) -> tf.Tensor: + def _jac_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: """ Compute the closed-form of the base_glm_all model jacobian by evalutating its terms grouped by observations. @@ -196,13 +194,13 @@ def _b_byobs(): jblock = tf.matmul(tf.transpose(w), xh) # [features, coefficients] return jblock - if self.compute_a and self.compute_b: + if compute_a and compute_b: j_a = _a_byobs() j_b = _b_byobs() - elif self.compute_a and not self.compute_b: + elif compute_a and not compute_b: j_a = _a_byobs() j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) - elif not self.compute_a and self.compute_b: + elif not compute_a and compute_b: j_b = _b_byobs() j_a = tf.zeros((j_b.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) else: @@ -276,7 +274,7 @@ class HessianGLM(Gradient): def call(self, inputs, **kwargs): return self._hessian_analytic(*inputs) - def _hessian_analytic(self, x, design_loc, design_scale, loc, scale, concat) -> tf.Tensor: + def _hessian_analytic(self, x, design_loc, design_scale, loc, scale, concat, compute_a=True, compute_b=True) -> tf.Tensor: """ Compute the closed-form of the base_glm_all model hessian by evaluating its terms grouped by observations. 
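This patch turns `compute_a` / `compute_b` from state stored on the gradient layers into per-call arguments, so one layer instance can serve both the loc-only and the scale-only update within a single training run. The pattern in isolation, using a toy Keras layer with illustrative computations:

    import tensorflow as tf

    class ToyBlockGradient(tf.keras.layers.Layer):
        """Per-block results; which blocks are computed is decided at call time."""

        def call(self, inputs, compute_a=True, compute_b=True):
            zeros = tf.zeros(inputs.shape[1], inputs.dtype)
            grad_a = tf.reduce_sum(inputs, axis=0) if compute_a else zeros
            grad_b = tf.reduce_mean(inputs, axis=0) if compute_b else zeros
            return grad_a, grad_b

    layer = ToyBlockGradient()
    x = tf.ones((4, 3))
    print(layer(x, compute_a=True, compute_b=False))  # scale block skipped this call
    print(layer(x, compute_a=False, compute_b=True))  # loc block skipped this call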
@@ -342,17 +340,17 @@ def _ab_byobs_batched(): hblock = self.create_specific_block(w, xhloc, xhscale) return hblock - if self.compute_a and self.compute_b: + if compute_a and compute_b: h_aa = _aa_byobs_batched() h_bb = _bb_byobs_batched() h_ab = _ab_byobs_batched() h_ba = tf.transpose(h_ab, perm=[0, 2, 1]) - elif self.compute_a and not self.compute_b: + elif compute_a and not compute_b: h_aa = _aa_byobs_batched() h_bb = tf.zeros_like(h_aa, dtype=self.grad_dtype) h_ab = tf.zeros_like(h_aa, dtype=self.grad_dtype) h_ba = tf.zeros_like(h_aa, dtype=self.grad_dtype) - elif not self.compute_a and self.compute_b: + elif not compute_a and compute_b: h_bb = _bb_byobs_batched() h_aa = tf.zeros_like(h_bb, dtype=self.grad_dtype) h_ab = tf.zeros_like(h_bb, dtype=self.grad_dtype) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 3c43b7b4..0d1eb869 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -62,7 +62,7 @@ def setMethod(self, optimizer: str): """ optimizer = optimizer.lower() if optimizer in ['gd', 'adam', 'adagrad', 'rmsprop']: - self._calc = self._return_jacobians + self._calc = self.calc_jacobians elif optimizer in ['nr', 'nr_tr']: self._calc = self._calc_hessians @@ -115,10 +115,11 @@ def calc_ll(self, inputs): log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) - def _return_jacobians(self, inputs): - return self._calc_jacobians(inputs)[-2:] + def calc_jacobians(self, inputs, compute_a, compute_b): - def _calc_jacobians(self, inputs, concat=True, transpose=True): + return self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b)[-2:] + + def _calc_jacobians(self, inputs, compute_a, compute_b, concat=True, transpose=True): """ calculates jacobian. @@ -138,8 +139,8 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True): if self.use_gradient_tape: - if self.compute_a: - if self.compute_b: + if compute_a: + if compute_b: if concat: jacobians = g.gradient(log_probs, self.params_copy) if not transpose: @@ -168,20 +169,20 @@ def _calc_jacobians(self, inputs, concat=True, transpose=True): else: if concat: - jacobians = self.jacobian([*inputs[0:3], loc, scale, True]) + jacobians = self.jacobian([*inputs[0:3], loc, scale, True, compute_a, compute_b]) if transpose: jacobians = tf.transpose(jacobians) else: - jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, False]) + jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, compute_a, compute_b]) del g if concat: return loc, scale, log_probs, tf.negative(jacobians) return loc, scale, log_probs, tf.negative(jac_a), tf.negative(jac_b) - def _calc_hessians(self, inputs): + def _calc_hessians(self, inputs, compute_a, compute_b): # with tf.GradientTape(persistent=True) as g2: - loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, transpose=False) + loc, scale, log_probs, jacobians = self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, transpose=False) ''' autograd not yet working. 
TODO: Search error in the following code: @@ -209,24 +210,28 @@ def _calc_hessians(self, inputs): ), perm=[2, 1, 0]) hessians = tf.negative(hessians) ''' - hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True])) + hessians = tf.negative(self.hessian([*inputs[0:3], loc, scale, True, compute_a, compute_b])) return log_probs, jacobians, hessians - def _calc_fim(self, inputs): + def _calc_fim(self, inputs, compute_a, compute_b): loc, scale, log_probs, jac_a, jac_b = self._calc_jacobians( inputs, + compute_a=compute_a, + compute_b=compute_b, concat=False, transpose=False) - fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False]) + fim_a, fim_b = self.fim([*inputs[0:3], loc, scale, False, compute_a, compute_b]) return log_probs, jac_a, jac_b, fim_a, fim_b - def call(self, inputs): + def call(self, inputs, compute_a=True, compute_b=None): """ Wrapper method to call this model. Depending on the desired calculations specified by the `optimizer` arg to `__init__`, it will forward the call to the necessary function to perform the right calculations and return all the results. """ - return self._calc(inputs) + if compute_b is None: + compute_b = self.compute_b + return self._calc(inputs, compute_a, compute_b) class LossGLM(LossBase): From 20968294f0916da34512a2305a5b83115ce706ac Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:08:47 +0200 Subject: [PATCH 088/124] cleanup and added return val to trust_region_ops --- batchglm/train/tf2/base_glm/optim.py | 99 +++++++--------------------- 1 file changed, 24 insertions(+), 75 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index c779f880..3ffd2c63 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -58,7 +58,6 @@ def _trust_region_ops( # assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" # Set trust region hyper-parameters eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) - # eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) # eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) t1, t2 = self.gett1t2() @@ -108,53 +107,14 @@ def _trust_region_ops( feature space by adding columns corresponding to positions of converged (non calculated) features. 
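The padding described here — scattering values computed only for the still-training features back into the full feature dimension, with zero columns for converged features — looks like this on its own (standalone TensorFlow sketch with a hypothetical mask):

    import tensorflow as tf

    def pad_to_full_feature_space(values, remaining_features):
        """Expand per-feature values of the reduced feature set back to the full
        feature axis; converged (skipped) features receive zeros."""
        n_features = remaining_features.shape[0]
        indices = tf.where(remaining_features)  # positions of active features
        return tf.scatter_nd(indices, values, shape=(n_features,))

    remaining = tf.constant([True, False, True, False])
    per_active_feature = tf.constant([0.5, -1.2])  # one value per active feature
    print(pad_to_full_feature_space(per_active_feature, remaining))
    # -> [ 0.5  0.  -1.2  0. ]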
""" - """ - if batch_features: - n_features = self.model.model_vars.n_features - indices = tf.where(tf.logical_not(self.model.model_vars.converged)) - - delta_f_actual = tf.scatter_nd(indices, delta_f_actual, shape=(n_features,)) - update_var = tf.transpose(tf.scatter_nd( - indices, - tf.transpose(proposed_vector), - shape=(n_features, proposed_vector.get_shape()[0]) - )) - gain_var = tf.transpose(tf.scatter_nd( - indices, - proposed_gain, - shape=(n_features,))) - else: - update_var = proposed_vector - gain_var = proposed_gain - #delta_f_ratio = tf.divide(delta_f_actual, gain_var) - """ + # Compute parameter updates.g - #update_theta = tf.logical_and(delta_f_actual > eta0, tf.logical_not(self.model.model_vars.converged)) update_theta = delta_f_actual > eta0 self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) #update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) #keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric - """ - if batch_features: - params = tf.transpose(tf.scatter_nd( - indices, - tf.transpose(self.model.params_copy), - shape=(n_features, self.model.params.get_shape()[0]) - )) - - theta_new_tr = tf.add( - tf.multiply(self.model.params, keep_theta_numeric), - tf.multiply(params, update_theta_numeric) - ) - else: - params = self.model.params_copy - theta_new_tr = tf.add( - tf.multiply(params + update_var, keep_theta_numeric), # old values - tf.multiply(params, update_theta_numeric) # new values - ) - self.model.params.assign(theta_new_tr) - """ + decrease_radius = tf.math.logical_not(update_theta) increase_radius = update_theta if batch_features: @@ -162,27 +122,14 @@ def _trust_region_ops( indices = tf.where(self.model.model_vars.remaining_features) decrease_radius = tf.scatter_nd(indices, decrease_radius, shape=(n_features,)) increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) - update_theta = increase_radius if compute_b and not compute_a: - self.model.model_vars.updated_b = update_theta.numpy() + self.model.model_vars.updated_b |= increase_radius.numpy() # needs to be |= if maxiter > 1 else: - self.model.model_vars.updated = update_theta.numpy() + self.model.model_vars.updated = increase_radius.numpy() # Update trusted region accordingly: - #decrease_radius = delta_f_actual <= eta0 - #increase_radius = delta_f_actual > eta0 - """ - decrease_radius = tf.logical_or( - delta_f_actual <= eta0, - tf.logical_and(delta_f_ratio <= eta1, tf.logical_not(self.model.model_vars.converged)) - ) - increase_radius = tf.logical_and( - delta_f_actual > eta0, - tf.logical_and(delta_f_ratio > eta2, tf.logical_not(self.model.model_vars.converged)) - ) - """ keep_radius = tf.logical_and(tf.logical_not(decrease_radius), tf.logical_not(increase_radius)) radius_update = tf.add_n([ @@ -199,6 +146,8 @@ def _trust_region_ops( radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) tr_radius.assign(radius_new) + return update_theta + def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str, n_obs: int): super(SecondOrderOptim, self).__init__(name) @@ -334,9 +283,12 @@ def _get_updates(self, lhs, rhs, compute_a, compute_b): def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jacobians, hessians = inputs - if not (compute_a or compute_b): - raise ValueError( - "Nothing can be trained. 
Please make sure at least one of train_mu and train_r is set to True.") + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure" \ + "at least one of train_mu and train_r is set to True." update_raw, update = self._get_updates(hessians, jacobians, compute_a, compute_b) @@ -376,18 +328,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch ) else: - if batch_features: - indices = tf.where(self.model.model_vars.remaining_features) - update_var = tf.transpose( - tf.scatter_nd( - indices, - tf.transpose(update), - shape=(self.model.model_vars.n_features, update.get_shape()[0]) - ) - ) - else: - update_var = update - self.model.params.assign_sub(update_var) + self.model.params_copy.assign_sub(update) class IRLS(SecondOrderOptim): @@ -432,9 +373,13 @@ def _calc_proposed_vector_and_pred_cost_gain( def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs - if not (compute_a or compute_b): - raise ValueError( - "Nothing can be trained. Please make sure at least one of train_mu and train_r is set to True.") + if compute_b: + if not compute_a: + self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + + assert (compute_a or compute_b), "Nothing can be trained. Please make sure" \ + "at least one of train_mu and train_r is set to True." + # Compute a and b model updates separately. if compute_a: # The FIM of the mean model is guaranteed to be @@ -527,3 +472,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch batch_features=batch_features, is_batched=is_batched ) + + def calc_delta_f_actual(self, current_likelihood, new_likelihood, jacobian): + eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) + return From 3a02a57cf562af0d8252a9d5698677eb931f04c4 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:10:00 +0200 Subject: [PATCH 089/124] divided loc and scale update calls --- batchglm/train/tf2/glm_nb/estimator.py | 34 ++++++++++++++------------ 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index e9bffb06..ccc89b3f 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -111,11 +111,13 @@ def train( else: self.model.setMethod(optim_algo) - optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) + intercept_scale = len(self.model.model_vars.idx_train_scale) == 1 + optimizer_object = self.get_optimizer_object(optim_algo, learning_rate, intercept_scale) self.optimizer = optimizer_object - if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr', 'irls_ar']: + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr']: self.update = self.update_separated - self.epochs_until_b_update = 0 + self.b_update_freq = 0 + self.epochs_until_b_update = self.b_update_freq super(Estimator, self)._train( noise_model="nb", @@ -130,28 +132,21 @@ def train( optim_algo=optim_algo ) - def get_optimizer_object(self, optimizer, learning_rate): + def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): optim = optimizer.lower() - if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar', 'irls_ar_tr']: + if optim in ['irls_gd_tr', 'irls_gd', 
'irls_ar_tr']: return IRLS_LS( dtype=self.dtype, - trusted_region_mode=optim.endswith('tr'), + tr_mode=optim.endswith('tr'), model=self.model, name=optim, n_obs=self.input_data.num_observations, - max_iter=10) + intercept_scale=intercept_scale) return super().get_optimizer_object(optimizer, learning_rate) def update_separated(self, results, batches, batch_features): - self.optimizer.perform_parameter_update( - inputs=[batches, *results], - compute_a=True, - compute_b=False, - batch_features=batch_features, - is_batched=False - ) - if self._train_scale and self.epochs_until_b_update == 0: + if self.epochs_until_b_update == 0: self.model.model_vars.updated_b = False self.optimizer.perform_parameter_update( inputs=[batches, *results], @@ -160,9 +155,16 @@ def update_separated(self, results, batches, batch_features): batch_features=batch_features, is_batched=False ) - self.epochs_until_b_update = 0 + self.epochs_until_b_update = self.b_update_freq else: self.epochs_until_b_update -= 1 + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) def get_model_container( self, From 4ab9cce34133440dd757b11a3cc440c0c2d28fcb Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:10:34 +0200 Subject: [PATCH 090/124] removed compute_a/b from gradient layer init --- batchglm/train/tf2/glm_nb/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py index af1f524e..be08868e 100644 --- a/batchglm/train/tf2/glm_nb/model.py +++ b/batchglm/train/tf2/glm_nb/model.py @@ -31,9 +31,9 @@ def __init__( linker_loc=LinkerLoc(), linker_scale=LinkerScale(), likelihood=Likelihood(dtype), - jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + jacobian=Jacobian(model_vars=model_vars, dtype=dtype), + hessian=Hessian(model_vars=model_vars, dtype=dtype), + fim=FIM(model_vars=model_vars, dtype=dtype), use_gradient_tape=use_gradient_tape, optimizer=optimizer ) From c8575dd8a11a4cd4cbc6834268287771149d317e Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 13 Apr 2020 15:11:30 +0200 Subject: [PATCH 091/124] cleanup and update_b in while loop with maxiter --- batchglm/train/tf2/glm_nb/optim.py | 153 ++++++++++------------------- 1 file changed, 54 insertions(+), 99 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 5c841553..55ca5128 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -4,27 +4,27 @@ class IRLS_LS(IRLS): - def __init__(self, dtype, trusted_region_mode, model, name, n_obs, max_iter): + def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): super(IRLS_LS, self).__init__( dtype=dtype, - trusted_region_mode=trusted_region_mode, + trusted_region_mode=tr_mode, model=model, name=name, n_obs=n_obs) - self.max_iter = max_iter - if name.startswith('irls_gd'): self.update_b_func = self.update_b_gd - + if intercept_scale: + self.delta_f_actual_b = self.intercept_delta_f_actual_b elif name in ['irls_ar_tr', 'irls_ar']: + assert intercept_scale, "Line search (IRLS_AR_TR) is only available" \ + "for scale models with a single coefficient (intercept scale)." 
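`update_separated` always refreshes the location model but touches the scale model only on selected epochs. A toy version of such an every-k-epochs schedule (illustrative names; the exact bookkeeping in the patch differs slightly between the first call and later epochs):

    def training_schedule(n_epochs, b_update_freq):
        """Yield (epoch, update_loc, update_scale): loc every epoch, scale only
        every b_update_freq-th epoch."""
        epochs_until_b_update = b_update_freq - 1
        for epoch in range(n_epochs):
            yield epoch, True, epochs_until_b_update == 0
            # reset the countdown after a scale update, otherwise count down
            epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq

    print([do_scale for _, _, do_scale in training_schedule(6, 3)])
    # -> [False, False, True, False, False, True]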
self.update_b_func = self.update_b_ar - else: assert False, "Unrecognized method for optimization given." - if trusted_region_mode: + if tr_mode: n_features = self.model.model_vars.n_features self.tr_radius_b = tf.Variable( np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, @@ -41,16 +41,31 @@ def _trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False, maxiter=20): assert compute_a ^ compute_b, \ - "IRLSLS computes either loc or scale model updates, not both nor none at the same time." + "IRLS_LS computes either loc or scale model updates, not both nor none at the same time." if compute_a: super(IRLS_LS, self).perform_parameter_update( inputs, compute_a, compute_b, batch_features, is_batched) else: - self.update_b_func(inputs, batch_features, is_batched) + self.model.model_vars.update_b = np.zeros(self.model.model_vars.n_features, dtype=np.bool) + # global_step = tf.zeros_like(self.model.model_vars.remaining_features) + results = inputs[1:4] + x_batches = inputs[0] + iteration = 0 + not_converged = np.zeros(self.model.model_vars.remaining_features, dtype=np.bool) + while True: + iteration += 1 + step = self.update_b_func([x_batches, *results], batch_features, is_batched) + not_converged = step.numpy() > pkg_constants.XTOL_BY_FEATURE_SCALE + if tf.reduce_any(not_converged) or iteration == maxiter: + break + for i, x_batch in enumerate(inputs[0]): + results = self.model.calc_jacobians(x_batch, compute_a=False) if i == 0 else \ + [tf.math.add(results[i], x) for + i, x in enumerate(self.calc_jacobians(x_batch, compute_a=False))] def gett1t2(self): t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) @@ -79,7 +94,7 @@ def _trust_region_update_b( return proposed_vector def update_b_gd(self, inputs, batch_features, is_batched): - x_batches, log_probs, _, jac_b, _, _ = inputs + x_batches, log_probs, _, jac_b = inputs update_b = tf.transpose(jac_b) if not self.trusted_region_mode: @@ -88,18 +103,9 @@ def update_b_gd(self, inputs, batch_features, is_batched): compute_a=False, compute_b=True ) - if batch_features: - indices = tf.where(self.model.model_vars.remaining_features) - update_var = tf.transpose( - tf.scatter_nd( - indices, - tf.transpose(update), - shape=(self.model.model_vars.n_features, update.get_shape()[0]) - ) - ) - else: - update_var = update - self.model.params.assign_sub(update_var) + self.model.params_copy.assign_sub(update) + + return update else: if batch_features: @@ -121,7 +127,7 @@ def update_b_gd(self, inputs, batch_features, is_batched): ) # perform update - self._trust_region_ops( + update_theta = self._trust_region_ops( x_batches=x_batches, log_probs=log_probs, proposed_vector=tr_update_b, @@ -132,18 +138,19 @@ def update_b_gd(self, inputs, batch_features, is_batched): is_batched=is_batched ) - return False + return tf.where(update_theta, tr_update_b, tf.zeros_like(tr_update_b)) def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): c1 = pkg_constants.TRUST_REGION_ETA1 - x_batches, log_probs, _, jac_b, _, _ = inputs + x_batches, log_probs, _, jac_b = inputs jac_b = tf.reshape(jac_b, [jac_b.shape[0]]) #jac_b = tf.negative(jac_b) direction = -tf.sign(jac_b) derphi0 = jac_b / self.n_obs - alpha0 = tf.ones_like(jac_b) * pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE 
# self.tr_radius_b + if alpha0 is None: + alpha0 = tf.ones_like(jac_b) * pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE # self.tr_radius_b original_params_b_copy = self.model.params_copy[-1] print(direction[0].numpy(), jac_b[0].numpy()) def phi(alpha): @@ -160,9 +167,7 @@ def phi(alpha): new_likelihood = phi(alpha0) #print(new_likelihood, current_likelihood) - beneficial = new_likelihood < current_likelihood + c1 * alpha0 * derphi0 - #print(beneficial) - + beneficial = self.wolfe1(current_likelihood, new_likelihood, alpha0, derphi0) if tf.reduce_all(beneficial): # are all beneficial? updated = beneficial if batch_features: @@ -171,13 +176,13 @@ def phi(alpha): updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated # self.tr_radius_b.assign(alpha0) - return + return tf.multiply(alpha0, direction) alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / (new_likelihood - current_likelihood - derphi0 * alpha0) alpha1 = tf.where(beneficial, alpha0, alpha1) new_likelihood2 = phi(alpha1) #print(new_likelihood2, current_likelihood) - beneficial = new_likelihood2 < current_likelihood + c1 * alpha1 * derphi0 + beneficial = self.wolfe1(current_likelihood, new_likelihood2, alpha1, derphi0) #print(beneficial) if tf.reduce_all(beneficial): updated = beneficial @@ -187,10 +192,9 @@ def phi(alpha): updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated # self.tr_radius_b.assign(alpha1) - return + return tf.multiply(alpha1, direction) - for i in range(self.max_iter): - print(i) + while tf.reduce_any(alpha1 > 0): factor = alpha0**2 * alpha1**2 * (alpha1-alpha0) a = alpha0**2 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) - \ alpha1**2 * (new_likelihood - current_likelihood - derphi0 * alpha0) @@ -202,15 +206,15 @@ def phi(alpha): alpha2 = (-b + tf.sqrt(tf.abs(tf.square(b) - 3 * a * derphi0))) / (3 * a) alpha2 = tf.where(beneficial, alpha1, alpha2) - alpha2 = tf.clip_by_value(alpha2, clip_value_min=1e-12, clip_value_max=np.inf) + alpha2 = tf.clip_by_value(alpha2, clip_value_min=0, clip_value_max=np.inf) #print(alpha2) - if tf.reduce_all(alpha2 == 1e-12): - print('Minimum allowed step size reached for all features.') + if tf.reduce_all(alpha2 == 0): + #print('Minimum allowed step size reached for all features.') self.model.model_vars.updated_b = np.zeros(self.model.model_vars.n_features, dtype=np.bool) #print(alpha2) new_likelihood3 = phi(alpha2) #print(new_likelihood3, current_likelihood) - beneficial = new_likelihood3 < current_likelihood + c1 * alpha2 * derphi0 + beneficial = self.wolfe1(current_likelihood, new_likelihood3, alpha2, derphi0) #print(beneficial) if tf.reduce_all(beneficial): updated = beneficial @@ -220,12 +224,13 @@ def phi(alpha): updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated # self.tr_radius_b.assign(alpha1) - return + return tf.multiply(alpha2, direction) step_diff_greater_half_alpha1 = (alpha1 - alpha2) > alpha1 / 2 ratio = (1 - alpha2/alpha1) < 0.96 set_back = tf.logical_or(step_diff_greater_half_alpha1, ratio) alpha2 = tf.where(set_back, alpha1 / 2, alpha2) + alpha2 = tf.clip_by_value(alpha2, clip_value_min=0, clip_value_max=np.inf) #if step_diff or ratio: # alpha2 = alpha1 / 2 @@ -242,62 +247,12 @@ def phi(alpha): n_features = self.model.model_vars.n_features indices = tf.where(self.model.model_vars.remaining_features) updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) - self.model.model_vars.updated_b 
= updated - - - - def _check_and_apply_update( - self, - x_batches, - proposed_vector, - batch_features, - ): - eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) - """ - Current likelihood refers to the likelihood that has been calculated in the last model call. - We are always evaluating on the full model, so if we train on the batched model (is_batched), - current likelihood needs to be calculated on the full model using the same model state as - used in the last model call. Moreover, if this update is conducted separately for loc - (compute_a) and scale (compute_b), current likelihood always needs to be recalculated when - updating the scale params since the location params changed in the location update before. - This is only true if the location params are updated before the scale params however! - """ - - for i, x_batch in enumerate(x_batches): - log_likelihood = self.model.calc_ll([*x_batch])[0] - if i == 0: - current_likelihood = log_likelihood - else: - current_likelihood = tf.math.add(current_likelihood, log_likelihood) - - current_likelihood = self._norm_neg_log_likelihood(current_likelihood) - - """ - The new likelihood is calculated on the full model now, after updating the parameters using - the proposed vector: - """ - original_params_copy = tf.identity(self.model.params_copy) - self.model.params_copy.assign_sub(proposed_vector) - for i, x_batch in enumerate(x_batches): - log_likelihood = self.model.calc_ll([*x_batch])[0] - if i == 0: - new_likelihood = log_likelihood - else: - new_likelihood += log_likelihood - new_likelihood = self._norm_neg_log_likelihood(new_likelihood) - - """ - delta_f_actual shows the difference between the log likelihoods before and after the proposed - update of parameters. It is > 0 if the new likelihood is greater than the old. 
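The line search that replaces the removed `_check_and_apply_update` picks its next trial step by interpolation: after a rejected step alpha0, the quadratic model through phi(0), phi'(0) and phi(alpha0) is minimized, mirroring `alpha1 = -derphi0 * alpha0**2 / 2 / (phi_alpha0 - phi0 - derphi0 * alpha0)` earlier in this patch. A scalar sketch of that step (the patch applies it per feature on tensors):

    def quadratic_interpolation_step(phi0, derphi0, alpha0, phi_alpha0):
        """Minimizer of the quadratic through phi(0) with slope phi'(0) and the
        rejected trial value phi(alpha0); used as the next candidate step size."""
        return -derphi0 * alpha0 ** 2 / (2.0 * (phi_alpha0 - phi0 - derphi0 * alpha0))

    # toy check on phi(a) = (a - 2)**2: phi(0) = 4, phi'(0) = -4; starting from
    # alpha0 = 1 the interpolated step lands exactly at the minimizer a = 2
    print(quadratic_interpolation_step(phi0=4.0, derphi0=-4.0, alpha0=1.0, phi_alpha0=1.0))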
- """ - delta_f_actual = tf.math.subtract(current_likelihood, new_likelihood) - - update_theta = delta_f_actual > eta0 - self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) - - if batch_features: - n_features = self.model.model_vars.n_features - indices = tf.where(self.model.model_vars.remaining_features) - update_theta = tf.scatter_nd(indices, update_theta, shape=(n_features,)) - - self.model.model_vars.updated_b = update_theta.numpy() + self.model.model_vars.updated_b |= updated.numpy() + return tf.multiply(alpha2, direction) + + def wolfe1(self, current_likelihood, new_likelihood, alpha, jacobian): + """Checks if an update satisfies the first wolfe condition by returning the difference + to the previous likelihood.""" + c1 = tf.constant(pkg_constants.WOLFE_C1, dtype=self._dtype) + limit = tf.add(current_likelihood, tf.multiply(tf.multiply(c1, alpha), jacobian)) + return new_likelihood < limit From 69cbdbbae0513d85ec6123fbfd84d6cb8c7ab422 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:04:57 +0200 Subject: [PATCH 092/124] added support for b_update_freq --- batchglm/train/tf2/base_glm/estimator.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 84cc425d..60414792 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -74,7 +74,7 @@ def _train( featurewise: bool = True, benchmark: bool = False, optim_algo: str = "adam", - b_update_freq = 0 + b_update_freq = 1 ): # define some useful shortcuts here n_obs = self.input_data.num_observations @@ -84,7 +84,6 @@ def _train( optim = optim_algo.lower() self.irls_algo = optim in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar_tr'] self.nr_algo = optim in ['nr', 'nr_tr'] - epochs_until_b_update = b_update_freq ################################################ # INIT Step 1: Consistency Checks @@ -98,6 +97,9 @@ def _train( raise RuntimeError("Cannot train the model: Estimator not initialized. \ Did you forget to call estimator.initialize() ?") + if b_update_freq == 0: + b_update_freq = 1 + if autograd and optim_algo.lower() in ['nr', 'nr_tr']: logger.warning( "Automatic differentiation is currently not supported for hessians. Falling back \ @@ -119,6 +121,7 @@ def _train( epoch_set = datagenerator.new_epoch_set() # first model call to initialise prior to first update. + epochs_until_b_update = b_update_freq - 1 compute_b = epochs_until_b_update == 0 for i, x_batch in enumerate(epoch_set): if i == 0: @@ -149,6 +152,7 @@ def convergence_decision(num_converged, train_step): #### while convergence_decision(num_converged, train_step): + if benchmark: t0_epoch = time.time() @@ -160,17 +164,17 @@ def convergence_decision(num_converged, train_step): # using featurewise. epoch_set = datagenerator.new_epoch_set(batch_features=batch_features) if pkg_constants.FEATUREWISE_RECALCULATE: - for i, x_batch in enumerate(epoch_set): + for i, x_batch in enumerate(epoch_set, compute_b=compute_b): results = self.model(x_batch) if i == 0 else \ - [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch))] + [tf.math.add(results[i], x) for i, x in enumerate(self.model(x_batch, compute_b=compute_b))] ############################################ # 2. 
Update the parameters - self.update(results, epoch_set, batch_features) + self.update(results, epoch_set, batch_features, epochs_until_b_update == 0) ############################################ # 3. calculate new ll, jacs, hessian/fim - compute_b = epochs_until_b_update == 0 + compute_b = epochs_until_b_update < 2 for i, x_batch in enumerate(epoch_set): # need new params_copy in model in case we use featurewise without recalculation results = self.model(x_batch, compute_b=compute_b) if i == 0 \ @@ -226,7 +230,7 @@ def convergence_decision(num_converged, train_step): logger.warning(log_output) train_step += 1 - epochs_until_b_update = b_update_freq if compute_b else epochs_until_b_update - 1 + epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq # store some useful stuff for benchmarking purposes. if benchmark: t1_epoch = time.time() From def51e4729b9442fbed16bc2df941907be457fa2 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:05:59 +0200 Subject: [PATCH 093/124] bugfix: wrong positions of defautl args --- batchglm/train/tf2/base_glm/model.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 0d1eb869..69f8d0ef 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -115,9 +115,10 @@ def calc_ll(self, inputs): log_probs = tf.reduce_sum(log_probs, axis=0) return (log_probs, *parameters[2:]) - def calc_jacobians(self, inputs, compute_a, compute_b): - - return self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b)[-2:] + def calc_jacobians(self, inputs, compute_a=True, compute_b=None, concat=True): + if compute_b is None: + compute_b = self.compute_b + return self._calc_jacobians(inputs, compute_a=compute_a, compute_b=compute_b, concat=concat)[2:] def _calc_jacobians(self, inputs, compute_a, compute_b, concat=True, transpose=True): """ @@ -173,7 +174,7 @@ def _calc_jacobians(self, inputs, compute_a, compute_b, concat=True, transpose=T if transpose: jacobians = tf.transpose(jacobians) else: - jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, compute_a, compute_b]) + jac_a, jac_b = self.jacobian([*inputs[0:3], loc, scale, False, compute_a, compute_b]) del g if concat: From 44abc642202a9a7351e87f5b0e3871c3d957ee45 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:06:30 +0200 Subject: [PATCH 094/124] rm or equal cond for updated_b --- batchglm/train/tf2/base_glm/optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 3ffd2c63..dcef5a23 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -124,7 +124,7 @@ def _trust_region_ops( increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) if compute_b and not compute_a: - self.model.model_vars.updated_b |= increase_radius.numpy() # needs to be |= if maxiter > 1 + self.model.model_vars.updated_b = increase_radius.numpy() else: self.model.model_vars.updated = increase_radius.numpy() From 561306fa727d05b48c11afef0baad724beeaa5df Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:07:10 +0200 Subject: [PATCH 095/124] init updated vectors with False --- batchglm/train/tf2/base_glm/vars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/vars.py b/batchglm/train/tf2/base_glm/vars.py index f3d5bcce..f57ff08c 
100644 --- a/batchglm/train/tf2/base_glm/vars.py +++ b/batchglm/train/tf2/base_glm/vars.py @@ -69,8 +69,8 @@ def __init__( self.b_var = self.tf_clip_param(b_var, "b_var") # Properties to follow gene-wise convergence. - self.updated = np.repeat(a=True, repeats=self.params.shape[1]) # Initialise to is updated. - self.updated_b = np.repeat(a=True, repeats=self.params.shape[1]) # Initialise to is updated. + self.updated = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + self.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. self.converged = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. self.converged_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to non-converged. From 54b0bfb8bd5644d1e23c720d7cbafb8b03fc4f8c Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:07:47 +0200 Subject: [PATCH 096/124] added support for b_update_freq and maxiter --- batchglm/train/tf2/glm_nb/estimator.py | 36 ++++++++++++++------------ 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index ccc89b3f..1367f3e2 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -97,7 +97,9 @@ def train( stopping_criteria: int = 1000, autograd: bool = False, featurewise: bool = True, - benchmark: bool = False + benchmark: bool = False, + maxiter: int = 1, + b_update_freq = 5 ): if self.model is None: self.model = NBGLM( @@ -116,8 +118,7 @@ def train( self.optimizer = optimizer_object if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr']: self.update = self.update_separated - self.b_update_freq = 0 - self.epochs_until_b_update = self.b_update_freq + self.maxiter = maxiter super(Estimator, self)._train( noise_model="nb", @@ -129,7 +130,8 @@ def train( autograd=autograd, featurewise=featurewise, benchmark=benchmark, - optim_algo=optim_algo + optim_algo=optim_algo, + b_update_freq = b_update_freq ) def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): @@ -144,27 +146,27 @@ def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): intercept_scale=intercept_scale) return super().get_optimizer_object(optimizer, learning_rate) - def update_separated(self, results, batches, batch_features): + def update_separated(self, results, batches, batch_features, compute_b): - if self.epochs_until_b_update == 0: - self.model.model_vars.updated_b = False + self.optimizer.perform_parameter_update( + inputs=[batches, *results], + compute_a=True, + compute_b=False, + batch_features=batch_features, + is_batched=False + ) + if compute_b: self.optimizer.perform_parameter_update( inputs=[batches, *results], compute_a=False, compute_b=True, batch_features=batch_features, - is_batched=False + is_batched=False, + maxiter=self.maxiter ) - self.epochs_until_b_update = self.b_update_freq else: - self.epochs_until_b_update -= 1 - self.optimizer.perform_parameter_update( - inputs=[batches, *results], - compute_a=True, - compute_b=False, - batch_features=batch_features, - is_batched=False - ) + self.model.model_vars.updated_b = np.zeros_like(self.model.model_vars.updated_b) + def get_model_container( self, From 442572a79fe56e2dd11e31ecc060ca47c1a94996 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:09:40 +0200 Subject: [PATCH 097/124] get update_b loop to work --- batchglm/train/tf2/glm_nb/optim.py | 22 +++++++++++----------- 1 file changed, 11 
insertions(+), 11 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 55ca5128..8fe987b2 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -15,8 +15,6 @@ def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): if name.startswith('irls_gd'): self.update_b_func = self.update_b_gd - if intercept_scale: - self.delta_f_actual_b = self.intercept_delta_f_actual_b elif name in ['irls_ar_tr', 'irls_ar']: assert intercept_scale, "Line search (IRLS_AR_TR) is only available" \ "for scale models with a single coefficient (intercept scale)." @@ -41,7 +39,7 @@ def _trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False, maxiter=20): + def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False, maxiter=1): assert compute_a ^ compute_b, \ "IRLS_LS computes either loc or scale model updates, not both nor none at the same time." @@ -50,22 +48,24 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch super(IRLS_LS, self).perform_parameter_update( inputs, compute_a, compute_b, batch_features, is_batched) else: - self.model.model_vars.update_b = np.zeros(self.model.model_vars.n_features, dtype=np.bool) # global_step = tf.zeros_like(self.model.model_vars.remaining_features) results = inputs[1:4] x_batches = inputs[0] iteration = 0 - not_converged = np.zeros(self.model.model_vars.remaining_features, dtype=np.bool) + not_converged = np.zeros_like(self.model.model_vars.remaining_features) + updated_b = np.zeros_like(self.model.model_vars.updated_b) while True: iteration += 1 step = self.update_b_func([x_batches, *results], batch_features, is_batched) - not_converged = step.numpy() > pkg_constants.XTOL_BY_FEATURE_SCALE - if tf.reduce_any(not_converged) or iteration == maxiter: + not_converged = tf.abs(step).numpy() > pkg_constants.XTOL_BY_FEATURE_SCALE + updated_b |= self.model.model_vars.updated_b + if not tf.reduce_any(not_converged) or iteration == maxiter: break for i, x_batch in enumerate(inputs[0]): - results = self.model.calc_jacobians(x_batch, compute_a=False) if i == 0 else \ + results = self.model.calc_jacobians(x_batch, concat=False, compute_a=False) if i == 0 else \ [tf.math.add(results[i], x) for - i, x in enumerate(self.calc_jacobians(x_batch, compute_a=False))] + i, x in enumerate(self.calc_jacobians(x_batch, concat=False, compute_a=False))] + self.model.model_vars.updated_b = updated_b def gett1t2(self): t1 = tf.constant(pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE, dtype=self._dtype) @@ -138,7 +138,7 @@ def update_b_gd(self, inputs, batch_features, is_batched): is_batched=is_batched ) - return tf.where(update_theta, tr_update_b, tf.zeros_like(tr_update_b)) + return tf.where(update_theta, tr_proposed_vector_b, tf.zeros_like(tr_proposed_vector_b)) def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): @@ -152,7 +152,7 @@ def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): if alpha0 is None: alpha0 = tf.ones_like(jac_b) * pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE # self.tr_radius_b original_params_b_copy = self.model.params_copy[-1] - print(direction[0].numpy(), jac_b[0].numpy()) + #print(direction[0].numpy(), jac_b[0].numpy()) def phi(alpha): multiplier = tf.multiply(alpha, direction) new_scale_params = tf.add(original_params_b_copy, multiplier) 
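The phi(alpha) helper above evaluates the scale objective along the proposed search direction, and together with the wolfe1 check this amounts to a standard Armijo backtracking line search: start from a large step alpha and shrink it until the sufficient-decrease condition f(x + alpha*d) <= f(x) + c1*alpha*(grad . d) holds. A self-contained NumPy sketch of that scheme, independent of the batchglm classes (c1 and alpha0 mirror the WOLFE_C1 and ALPHA0 constants used in this series, while the shrink factor and iteration cap are illustrative):

    import numpy as np

    def armijo_backtracking(f, grad, x, direction, alpha0=100.0, c1=1e-3,
                            shrink=0.5, max_halvings=50):
        """Backtracking line search: shrink alpha until the first Wolfe (Armijo)
        condition f(x + alpha*d) <= f(x) + c1*alpha*<grad, d> is satisfied.
        Shown here for a scalar parameter, as in the intercept-scale case."""
        fx = f(x)
        slope = np.dot(grad(x), direction)   # directional derivative, < 0 for a descent direction
        alpha = alpha0
        for _ in range(max_halvings):
            if f(x + alpha * direction) <= fx + c1 * alpha * slope:
                return alpha                 # sufficient decrease reached
            alpha *= shrink                  # otherwise keep shrinking the step
        return 0.0                           # no acceptable step found

    # toy usage on f(x) = x^2 with the steepest-descent direction
    f = lambda x: float(x ** 2)
    g = lambda x: 2.0 * x
    alpha = armijo_backtracking(f, g, x=3.0, direction=-g(3.0))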
From b18bc5bee42f1a50976adc3db95eaa08fe0f7706 Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 15 Apr 2020 10:16:12 +0200 Subject: [PATCH 098/124] bugfix: catch errors in fisher inv --- batchglm/train/tf2/base_glm/estimator.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 4d683cc6..802f7721 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -1,4 +1,5 @@ import abc +import sys import logging import time import numpy as np @@ -252,10 +253,15 @@ def convergence_decision(num_converged, train_step): # store all the final results in this estimator instance. self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) - - # TODO: maybe report fisher inf here in the future instead of inverted hessian. - self._fisher_inv = tf.linalg.inv(results[2]).numpy() self._hessian = -results[2].numpy() + # TODO: maybe report fisher inf here in the future instead of inverted hessian. + fisher_inv = np.zeros_like(self._hessian) + invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0] + num_non_invertible = n_features - len(invertible) + if num_non_invertible > 0: + logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features.") + fisher_inv[invertible] = np.linalg.inv(- self._hessian[invertible]) + self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale From 9d5f4588df554cec762beb9d8d76c2ad529e034c Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 22 Apr 2020 18:28:17 +0200 Subject: [PATCH 099/124] bugfix: wrong method call to jacobian --- batchglm/train/tf2/glm_nb/optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 8fe987b2..16211345 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -64,7 +64,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch for i, x_batch in enumerate(inputs[0]): results = self.model.calc_jacobians(x_batch, concat=False, compute_a=False) if i == 0 else \ [tf.math.add(results[i], x) for - i, x in enumerate(self.calc_jacobians(x_batch, concat=False, compute_a=False))] + i, x in enumerate(self.model.calc_jacobians(x_batch, concat=False, compute_a=False))] self.model.model_vars.updated_b = updated_b def gett1t2(self): From 5205d4e7f6cd6583f65d1d02e403d45721473b6e Mon Sep 17 00:00:00 2001 From: picciama Date: Wed, 22 Apr 2020 18:29:47 +0200 Subject: [PATCH 100/124] set to updated if proposed vector smaller TR_TOL --- batchglm/train/tf2/base_glm/optim.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index dcef5a23..c4b91926 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -33,11 +33,6 @@ def _create_slots(self, var_list): self.add_slot(var_list[0], 'mu_r') - def gett1t2(self): - t1 = tf.constant(pkg_constants.TRUST_REGION_T1, dtype=self._dtype) - t2 = tf.constant(pkg_constants.TRUST_REGION_T2, dtype=self._dtype) - return t1, t2 - def _trust_region_ops( self, x_batches, @@ -47,7 +42,7 @@ def _trust_region_ops( compute_a, compute_b, batch_features, - is_batched + is_batched, ): # Load hyper-parameters: # assert pkg_constants.TRUST_REGION_ETA0 < 
pkg_constants.TRUST_REGION_ETA1, \ @@ -59,8 +54,14 @@ def _trust_region_ops( # Set trust region hyper-parameters eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) # eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) - t1, t2 = self.gett1t2() - + if compute_b and not compute_a: + t1 = pkg_constants.TRUST_REGION_T1 + t2 = pkg_constants.TRUST_REGION_T2 + else: + t1 = pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE + t2 = pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE + t1 = tf.constant(t1, dtype=self._dtype) + t2 = tf.constant(t2, dtype=self._dtype) upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) # Phase I: Perform a trial update. @@ -88,6 +89,7 @@ def _trust_region_ops( """ original_params_copy = tf.identity(self.model.params_copy) self.model.params_copy.assign_sub(proposed_vector) + for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch])[0] if i == 0: @@ -111,7 +113,7 @@ def _trust_region_ops( # Compute parameter updates.g update_theta = delta_f_actual > eta0 self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) - + update_theta |= tf.sqrt(tf.reduce_sum(tf.square(proposed_vector), axis=0)) < pkg_constants.TRTOL_BY_FEATURE_LOC #update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) #keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric @@ -128,6 +130,7 @@ def _trust_region_ops( else: self.model.model_vars.updated = increase_radius.numpy() + # Update trusted region accordingly: keep_radius = tf.logical_and(tf.logical_not(decrease_radius), From bece86289a48598a9ec157e795cc1c0c93e89782 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:20:24 +0200 Subject: [PATCH 101/124] make TRTOL same as XTOL --- batchglm/pkg_constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 2204a6c2..8d8c4cc3 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -29,8 +29,8 @@ GTOL_BY_FEATURE_LOC = 1e-8 GTOL_BY_FEATURE_SCALE = 1e-8 -TRTOL_BY_FEATURE_LOC = 1e-12 -TRTOL_BY_FEATURE_SCALE = 1e-12 +TRTOL_BY_FEATURE_LOC = 1e-8 +TRTOL_BY_FEATURE_SCALE = 1e-6 FEATUREWISE_THRESHOLD = 10 # the minimal number of features to converge before next featurewise batch FEATUREWISE_RECALCULATE = False # if set to True, recalculate the results from the previous train step From 20e718869adccae925a46a7b811627fae9cc4907 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:22:28 +0200 Subject: [PATCH 102/124] cleanup train/base, rm Loss class --- batchglm/train/tf2/base/__init__.py | 2 +- batchglm/train/tf2/base/estimator.py | 6 +----- batchglm/train/tf2/base/model.py | 14 ++------------ 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/batchglm/train/tf2/base/__init__.py b/batchglm/train/tf2/base/__init__.py index 9b75ab32..bc66a13a 100644 --- a/batchglm/train/tf2/base/__init__.py +++ b/batchglm/train/tf2/base/__init__.py @@ -1,3 +1,3 @@ from .estimator import TFEstimator -from .model import ProcessModelBase, ModelBase, LossBase +from .model import ProcessModelBase, ModelBase from .optim import OptimizerBase diff --git a/batchglm/train/tf2/base/estimator.py b/batchglm/train/tf2/base/estimator.py index a90ec901..7c103b02 100644 --- a/batchglm/train/tf2/base/estimator.py +++ b/batchglm/train/tf2/base/estimator.py @@ -1,13 +1,9 @@ -from .external import pkg_constants -from .model import 
ModelBase, LossBase - -import numpy as np +from .model import ModelBase import tensorflow as tf class TFEstimator: model: ModelBase - loss: LossBase def __init__(self, input_data, dtype): diff --git a/batchglm/train/tf2/base/model.py b/batchglm/train/tf2/base/model.py index acce4dee..76eaa6fa 100644 --- a/batchglm/train/tf2/base/model.py +++ b/batchglm/train/tf2/base/model.py @@ -8,24 +8,14 @@ class ModelBase(tf.keras.Model, metaclass=abc.ABCMeta): - def __init__(self): - super(ModelBase, self).__init__() + def __init__(self, dtype): + super(ModelBase, self).__init__(dtype=dtype) @abc.abstractmethod def call(self, inputs, training=False, mask=None): pass -class LossBase(tf.keras.losses.Loss, metaclass=abc.ABCMeta): - - def __init__(self): - super(LossBase, self).__init__() - - @abc.abstractmethod - def call(self, y_true, y_pred): - pass - - class ProcessModelBase: @abc.abstractmethod From 83648d2375d7a37e050b8b454018fd2616cc1fb1 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:24:18 +0200 Subject: [PATCH 103/124] rm layers as args, Loss + add layers in base class --- batchglm/train/tf2/base_glm/model.py | 64 +++++++++++----------------- batchglm/train/tf2/glm_beta/model.py | 44 +++++-------------- batchglm/train/tf2/glm_nb/model.py | 43 +++++-------------- batchglm/train/tf2/glm_norm/model.py | 55 +++++------------------- 4 files changed, 58 insertions(+), 148 deletions(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 69f8d0ef..396abb8e 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -1,37 +1,32 @@ +from importlib import import_module import logging import tensorflow as tf -import numpy as np -from .external import ModelBase, LossBase +from .external import ModelBase from .processModel import ProcessModelGLM logger = logging.getLogger("batchglm") class GLM(ModelBase, ProcessModelGLM): - """ base GLM class containg the model call. """ - compute_a: bool = True - compute_b: bool = True - def __init__( self, model_vars, - unpack_params: tf.keras.layers.Layer, - linear_loc: tf.keras.layers.Layer, - linear_scale: tf.keras.layers.Layer, - linker_loc: tf.keras.layers.Layer, - linker_scale: tf.keras.layers.Layer, - likelihood: tf.keras.layers.Layer, - jacobian: tf.keras.layers.Layer, - hessian: tf.keras.layers.Layer, - fim: tf.keras.layers.Layer, optimizer: str, + noise_module: str, use_gradient_tape: bool = False, + compute_a: bool = True, + compute_b: bool = True, + dtype: str = "float32", ): - super(GLM, self).__init__() + super(GLM, self).__init__(dtype=dtype) + self.model_vars = model_vars + self.use_gradient_tape = use_gradient_tape + self.compute_a = compute_a + self.compute_b = compute_b self.params = tf.Variable(tf.concat( [ model_vars.init_a_clipped, @@ -39,19 +34,21 @@ def __init__( ], axis=0 ), name="params", trainable=True) - - self.unpack_params = unpack_params - self.linear_loc = linear_loc - self.linear_scale = linear_scale - self.linker_loc = linker_loc - self.linker_scale = linker_scale - self.likelihood = likelihood - self.jacobian = jacobian - self.hessian = hessian - self.fim = fim - self.use_gradient_tape = use_gradient_tape self.params_copy = self.params + # import and add noise model specific layers. + layers = import_module('...' + noise_module + '.layers', __name__) + grad_layers = import_module('...' 
+ noise_module + '.layers_gradients', __name__) + self.unpack_params = layers.UnpackParams(dtype=dtype) + self.linear_loc = layers.LinearLoc(dtype=dtype) + self.linear_scale = layers.LinearScale(dtype=dtype) + self.linker_loc = layers.LinkerLoc(dtype=dtype) + self.linker_scale = layers.LinkerScale(dtype=dtype) + self.likelihood = layers.Likelihood(dtype=dtype) + self.jacobian = grad_layers.Jacobian(model_vars=model_vars, dtype=dtype) + self.hessian = grad_layers.Hessian(model_vars=model_vars, dtype=dtype) + self.fim = grad_layers.FIM(model_vars=model_vars, dtype=dtype) + self.setMethod(optimizer) def setMethod(self, optimizer: str): @@ -67,7 +64,7 @@ def setMethod(self, optimizer: str): elif optimizer in ['nr', 'nr_tr']: self._calc = self._calc_hessians - elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr']: + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr', 'irls_tr_gd_tr']: self._calc = self._calc_fim else: assert False, ("Unrecognized optimizer: %s", optimizer) @@ -233,14 +230,3 @@ def call(self, inputs, compute_a=True, compute_b=None): if compute_b is None: compute_b = self.compute_b return self._calc(inputs, compute_a, compute_b) - -class LossGLM(LossBase): - - def norm_log_likelihood(self, log_probs): - return tf.reduce_mean(log_probs, axis=0, name="log_likelihood") - - def norm_neg_log_likelihood(self, log_probs): - return - self.norm_log_likelihood(log_probs) - - def call(self, y_true, log_probs): - return tf.reduce_sum(self.norm_neg_log_likelihood(log_probs)) diff --git a/batchglm/train/tf2/glm_beta/model.py b/batchglm/train/tf2/glm_beta/model.py index bdc7f1cd..556f5335 100644 --- a/batchglm/train/tf2/glm_beta/model.py +++ b/batchglm/train/tf2/glm_beta/model.py @@ -1,46 +1,24 @@ -import logging - -from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood -from .external import GLM, LossGLM -from .layers_gradients import Jacobian, Hessian, FIM +from .external import GLM from .processModel import ProcessModel -logger = logging.getLogger(__name__) - class BetaGLM(GLM, ProcessModel): def __init__( self, model_vars, - dtype, - compute_a, - compute_b, - use_gradient_tape, - optimizer + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, ): - self.compute_a = compute_a - self.compute_b = compute_b - super(BetaGLM, self).__init__( model_vars=model_vars, - unpack_params=UnpackParams(), - linear_loc=LinearLoc(), - linear_scale=LinearScale(), - linker_loc=LinkerLoc(), - linker_scale=LinkerScale(), - likelihood=Likelihood(dtype), - jacobian=Jacobian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - hessian=Hessian(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), - fim=FIM(model_vars=model_vars, compute_a=compute_a, compute_b=compute_b, dtype=dtype), + noise_module='glm_beta', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, use_gradient_tape=use_gradient_tape, - optimizer=optimizer - + dtype=dtype ) - - -class LossGLMBeta(LossGLM): - - """ - Full class - """ diff --git a/batchglm/train/tf2/glm_nb/model.py b/batchglm/train/tf2/glm_nb/model.py index be08868e..bd2dbe66 100644 --- a/batchglm/train/tf2/glm_nb/model.py +++ b/batchglm/train/tf2/glm_nb/model.py @@ -1,45 +1,24 @@ -import logging - -from .external import LossGLM, GLM -from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood -from .layers_gradients import Jacobian, 
Hessian, FIM - +from .external import GLM from .processModel import ProcessModel -logger = logging.getLogger(__name__) - class NBGLM(GLM, ProcessModel): def __init__( self, model_vars, - dtype, - compute_a, - compute_b, - use_gradient_tape, - optimizer + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, ): - self.compute_a = compute_a - self.compute_b = compute_b - super(NBGLM, self).__init__( model_vars=model_vars, - unpack_params=UnpackParams(), - linear_loc=LinearLoc(), - linear_scale=LinearScale(), - linker_loc=LinkerLoc(), - linker_scale=LinkerScale(), - likelihood=Likelihood(dtype), - jacobian=Jacobian(model_vars=model_vars, dtype=dtype), - hessian=Hessian(model_vars=model_vars, dtype=dtype), - fim=FIM(model_vars=model_vars, dtype=dtype), + noise_module='glm_nb', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, use_gradient_tape=use_gradient_tape, - optimizer=optimizer + dtype=dtype ) - - -class LossGLMNB(LossGLM): - """ - Full class - """ diff --git a/batchglm/train/tf2/glm_norm/model.py b/batchglm/train/tf2/glm_norm/model.py index 58e31636..7bcb41fe 100644 --- a/batchglm/train/tf2/glm_norm/model.py +++ b/batchglm/train/tf2/glm_norm/model.py @@ -1,57 +1,24 @@ -import logging - -from .layers import UnpackParams, LinearLoc, LinearScale, LinkerLoc, LinkerScale, Likelihood -from .layers_gradients import Jacobian, Hessian, FIM -from .external import GLM, LossGLM +from .external import GLM from .processModel import ProcessModel -logger = logging.getLogger(__name__) - class NormGLM(GLM, ProcessModel): def __init__( self, model_vars, - dtype, - compute_a, - compute_b, - use_gradient_tape, - optimizer + optimizer: str, + compute_a: bool, + compute_b: bool, + use_gradient_tape: bool, + dtype: str, ): - self.compute_a = compute_a - self.compute_b = compute_b - super(NormGLM, self).__init__( model_vars=model_vars, - unpack_params=UnpackParams(), - linear_loc=LinearLoc(), - linear_scale=LinearScale(), - linker_loc=LinkerLoc(), - linker_scale=LinkerScale(), - likelihood=Likelihood(dtype), - jacobian=Jacobian( - model_vars=model_vars, - compute_a=self.compute_a, - compute_b=self.compute_b, - dtype=dtype), - hessian=Hessian( - model_vars=model_vars, - compute_a=self.compute_a, - compute_b=self.compute_b, - dtype=dtype), - fim=FIM( - model_vars=model_vars, - compute_a=self.compute_a, - compute_b=self.compute_b, - dtype=dtype), + noise_module='glm_norm', + optimizer=optimizer, + compute_a=compute_a, + compute_b=compute_b, use_gradient_tape=use_gradient_tape, - optimizer=optimizer + dtype=dtype ) - - -class LossGLMNorm(LossGLM): - - """ - Full class - """ From dc4fd933d5f7612cdcd321f993131c6c86708c71 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:26:04 +0200 Subject: [PATCH 104/124] cleanup glm_norm folder, rm Loss --- batchglm/train/tf2/glm_norm/estimator.py | 7 ++----- batchglm/train/tf2/glm_norm/external.py | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/glm_norm/estimator.py b/batchglm/train/tf2/glm_norm/estimator.py index d92798ab..b66176da 100644 --- a/batchglm/train/tf2/glm_norm/estimator.py +++ b/batchglm/train/tf2/glm_norm/estimator.py @@ -6,7 +6,7 @@ from .external import closedform_norm_glm_logsd from .external import InputDataGLM, Model from .external import Estimator as GLMEstimator -from .model import NormGLM, LossGLMNorm +from .model import NormGLM from .processModel import ProcessModel from .vars import ModelVars from .training_strategies import TrainingStrategies 
@@ -22,7 +22,6 @@ class Estimator(GLMEstimator, ProcessModel): """ model: NormGLM - loss: LossGLMNorm def __init__( self, @@ -30,7 +29,7 @@ def __init__( init_a: Union[np.ndarray, str] = "AUTO", init_b: Union[np.ndarray, str] = "AUTO", quick_scale: bool = False, - dtype="float64", + dtype="float32", ): """ Performs initialisation and creates a new estimator. @@ -115,8 +114,6 @@ def train( else: self.model.setMethod(optim_algo) - self._loss = LossGLMNorm() - optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) super(Estimator, self)._train( diff --git a/batchglm/train/tf2/glm_norm/external.py b/batchglm/train/tf2/glm_norm/external.py index 4b290d2e..5fc7ab0b 100644 --- a/batchglm/train/tf2/glm_norm/external.py +++ b/batchglm/train/tf2/glm_norm/external.py @@ -7,6 +7,6 @@ from batchglm.utils.linalg import groupwise_solve_lm from batchglm import pkg_constants -from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, LossGLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM From 1a462837ee1b147ebe5b0e40247b15732d65b789 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:26:46 +0200 Subject: [PATCH 105/124] cleanup glm_beta folder, rm Loss --- batchglm/train/tf2/glm_beta/estimator.py | 6 ++---- batchglm/train/tf2/glm_beta/external.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/estimator.py b/batchglm/train/tf2/glm_beta/estimator.py index 57ae76e5..a2f3f056 100644 --- a/batchglm/train/tf2/glm_beta/estimator.py +++ b/batchglm/train/tf2/glm_beta/estimator.py @@ -6,7 +6,7 @@ from .external import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize from .external import InputDataGLM, Model from .external import Estimator as GLMEstimator -from .model import BetaGLM, LossGLMBeta +from .model import BetaGLM from .processModel import ProcessModel from .vars import ModelVars from .training_strategies import TrainingStrategies @@ -26,7 +26,7 @@ def __init__( init_a: Union[np.ndarray, str] = "AUTO", init_b: Union[np.ndarray, str] = "AUTO", quick_scale: bool = False, - dtype="float64", + dtype="float32", ): """ Performs initialisation and creates a new estimator. 
@@ -112,8 +112,6 @@ def train( else: self.model.setMethod(optim_algo) - self._loss = LossGLMBeta() - optimizer_object = self.get_optimizer_object(optim_algo, learning_rate) super(Estimator, self)._train( diff --git a/batchglm/train/tf2/glm_beta/external.py b/batchglm/train/tf2/glm_beta/external.py index f7b5d508..6ee962a0 100644 --- a/batchglm/train/tf2/glm_beta/external.py +++ b/batchglm/train/tf2/glm_beta/external.py @@ -6,7 +6,6 @@ from batchglm.models.glm_beta.utils import closedform_beta_glm_logitmean, closedform_beta_glm_logsamplesize from batchglm.utils.linalg import groupwise_solve_lm -from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, LossGLM, Estimator, ModelVarsGLM +from batchglm.train.tf2.base_glm import ProcessModelGLM, GLM, Estimator, ModelVarsGLM from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM, LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM - From ae9d864dc381087a13d411a54e752fb6baef3805 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:28:21 +0200 Subject: [PATCH 106/124] ll_dtype->self.dtype in likelihood --- batchglm/train/tf2/glm_nb/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/glm_nb/layers.py b/batchglm/train/tf2/glm_nb/layers.py index 4aff0436..afa67f7e 100644 --- a/batchglm/train/tf2/glm_nb/layers.py +++ b/batchglm/train/tf2/glm_nb/layers.py @@ -43,7 +43,7 @@ def _ll(self, eta_loc, eta_scale, loc, scale, x): if isinstance(x, tf.SparseTensor): log_probs_sparse = x.__mul__(eta_loc - log_r_plus_mu) log_probs_dense = tf.math.lgamma(tf.sparse.add(x, scale)) - \ - tf.math.lgamma(tf.sparse.add(x, tf.ones(shape=x.dense_shape, dtype=self.ll_dtype))) - \ + tf.math.lgamma(tf.sparse.add(x, tf.ones(shape=x.dense_shape, dtype=self.dtype))) - \ tf.math.lgamma(scale) + \ tf.multiply(scale, eta_scale - log_r_plus_mu) log_probs = tf.sparse.add(log_probs_sparse, log_probs_dense) From 3fc33b5e57babd2733b422855319093e219042e6 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:29:05 +0200 Subject: [PATCH 107/124] rm Loss --- batchglm/train/tf2/glm_nb/external.py | 1 - 1 file changed, 1 deletion(-) diff --git a/batchglm/train/tf2/glm_nb/external.py b/batchglm/train/tf2/glm_nb/external.py index 5d62ca66..2d6638fa 100644 --- a/batchglm/train/tf2/glm_nb/external.py +++ b/batchglm/train/tf2/glm_nb/external.py @@ -14,7 +14,6 @@ from batchglm.train.tf2.base_glm import LinearLocGLM, LinearScaleGLM, LinkerLocGLM from batchglm.train.tf2.base_glm import LinkerScaleGLM, LikelihoodGLM, UnpackParamsGLM from batchglm.train.tf2.base_glm import FIMGLM, JacobianGLM, HessianGLM -from batchglm.train.tf2.base_glm import LossGLM from batchglm.train.tf2.base_glm import Estimator # these are needed for nb specific irls_ls_tr training From 7ab343e8405409d18010f726b3449d63ba943de7 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:29:40 +0200 Subject: [PATCH 108/124] add support for irls(_tr)_gd_tr, cleanup --- batchglm/train/tf2/glm_nb/estimator.py | 8 ++-- batchglm/train/tf2/glm_nb/optim.py | 59 +++++++++++++++++++------- 2 files changed, 47 insertions(+), 20 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index 1367f3e2..cb90e6ca 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -27,7 +27,7 @@ def __init__( init_a: Union[np.ndarray, str] = "AUTO", init_b: Union[np.ndarray, str] = "AUTO", 
quick_scale: bool = False, - dtype="float64", + dtype="float32", ): """ Performs initialisation and creates a new estimator. @@ -116,7 +116,7 @@ def train( intercept_scale = len(self.model.model_vars.idx_train_scale) == 1 optimizer_object = self.get_optimizer_object(optim_algo, learning_rate, intercept_scale) self.optimizer = optimizer_object - if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr']: + if optim_algo.lower() in ['irls_gd_tr', 'irls_ar_tr', 'irls_tr_gd_tr']: self.update = self.update_separated self.maxiter = maxiter @@ -131,12 +131,12 @@ def train( featurewise=featurewise, benchmark=benchmark, optim_algo=optim_algo, - b_update_freq = b_update_freq + b_update_freq=b_update_freq ) def get_optimizer_object(self, optimizer, learning_rate, intercept_scale): optim = optimizer.lower() - if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar_tr']: + if optim in ['irls_gd_tr', 'irls_gd', 'irls_ar_tr', 'irls_tr_gd_tr']: return IRLS_LS( dtype=self.dtype, tr_mode=optim.endswith('tr'), diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 16211345..97301fe7 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -6,14 +6,13 @@ class IRLS_LS(IRLS): def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): - super(IRLS_LS, self).__init__( - dtype=dtype, - trusted_region_mode=tr_mode, - model=model, - name=name, - n_obs=n_obs) - - if name.startswith('irls_gd'): + parent_tr_mode = False + self.tr_mode_b = False + if name.startswith('irls_tr'): + parent_tr_mode = True # for loc model + if name.startswith('irls_tr_gd'): + self.update_b_func = self.update_b_gd + elif name.startswith('irls_gd'): self.update_b_func = self.update_b_gd elif name in ['irls_ar_tr', 'irls_ar']: assert intercept_scale, "Line search (IRLS_AR_TR) is only available" \ @@ -21,12 +20,19 @@ def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): self.update_b_func = self.update_b_ar else: assert False, "Unrecognized method for optimization given." + super(IRLS_LS, self).__init__( + dtype=dtype, + tr_mode=parent_tr_mode, + model=model, + name=name, + n_obs=n_obs) if tr_mode: n_features = self.model.model_vars.n_features self.tr_radius_b = tf.Variable( np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE, dtype=self._dtype, trainable=False) + self.tr_mode_b = True def _trust_region_linear_cost_gain( self, @@ -39,8 +45,15 @@ def _trust_region_linear_cost_gain( ), axis=0) return pred_cost_gain - def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False, maxiter=1): - + def perform_parameter_update( + self, + inputs, + compute_a=True, + compute_b=True, + batch_features=False, + is_batched=False, + maxiter=1 + ): assert compute_a ^ compute_b, \ "IRLS_LS computes either loc or scale model updates, not both nor none at the same time." 
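The constructor above encodes the optimizer variant in its name string: an irls_tr prefix turns on the trust region for the location step (parent_tr_mode), a gd infix routes the scale update through update_b_gd, the ar variants route it through the Armijo line search in update_b_ar (restricted to intercept-only scale models), and tr_mode switches on the per-feature scale radius tr_radius_b. A stand-alone restatement of that dispatch, useful for checking which code path a given optimizer name takes (simplified sketch; the real class also wires up the model, radii and update functions):

    def parse_irls_variant(name: str, tr_mode: bool):
        """Simplified view of the IRLS_LS constructor logic: the name prefix decides
        whether the location step uses a trust region, the infix picks the scale
        update routine, and tr_mode switches on the per-feature scale radius."""
        name = name.lower()
        loc_trust_region = name.startswith('irls_tr')          # parent_tr_mode in the diff
        if name.startswith('irls_tr_gd') or name.startswith('irls_gd'):
            scale_update = 'gradient_descent'                   # -> update_b_gd
        elif name in ('irls_ar', 'irls_ar_tr'):
            scale_update = 'armijo_line_search'                 # -> update_b_ar (intercept scale only)
        else:
            raise ValueError("Unrecognized method for optimization given: %s" % name)
        scale_trust_region = tr_mode                            # -> tr_radius_b / tr_mode_b
        return loc_trust_region, scale_update, scale_trust_region

    assert parse_irls_variant('irls_gd_tr', tr_mode=True) == (False, 'gradient_descent', True)
    assert parse_irls_variant('irls_tr_gd_tr', tr_mode=True) == (True, 'gradient_descent', True)
    assert parse_irls_variant('irls_ar_tr', tr_mode=True) == (False, 'armijo_line_search', True)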
@@ -97,15 +110,24 @@ def update_b_gd(self, inputs, batch_features, is_batched): x_batches, log_probs, _, jac_b = inputs update_b = tf.transpose(jac_b) - if not self.trusted_region_mode: + if not self.tr_mode_b: update = self._pad_updates( update_raw=update_b, compute_a=False, compute_b=True ) + + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=False, + compute_b=True + ) self.model.params_copy.assign_sub(update) - return update + return tf.where(update_theta, update, tf.zeros_like(update)) else: if batch_features: @@ -127,16 +149,21 @@ def update_b_gd(self, inputs, batch_features, is_batched): ) # perform update - update_theta = self._trust_region_ops( + update_theta = self._trial_update( x_batches=x_batches, log_probs=log_probs, proposed_vector=tr_update_b, - proposed_gain=None, # TODO remove completely, not needed any longer + is_batched=is_batched, + compute_a=False, + compute_b=True) + self._trust_region_ops( + proposed_vector=tr_update_b, compute_a=False, compute_b=True, batch_features=batch_features, - is_batched=is_batched - ) + update_theta=update_theta) + + #print(self.tr_radius_b[self.model.model_vars.remaining_features]) return tf.where(update_theta, tr_proposed_vector_b, tf.zeros_like(tr_proposed_vector_b)) From 98d3fe6ed4a8fe475223a125c2f6ba0348cb0d03 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:30:42 +0200 Subject: [PATCH 109/124] cleanup, revert updates not only if tr_mode --- batchglm/train/tf2/base_glm/optim.py | 219 +++++++++++++++------------ 1 file changed, 120 insertions(+), 99 deletions(-) diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index c4b91926..2bd5670c 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -35,37 +35,75 @@ def _create_slots(self, var_list): def _trust_region_ops( self, - x_batches, - log_probs, proposed_vector, - proposed_gain, compute_a, compute_b, batch_features, - is_batched, + update_theta, ): - # Load hyper-parameters: - # assert pkg_constants.TRUST_REGION_ETA0 < pkg_constants.TRUST_REGION_ETA1, \ - # "eta0 must be smaller than eta1" - # assert pkg_constants.TRUST_REGION_ETA1 <= pkg_constants.TRUST_REGION_ETA2, \ - # "eta1 must be smaller than or equal to eta2" - # assert pkg_constants.TRUST_REGION_T1 <= 1, "t1 must be smaller than 1" - # assert pkg_constants.TRUST_REGION_T2 >= 1, "t1 must be larger than 1" - # Set trust region hyper-parameters - eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) - # eta2 = tf.constant(pkg_constants.TRUST_REGION_ETA2, dtype=self._dtype) if compute_b and not compute_a: - t1 = pkg_constants.TRUST_REGION_T1 - t2 = pkg_constants.TRUST_REGION_T2 - else: t1 = pkg_constants.TRUST_REGIONT_T1_IRLS_GD_TR_SCALE t2 = pkg_constants.TRUST_REGIONT_T2_IRLS_GD_TR_SCALE + else: + t1 = pkg_constants.TRUST_REGION_T1 + t2 = pkg_constants.TRUST_REGION_T2 t1 = tf.constant(t1, dtype=self._dtype) t2 = tf.constant(t2, dtype=self._dtype) upper_bound = tf.constant(pkg_constants.TRUST_REGION_UPPER_BOUND, dtype=self._dtype) # Phase I: Perform a trial update. 
# Propose parameter update: + + coeffs = self.model.model_vars.idx_train_loc if compute_b and not compute_a else self.model.model_vars.idx_train_scale + if compute_b: + idx_stop = self.model.model_vars.idx_train_scale[-1] + 1 + if compute_a: + idx_start = 0 + else: + idx_start = self.model.model_vars.idx_train_loc[-1] + 1 + else: + idx_start = 0 + idx_stop = self.model.model_vars.idx_train_loc[-1] + 1 + update_vector_length = tf.sqrt(tf.reduce_sum(tf.square(proposed_vector[idx_start:idx_stop]), axis=0)) + update_theta_full = update_theta + if batch_features: + n_features = self.model.model_vars.n_features + indices = tf.where(self.model.model_vars.remaining_features) + update_theta_full = tf.scatter_nd(indices, update_theta, shape=(n_features,)) + update_vector_length = tf.scatter_nd(indices, update_vector_length, shape=(n_features,)) + + tr_radius = self.tr_radius_b if compute_b and not compute_a else self.tr_radius + + increase_radius = update_theta_full + decrease_radius = tf.logical_not(update_theta_full) + + if compute_a: + increase_radius = tf.logical_and(increase_radius, update_vector_length > 0.9 * tr_radius) + decrease_radius = tf.logical_or(decrease_radius, update_vector_length < 0.5 * tr_radius) + + if compute_b and not compute_a: + self.model.model_vars.updated_b = update_theta_full.numpy() + else: + self.model.model_vars.updated = update_theta_full.numpy() + + # Update trusted region accordingly: + + keep_radius = tf.logical_and(tf.logical_not(decrease_radius), + tf.logical_not(increase_radius)) + radius_update = tf.add_n([ + tf.multiply(t1, tf.cast(decrease_radius, self._dtype)), + tf.multiply(t2, tf.cast(increase_radius, self._dtype)), + tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) + ]) + + radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) + tr_radius.assign(radius_new) + + return update_theta + + def _trial_update(self, x_batches, log_probs, proposed_vector, is_batched, compute_a, compute_b): + + eta0 = tf.constant(pkg_constants.TRUST_REGION_ETA0, dtype=self._dtype) """ Current likelihood refers to the likelihood that has been calculated in the last model call. 
We are always evaluating on the full model, so if we train on the batched model (is_batched), @@ -113,61 +151,27 @@ def _trust_region_ops( # Compute parameter updates.g update_theta = delta_f_actual > eta0 self.model.params_copy.assign(tf.where(update_theta, self.model.params_copy, original_params_copy)) - update_theta |= tf.sqrt(tf.reduce_sum(tf.square(proposed_vector), axis=0)) < pkg_constants.TRTOL_BY_FEATURE_LOC - #update_theta_numeric = tf.expand_dims(tf.cast(update_theta, self._dtype), axis=0) - #keep_theta_numeric = tf.ones_like(update_theta_numeric) - update_theta_numeric - - decrease_radius = tf.math.logical_not(update_theta) - increase_radius = update_theta - if batch_features: - n_features = self.model.model_vars.n_features - indices = tf.where(self.model.model_vars.remaining_features) - decrease_radius = tf.scatter_nd(indices, decrease_radius, shape=(n_features,)) - increase_radius = tf.scatter_nd(indices, update_theta, shape=(n_features,)) - - if compute_b and not compute_a: - self.model.model_vars.updated_b = increase_radius.numpy() - else: - self.model.model_vars.updated = increase_radius.numpy() - - - # Update trusted region accordingly: - - keep_radius = tf.logical_and(tf.logical_not(decrease_radius), - tf.logical_not(increase_radius)) - radius_update = tf.add_n([ - tf.multiply(t1, tf.cast(decrease_radius, self._dtype)), - tf.multiply(t2, tf.cast(increase_radius, self._dtype)), - tf.multiply(tf.ones_like(t1), tf.cast(keep_radius, self._dtype)) - ]) - - if compute_b and not compute_a: - tr_radius = self.tr_radius_b - else: - tr_radius = self.tr_radius - - radius_new = tf.minimum(tf.multiply(tr_radius, radius_update), upper_bound) - tr_radius.assign(radius_new) return update_theta - def __init__(self, dtype: tf.dtypes.DType, trusted_region_mode: bool, model: tf.keras.Model, name: str, n_obs: int): + def __init__(self, dtype: tf.dtypes.DType, tr_mode: bool, model: tf.keras.Model, name: str, n_obs: int): super(SecondOrderOptim, self).__init__(name) self.model = model self._dtype = dtype self.n_obs = tf.cast(n_obs, dtype=self._dtype) - self.trusted_region_mode = trusted_region_mode + self.tr_mode = tr_mode - if trusted_region_mode: - n_features = self.model.model_vars.n_features + n_features = self.model.model_vars.n_features + if tr_mode: self.tr_radius = tf.Variable( np.zeros(shape=[n_features]) + pkg_constants.TRUST_REGION_RADIUS_INIT, dtype=self._dtype, trainable=False) else: self.tr_radius = tf.Variable(np.array([np.inf]), dtype=self._dtype, trainable=False) + self.model.model_vars.updated = np.repeat(a=True, repeats=n_features) @abc.abstractmethod def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): @@ -286,17 +290,18 @@ def _get_updates(self, lhs, rhs, compute_a, compute_b): def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jacobians, hessians = inputs + tr_mode = self.tr_mode if compute_b: if not compute_a: self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + tr_mode = self.tr_mode_b assert (compute_a or compute_b), "Nothing can be trained. Please make sure" \ "at least one of train_mu and train_r is set to True." 
update_raw, update = self._get_updates(hessians, jacobians, compute_a, compute_b) - if self.trusted_region_mode: - + if tr_mode: if batch_features: radius_container = tf.boolean_mask( tensor=self.tr_radius, @@ -307,31 +312,43 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch update_raw=update_raw, radius_container=radius_container ) - tr_pred_cost_gain = self._trust_region_newton_cost_gain( + #tr_pred_cost_gain = self._trust_region_newton_cost_gain( + # proposed_vector=tr_proposed_vector, + # neg_jac=jacobians, + # hessian_fim=hessians + #) + + #tr_proposed_vector_pad = self._pad_updates( + # update_raw=tr_proposed_vector, + # compute_a=compute_a, + # compute_b=compute_b + #) + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, proposed_vector=tr_proposed_vector, - neg_jac=jacobians, - hessian_fim=hessians - ) - - tr_proposed_vector_pad = self._pad_updates( - update_raw=tr_proposed_vector, + is_batched=is_batched, compute_a=compute_a, - compute_b=compute_b - ) - + compute_b=compute_b) self._trust_region_ops( - x_batches=x_batches, - log_probs=log_probs, - proposed_vector=tr_proposed_vector_pad, - proposed_gain=tr_pred_cost_gain, + proposed_vector=tr_proposed_vector, compute_a=compute_a, compute_b=compute_b, batch_features=batch_features, - is_batched=is_batched + update_theta=update_theta ) else: - self.model.params_copy.assign_sub(update) + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + + #self.model.params_copy.assign_sub(update) class IRLS(SecondOrderOptim): @@ -364,21 +381,23 @@ def _calc_proposed_vector_and_pred_cost_gain( radius_container=radius_container ) - pred_cost_gain_x = self._trust_region_newton_cost_gain( - proposed_vector=proposed_vector_x, - neg_jac=neg_jac_x, - hessian_fim=fim_x - ) + #pred_cost_gain_x = self._trust_region_newton_cost_gain( + # proposed_vector=proposed_vector_x, + # neg_jac=neg_jac_x, + # hessian_fim=fim_x + #) - return proposed_vector_x, pred_cost_gain_x + return proposed_vector_x, None#pred_cost_gain_x def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch_features=False, is_batched=False): x_batches, log_probs, jac_a, jac_b, fim_a, fim_b = inputs + tr_mode = self.tr_mode if compute_b: if not compute_a: self.model.model_vars.updated_b = np.repeat(a=False, repeats=self.params.shape[1]) # Initialise to is updated. + tr_mode = self.tr_mode_b assert (compute_a or compute_b), "Nothing can be trained. Please make sure" \ "at least one of train_mu and train_r is set to True." 
@@ -401,7 +420,7 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch rhs=jac_b ) - if not self.trusted_region_mode: + if not tr_mode: if compute_a: if compute_b: update_raw = tf.concat([update_a, update_b], axis=0) @@ -416,18 +435,16 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch compute_b=compute_b ) - if batch_features: - indices = tf.where(self.model.model_vars.remaining_features) - update_var = tf.transpose( - tf.scatter_nd( - indices, - tf.transpose(update), - shape=(self.model.model_vars.n_features, update.get_shape()[0]) - ) - ) - else: - update_var = update - self.model.params_copy.assign_sub(update_var) + update_theta = self._trial_update( + x_batches=x_batches, + log_probs=log_probs, + proposed_vector=update, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b + ) + #print(update_theta) + #self.model.params_copy.assign_sub(update_var) else: # put together update_raw based on proposed vector and cost gain depending on train_r and train_mu @@ -446,12 +463,12 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch update_a, radius_container, jac_a, fim_a) tr_update_raw = tf.concat([tr_proposed_vector_a, tr_proposed_vector_b], axis=0) - tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) + #tr_pred_cost_gain = tf.add(tr_pred_cost_gain_a, tr_pred_cost_gain_b) else: # directly apply output of calc_proposed_vector_and_pred_cost_gain to tr_update_raw # and tr_pred_cost_gain tr_update_raw = tr_proposed_vector_b - tr_pred_cost_gain = tr_pred_cost_gain_b + #tr_pred_cost_gain = tr_pred_cost_gain_b else: # here train_r is False AND train_mu is true, so the output of the function can directly be applied to # tr_update_raw and tr_pred_cost_gain, similar to train_r = True and train_mu = False @@ -463,18 +480,22 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch compute_a=compute_a, compute_b=compute_b ) - # perform update - self._trust_region_ops( + update_theta = self._trial_update( x_batches=x_batches, log_probs=log_probs, proposed_vector=tr_update, - proposed_gain=tr_pred_cost_gain, + is_batched=is_batched, + compute_a=compute_a, + compute_b=compute_b) + self._trust_region_ops( + proposed_vector=tr_update, compute_a=compute_a, compute_b=compute_b, batch_features=batch_features, - is_batched=is_batched + update_theta=update_theta ) + print(self.tr_radius[self.model.model_vars.remaining_features]) def calc_delta_f_actual(self, current_likelihood, new_likelihood, jacobian): eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) From 75ed5b808628736912bae730380e2c46486e953e Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:32:14 +0200 Subject: [PATCH 110/124] pass dtype to keras superclass for all layers --- batchglm/train/tf2/base_glm/layers.py | 25 ++++++------ .../train/tf2/base_glm/layers_gradients.py | 39 +++++++++---------- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/batchglm/train/tf2/base_glm/layers.py b/batchglm/train/tf2/base_glm/layers.py index 4c22da02..5e66ab10 100644 --- a/batchglm/train/tf2/base_glm/layers.py +++ b/batchglm/train/tf2/base_glm/layers.py @@ -2,8 +2,6 @@ import abc import tensorflow as tf -tf.keras.backend.set_floatx("float64") - from .processModel import ProcessModelGLM @@ -13,8 +11,8 @@ class UnpackParamsGLM(tf.keras.layers.Layer, ProcessModelGLM): Layer that slices the parameter tensor into mean and variance block. 
""" - def __init__(self): - super(UnpackParamsGLM, self).__init__() + def __init__(self, dtype): + super(UnpackParamsGLM, self).__init__(dtype=dtype) def call(self, inputs, **kwargs): """ @@ -40,8 +38,8 @@ class LinearLocGLM(tf.keras.layers.Layer, ProcessModelGLM): Computes the dot product between the design matrix of the mean model and the mean block of the parameter matrix. """ - def __init__(self): - super(LinearLocGLM, self).__init__() + def __init__(self, dtype): + super(LinearLocGLM, self).__init__(dtype=dtype) def _eta_loc( self, @@ -111,8 +109,8 @@ class LinearScaleGLM(tf.keras.layers.Layer, ProcessModelGLM): and the variance block of the parameter matrix. """ - def __init__(self): - super(LinearScaleGLM, self).__init__() + def __init__(self, dtype): + super(LinearScaleGLM, self).__init__(dtype=dtype) def _eta_scale( self, @@ -166,8 +164,8 @@ class LinkerLocGLM(tf.keras.layers.Layer): Translation from linker to data space for the mean model. """ - def __init__(self): - super(LinkerLocGLM, self).__init__() + def __init__(self, dtype): + super(LinkerLocGLM, self).__init__(dtype=dtype) @abc.abstractmethod def _inv_linker(self, loc: tf.Tensor): @@ -202,8 +200,8 @@ class LinkerScaleGLM(tf.keras.layers.Layer): Translation from linker to data space for the variance model. """ - def __init__(self): - super(LinkerScaleGLM, self).__init__() + def __init__(self, dtype): + super(LinkerScaleGLM, self).__init__(dtype=dtype) @abc.abstractmethod def _inv_linker(self, scale: tf.Tensor): @@ -230,8 +228,7 @@ class LikelihoodGLM(tf.keras.layers.Layer, ProcessModelGLM): """ def __init__(self, dtype): - super(LikelihoodGLM, self).__init__() - self.ll_dtype = dtype + super(LikelihoodGLM, self).__init__(dtype=dtype) @abc.abstractmethod def _ll(self, eta_loc, eta_scale, loc, scale, x): diff --git a/batchglm/train/tf2/base_glm/layers_gradients.py b/batchglm/train/tf2/base_glm/layers_gradients.py index 65df20b6..a807e51f 100644 --- a/batchglm/train/tf2/base_glm/layers_gradients.py +++ b/batchglm/train/tf2/base_glm/layers_gradients.py @@ -7,9 +7,8 @@ class Gradient(tf.keras.layers.Layer): """Superclass for Jacobians, Hessian, FIM""" def __init__(self, model_vars, dtype): - super(Gradient, self).__init__() + super(Gradient, self).__init__(dtype=dtype) self.model_vars = model_vars - self.grad_dtype = dtype @abc.abstractmethod def call(self, inputs, **kwargs): @@ -91,13 +90,13 @@ def _b_byobs(): elif compute_a and not compute_b: fim_a = _a_byobs() - fim_b = tf.zeros(fim_a.get_shape(), self.grad_dtype) + fim_b = tf.zeros(fim_a.get_shape(), self.dtype) elif not compute_a and compute_b: - fim_a = tf.zeros(fim_a.get_shape(), self.grad_dtype) + fim_a = tf.zeros(fim_a.get_shape(), self.dtype) fim_b = _b_byobs() else: - fim_a = tf.zeros_like(self.model_vars.a_var, dtype=self.grad_dtype) - fim_b = tf.zeros_like(self.model_vars.b_var, dtype=self.grad_dtype) + fim_a = tf.zeros_like(self.model_vars.a_var, dtype=self.dtype) + fim_b = tf.zeros_like(self.model_vars.b_var, dtype=self.dtype) if concat: fim = tf.concat([fim_a, fim_b], axis=1) @@ -199,13 +198,13 @@ def _b_byobs(): j_b = _b_byobs() elif compute_a and not compute_b: j_a = _a_byobs() - j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) + j_b = tf.zeros((j_a.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) elif not compute_a and compute_b: j_b = _b_byobs() - j_a = tf.zeros((j_b.get_shape()[0], self.model_vars.b_var.get_shape()[0]), dtype=self.grad_dtype) + j_a = tf.zeros((j_b.get_shape()[0], 
self.model_vars.b_var.get_shape()[0]), dtype=self.dtype) else: - j_a = tf.transpose(tf.zeros_like(self.model_vars.a_var, dtype=self.grad_dtype)) - j_b = tf.transpose(tf.zeros_like(self.model_vars.b_var, dtype=self.grad_dtype)) + j_a = tf.transpose(tf.zeros_like(self.model_vars.a_var, dtype=self.dtype)) + j_b = tf.transpose(tf.zeros_like(self.model_vars.b_var, dtype=self.dtype)) if concat: j = tf.concat([j_a, j_b], axis=1) @@ -347,19 +346,19 @@ def _ab_byobs_batched(): h_ba = tf.transpose(h_ab, perm=[0, 2, 1]) elif compute_a and not compute_b: h_aa = _aa_byobs_batched() - h_bb = tf.zeros_like(h_aa, dtype=self.grad_dtype) - h_ab = tf.zeros_like(h_aa, dtype=self.grad_dtype) - h_ba = tf.zeros_like(h_aa, dtype=self.grad_dtype) + h_bb = tf.zeros_like(h_aa, dtype=self.dtype) + h_ab = tf.zeros_like(h_aa, dtype=self.dtype) + h_ba = tf.zeros_like(h_aa, dtype=self.dtype) elif not compute_a and compute_b: h_bb = _bb_byobs_batched() - h_aa = tf.zeros_like(h_bb, dtype=self.grad_dtype) - h_ab = tf.zeros_like(h_bb, dtype=self.grad_dtype) - h_ba = tf.zeros_like(h_bb, dtype=self.grad_dtype) + h_aa = tf.zeros_like(h_bb, dtype=self.dtype) + h_ab = tf.zeros_like(h_bb, dtype=self.dtype) + h_ba = tf.zeros_like(h_bb, dtype=self.dtype) else: - h_aa = tf.zeros((), dtype=self.grad_dtype) - h_bb = tf.zeros((), dtype=self.grad_dtype) - h_ab = tf.zeros((), dtype=self.grad_dtype) - h_ba = tf.zeros((), dtype=self.grad_dtype) + h_aa = tf.zeros((), dtype=self.dtype) + h_bb = tf.zeros((), dtype=self.dtype) + h_ab = tf.zeros((), dtype=self.dtype) + h_ba = tf.zeros((), dtype=self.dtype) if concat: h = tf.concat( From 2e1557c647a33ecf4f730b1f312dff49a8b871a1 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:33:13 +0200 Subject: [PATCH 111/124] jumpt to scale update if loc already converged --- batchglm/train/tf2/base_glm/estimator.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 60414792..c6d8253b 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -82,7 +82,7 @@ def _train( # set necessary attributes self.noise_model = noise_model optim = optim_algo.lower() - self.irls_algo = optim in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar_tr'] + self.irls_algo = optim in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar_tr', 'irls_tr_gd_tr'] self.nr_algo = optim in ['nr', 'nr_tr'] ################################################ @@ -231,6 +231,9 @@ def convergence_decision(num_converged, train_step): train_step += 1 epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq + #print(np.where(self.model.model_vars.remaining_features)[0]) + if np.all(self.model.model_vars.converged): + epochs_until_b_update = 0 # store some useful stuff for benchmarking purposes. 
if benchmark: t1_epoch = time.time() @@ -348,7 +351,7 @@ def get_optimizer_object(self, optimizer: str, learning_rate): "dtype": self.dtype, "model": self.model, "name": optimizer, - "trusted_region_mode": tr_mode, + "tr_mode": tr_mode, "n_obs": self.input_data.num_observations } if optimizer.startswith('irls'): From a61b0dd10eb37fc64717890b6304049df51bd543 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:34:43 +0200 Subject: [PATCH 112/124] add support for irls(_tr)_gd_tr --- batchglm/train/tf2/base_glm/convergence.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py index 77734bd6..0dd04c79 100644 --- a/batchglm/train/tf2/base_glm/convergence.py +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -124,16 +124,16 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba # In case we use irls_tr/irls_gd_tr or nr_tr, we can also utilize the trusted region radius. # For now it must not be below the threshold for the X step of the loc model. - if hasattr(optimizer_object, 'trusted_region_mode') \ - and optimizer_object.trusted_region_mode: + if hasattr(optimizer_object, 'tr_mode') and optimizer_object.tr_mode: converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC - if hasattr(optimizer_object, 'tr_radius_b') and self.estimator._train_scale: - converged_tr_b = \ - optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE - epoch_tr_converged_b = not_converged_b & converged_tr_b - epoch_step_converged_b |= epoch_tr_converged_b + print(converged_tr[self.estimator.model.model_vars.remaining_features]) epoch_tr_converged = not_converged_a & converged_tr epoch_step_converged_a |= epoch_tr_converged + if hasattr(optimizer_object, 'tr_mode_b') and optimizer_object.tr_mode_b and self.estimator._train_scale: + converged_tr_b = optimizer_object.tr_radius_b.numpy() < pkg_constants.TRTOL_BY_FEATURE_SCALE + epoch_tr_converged_b = not_converged_b & converged_tr_b + epoch_step_converged_b |= epoch_tr_converged_b + # print('tr: ', epoch_tr_converged[0], epoch_tr_converged_b[0]) # print(self.estimator.model.model_vars.converged[0], self.estimator.model.model_vars.updated[0]) # print(self.estimator.model.model_vars.converged_b[0], self.estimator.model.model_vars.updated_b[0]) From 7264bf8613f2116a5b7ab5b98a95927a6429f6b1 Mon Sep 17 00:00:00 2001 From: picciama Date: Tue, 5 May 2020 19:35:05 +0200 Subject: [PATCH 113/124] rm Loss --- batchglm/train/tf2/base_glm/__init__.py | 2 +- batchglm/train/tf2/base_glm/external.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/__init__.py b/batchglm/train/tf2/base_glm/__init__.py index 08f4e7dd..699b1bce 100644 --- a/batchglm/train/tf2/base_glm/__init__.py +++ b/batchglm/train/tf2/base_glm/__init__.py @@ -1,5 +1,5 @@ from .processModel import ProcessModelGLM -from .model import GLM, LossGLM +from .model import GLM from .estimator import Estimator from .vars import ModelVarsGLM diff --git a/batchglm/train/tf2/base_glm/external.py b/batchglm/train/tf2/base_glm/external.py index 9188d2b0..db1c8e0e 100644 --- a/batchglm/train/tf2/base_glm/external.py +++ b/batchglm/train/tf2/base_glm/external.py @@ -1,5 +1,5 @@ from batchglm.train.tf2.base import ProcessModelBase, ModelBase, TFEstimator -from batchglm.train.tf2.base import OptimizerBase, LossBase +from batchglm.train.tf2.base import OptimizerBase #from 
batchglm.train.tf2.glm_nb import NR, IRLS from batchglm.models.base_glm import InputDataGLM, _ModelGLM, _EstimatorGLM From a7539b0c3479ccdf0956dbc728fba43a6486f93a Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:15:30 +0200 Subject: [PATCH 114/124] alpha0 added for Armijo line search --- batchglm/pkg_constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/batchglm/pkg_constants.py b/batchglm/pkg_constants.py index 8d8c4cc3..450a8dba 100644 --- a/batchglm/pkg_constants.py +++ b/batchglm/pkg_constants.py @@ -37,6 +37,7 @@ WOLFE_C1 = 1e-3 WOLFE_C2 = 0.99 +ALPHA0 = 100 try: import tensorflow as tf From b0e6bd46b00805166923addd4a39b6e9f765e415 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:24:25 +0200 Subject: [PATCH 115/124] bugfix: scale grads zero if b_update_freq changes --- batchglm/train/tf2/base_glm/estimator.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index c6d8253b..88e2c7c6 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -82,7 +82,7 @@ def _train( # set necessary attributes self.noise_model = noise_model optim = optim_algo.lower() - self.irls_algo = optim in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar_tr', 'irls_tr_gd_tr'] + self.irls_algo = optim.startswith('irls') self.nr_algo = optim in ['nr', 'nr_tr'] @@ -171,7 +171,6 @@ def convergence_decision(num_converged, train_step): ############################################ # 2. Update the parameters self.update(results, epoch_set, batch_features, epochs_until_b_update == 0) - ############################################ # 3. calculate new ll, jacs, hessian/fim compute_b = epochs_until_b_update < 2 @@ -231,9 +230,13 @@ def convergence_decision(num_converged, train_step): train_step += 1 epochs_until_b_update = (epochs_until_b_update + b_update_freq - 1) % b_update_freq - #print(np.where(self.model.model_vars.remaining_features)[0]) - if np.all(self.model.model_vars.converged): - epochs_until_b_update = 0 + + # make sure loc is not updated any longer if completely converged + if b_update_freq > 1 and epochs_until_b_update > 1: + if np.all(self.model.model_vars.converged): + epochs_until_b_update = 1 # must not be 0: scale grads not yet calculated + b_update_freq = 1 # from now on, calc scale grads in each step + # store some useful stuff for benchmarking purposes.
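For context on the new ALPHA0 constant above: it seeds the step length of the Armijo line search used for single-coefficient scale models (see update_b_ar in PATCH 119 below). The sketch below shows the sufficient-decrease (first Wolfe) test with plain backtracking, assuming the standard Armijo rule and reusing the WOLFE_C1 = 1e-3 constant; the toy objective and the halving schedule are made up for illustration, not taken from the package:

# illustration only: Armijo sufficient-decrease test with simple backtracking
def armijo_accept(phi0, phi_alpha, alpha, derphi0, c1=1e-3):
    # accept the step if the objective drops at least c1 * alpha * slope
    return phi_alpha <= phi0 + c1 * alpha * derphi0

phi = lambda a: (a - 3.0) ** 2      # toy objective, minimum at alpha = 3
derphi0 = -6.0                      # derivative of phi at alpha = 0
alpha = 100.0                       # start from an ALPHA0-like step length
while not armijo_accept(phi(0.0), phi(alpha), alpha, derphi0) and alpha > 1e-8:
    alpha /= 2.0                    # halve the step until the test passes
print("accepted step length:", alpha)   # 3.125 for this toy objective

In the optimizer itself the slope is the normalized scale-model jacobian (jac_b / n_obs) and the step is refined by interpolation (quadratic, then cubic) rather than plain halving, as visible in update_b_ar below.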
if benchmark: t1_epoch = time.time() From 7c8c0ba7a1456d29c40ea1cb06d9e5be138ebfe0 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:27:27 +0200 Subject: [PATCH 116/124] updated known optimizer strings --- batchglm/train/tf2/base_glm/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/model.py b/batchglm/train/tf2/base_glm/model.py index 396abb8e..378196cb 100644 --- a/batchglm/train/tf2/base_glm/model.py +++ b/batchglm/train/tf2/base_glm/model.py @@ -64,7 +64,7 @@ def setMethod(self, optimizer: str): elif optimizer in ['nr', 'nr_tr']: self._calc = self._calc_hessians - elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_ar_tr', 'irls_tr_gd_tr']: + elif optimizer in ['irls', 'irls_tr', 'irls_gd', 'irls_gd_tr', 'irls_ar', 'irls_tr_ar', 'irls_tr_gd_tr']: self._calc = self._calc_fim else: assert False, ("Unrecognized optimizer: %s", optimizer) From 0082b821ea999382823fc8db1a3768bdf72cdba6 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:28:49 +0200 Subject: [PATCH 117/124] style issue fix --- batchglm/train/tf2/glm_beta/layers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/batchglm/train/tf2/glm_beta/layers.py b/batchglm/train/tf2/glm_beta/layers.py index eb84bcb6..0cfc0ef9 100644 --- a/batchglm/train/tf2/glm_beta/layers.py +++ b/batchglm/train/tf2/glm_beta/layers.py @@ -38,15 +38,15 @@ class Likelihood(LikelihoodGLM, ProcessModel): def _ll(self, eta_loc, eta_scale, loc, scale, x): if isinstance(x, tf.SparseTensor): - one_minus_x = -tf.sparse.add(x, -tf.ones_like(loc)) + one_minus_x = tf.negative(tf.sparse.add(x, tf.negative(tf.ones_like(loc)))) else: one_minus_x = 1 - x one_minus_loc = 1 - loc log_probs = tf.math.lgamma(scale) - tf.math.lgamma(loc * scale) \ - - tf.math.lgamma(one_minus_loc * scale) \ - + (scale * loc - 1) * tf.math.log(x) \ - + (one_minus_loc * scale - 1) * tf.math.log(one_minus_x) + - tf.math.lgamma(one_minus_loc * scale) \ + + (scale * loc - 1) * tf.math.log(x) \ + + (one_minus_loc * scale - 1) * tf.math.log(one_minus_x) log_probs = self.tf_clip_param(log_probs, "log_probs") From 621083da639134bf9d579402947692bc9ddb1184 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:29:17 +0200 Subject: [PATCH 118/124] rm IRLS_LS strategy --- batchglm/train/tf2/glm_nb/training_strategies.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index 030b3c8a..858ec8a6 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -38,10 +38,3 @@ class TrainingStrategies(Enum): "optim_algo": "adam", }, ] - IRLS_LS = [ - { - "convergence_criteria": "all_converged", - "use_batching": False, - "optim_algo": "irls_ls_tr", - }, - ] From fd7ca8c93ca5ce53cb6cd47435da8ea9daacec9c Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:34:03 +0200 Subject: [PATCH 119/124] bugfix: NaNs in armijo steplength --- batchglm/train/tf2/glm_nb/optim.py | 60 +++++++++++++----------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/batchglm/train/tf2/glm_nb/optim.py b/batchglm/train/tf2/glm_nb/optim.py index 97301fe7..ebab6441 100644 --- a/batchglm/train/tf2/glm_nb/optim.py +++ b/batchglm/train/tf2/glm_nb/optim.py @@ -10,12 +10,10 @@ def __init__(self, dtype, tr_mode, model, name, n_obs, intercept_scale): self.tr_mode_b = False if name.startswith('irls_tr'): 
parent_tr_mode = True # for loc model - if name.startswith('irls_tr_gd'): - self.update_b_func = self.update_b_gd - elif name.startswith('irls_gd'): + if name in ['irls_tr_gd_tr', 'irls_gd_tr', 'irls_gd', 'irls_tr_gd']: self.update_b_func = self.update_b_gd - elif name in ['irls_ar_tr', 'irls_ar']: - assert intercept_scale, "Line search (IRLS_AR_TR) is only available" \ + elif name in ['irls_ar', 'irls_tr_ar']: + assert intercept_scale, "Line search (armijo) is only available" \ "for scale models with a single coefficient (intercept scale)." self.update_b_func = self.update_b_ar else: @@ -163,23 +161,18 @@ def update_b_gd(self, inputs, batch_features, is_batched): batch_features=batch_features, update_theta=update_theta) - #print(self.tr_radius_b[self.model.model_vars.remaining_features]) - return tf.where(update_theta, tr_proposed_vector_b, tf.zeros_like(tr_proposed_vector_b)) def update_b_ar(self, inputs, batch_features, is_batched, alpha0=None): - - c1 = pkg_constants.TRUST_REGION_ETA1 x_batches, log_probs, _, jac_b = inputs jac_b = tf.reshape(jac_b, [jac_b.shape[0]]) - #jac_b = tf.negative(jac_b) direction = -tf.sign(jac_b) derphi0 = jac_b / self.n_obs if alpha0 is None: - alpha0 = tf.ones_like(jac_b) * pkg_constants.TRUST_REGION_RADIUS_INIT_SCALE # self.tr_radius_b + alpha0 = tf.ones_like(jac_b) * pkg_constants.ALPHA0 original_params_b_copy = self.model.params_copy[-1] - #print(direction[0].numpy(), jac_b[0].numpy()) + def phi(alpha): multiplier = tf.multiply(alpha, direction) new_scale_params = tf.add(original_params_b_copy, multiplier) @@ -187,14 +180,15 @@ def phi(alpha): new_likelihood = None for i, x_batch in enumerate(x_batches): log_likelihood = self.model.calc_ll([*x_batch])[0] - new_likelihood = log_likelihood if i == 0 else tf.math.add(new_likelihood, log_likelihood) + new_likelihood = log_likelihood if i == 0 else \ + tf.math.add(new_likelihood, log_likelihood) new_likelihood = self._norm_neg_log_likelihood(new_likelihood) return new_likelihood - current_likelihood = self._norm_neg_log_likelihood(log_probs) + current_likelihood = self._norm_neg_log_likelihood(log_probs) new_likelihood = phi(alpha0) - #print(new_likelihood, current_likelihood) beneficial = self.wolfe1(current_likelihood, new_likelihood, alpha0, derphi0) + if tf.reduce_all(beneficial): # are all beneficial? 
updated = beneficial if batch_features: @@ -202,15 +196,13 @@ def phi(alpha): indices = tf.where(self.model.model_vars.remaining_features) updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated - # self.tr_radius_b.assign(alpha0) return tf.multiply(alpha0, direction) - alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / (new_likelihood - current_likelihood - derphi0 * alpha0) + divisor = new_likelihood - current_likelihood - derphi0 * alpha0 + alpha1 = tf.negative(derphi0) * alpha0**2 / 2 / divisor alpha1 = tf.where(beneficial, alpha0, alpha1) new_likelihood2 = phi(alpha1) - #print(new_likelihood2, current_likelihood) beneficial = self.wolfe1(current_likelihood, new_likelihood2, alpha1, derphi0) - #print(beneficial) if tf.reduce_all(beneficial): updated = beneficial if batch_features: @@ -218,10 +210,17 @@ def phi(alpha): indices = tf.where(self.model.model_vars.remaining_features) updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated - # self.tr_radius_b.assign(alpha1) return tf.multiply(alpha1, direction) - while tf.reduce_any(alpha1 > 0): + if not tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + # catch in case it doesn't enter the loop. + new_scale_params = tf.where(beneficial, self.model.params_copy[-1], original_params_b_copy) + self.model.params_copy[-1].assign(new_scale_params) + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) + return tf.multiply(alpha1, direction) + + while tf.reduce_any(alpha1 > pkg_constants.XTOL_BY_FEATURE_SCALE): + factor = alpha0**2 * alpha1**2 * (alpha1-alpha0) a = alpha0**2 * (new_likelihood2 - current_likelihood - derphi0 * alpha1) - \ alpha1**2 * (new_likelihood - current_likelihood - derphi0 * alpha0) @@ -233,16 +232,11 @@ def phi(alpha): alpha2 = (-b + tf.sqrt(tf.abs(tf.square(b) - 3 * a * derphi0))) / (3 * a) alpha2 = tf.where(beneficial, alpha1, alpha2) - alpha2 = tf.clip_by_value(alpha2, clip_value_min=0, clip_value_max=np.inf) - #print(alpha2) - if tf.reduce_all(alpha2 == 0): - #print('Minimum allowed step size reached for all features.') - self.model.model_vars.updated_b = np.zeros(self.model.model_vars.n_features, dtype=np.bool) - #print(alpha2) + idx_to_clip = tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0) + alpha2 = tf.where(idx_to_clip, tf.zeros_like(alpha2), alpha2) new_likelihood3 = phi(alpha2) - #print(new_likelihood3, current_likelihood) beneficial = self.wolfe1(current_likelihood, new_likelihood3, alpha2, derphi0) - #print(beneficial) + if tf.reduce_all(beneficial): updated = beneficial if batch_features: @@ -250,23 +244,19 @@ def phi(alpha): indices = tf.where(self.model.model_vars.remaining_features) updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) self.model.model_vars.updated_b = updated - # self.tr_radius_b.assign(alpha1) return tf.multiply(alpha2, direction) step_diff_greater_half_alpha1 = (alpha1 - alpha2) > alpha1 / 2 ratio = (1 - alpha2/alpha1) < 0.96 set_back = tf.logical_or(step_diff_greater_half_alpha1, ratio) alpha2 = tf.where(set_back, alpha1 / 2, alpha2) - alpha2 = tf.clip_by_value(alpha2, clip_value_min=0, clip_value_max=np.inf) - #if step_diff or ratio: - # alpha2 = alpha1 / 2 + alpha2 = tf.where(tf.logical_or(tf.math.is_nan(alpha2), alpha2 < 0), tf.zeros_like(alpha2), alpha2) alpha0 = alpha1 alpha1 = alpha2 new_likelihood = new_likelihood2 new_likelihood2 = new_likelihood3 - # self.tr_radius_b.assign(alpha2) new_scale_params = tf.where(beneficial, 
self.model.params_copy[-1], original_params_b_copy) self.model.params_copy[-1].assign(new_scale_params) updated = beneficial @@ -274,7 +264,7 @@ def phi(alpha): n_features = self.model.model_vars.n_features indices = tf.where(self.model.model_vars.remaining_features) updated = tf.scatter_nd(indices, beneficial, shape=(n_features,)) - self.model.model_vars.updated_b |= updated.numpy() + self.model.model_vars.updated_b = np.ones_like(self.model.model_vars.updated_b) return tf.multiply(alpha2, direction) def wolfe1(self, current_likelihood, new_likelihood, alpha, jacobian): From ba7e1a963ab78db2db33868125d65a226947f744 Mon Sep 17 00:00:00 2001 From: picciama Date: Fri, 8 May 2020 12:50:04 +0200 Subject: [PATCH 120/124] default b_update-freq set to 1 --- batchglm/train/tf2/glm_nb/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batchglm/train/tf2/glm_nb/estimator.py b/batchglm/train/tf2/glm_nb/estimator.py index cb90e6ca..4cabc8ac 100644 --- a/batchglm/train/tf2/glm_nb/estimator.py +++ b/batchglm/train/tf2/glm_nb/estimator.py @@ -99,7 +99,7 @@ def train( featurewise: bool = True, benchmark: bool = False, maxiter: int = 1, - b_update_freq = 5 + b_update_freq = 1 ): if self.model is None: self.model = NBGLM( From 64f46c668ed73b8b22d49dc4a6914b46f4efcf5b Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 11 Jun 2020 18:14:59 +0200 Subject: [PATCH 121/124] bugfix for #44: fisher_inv not saved in property --- batchglm/train/tf2/base_glm/estimator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 8fae622f..004bb2ba 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -280,7 +280,8 @@ def convergence_decision(num_converged, train_step): num_non_invertible = n_features - len(invertible) if num_non_invertible > 0: logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features.") - fisher_inv[invertible] = np.linalg.inv(- self._hessian[invertible]) + fisher_inv[invertible] = np.linalg.inv(-self._hessian[invertible]) + self._fisher_inv = fisher_inv.copy() self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale def update_params(self, batches, results, batch_features, update_func): From 5d03ee0785b2478bd2f88c83e2cc28d739f034a9 Mon Sep 17 00:00:00 2001 From: picciama Date: Thu, 11 Jun 2020 18:18:49 +0200 Subject: [PATCH 122/124] removed TODO after bugfix #44 --- batchglm/train/tf2/base_glm/estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batchglm/train/tf2/base_glm/estimator.py b/batchglm/train/tf2/base_glm/estimator.py index 004bb2ba..9db1f1b1 100644 --- a/batchglm/train/tf2/base_glm/estimator.py +++ b/batchglm/train/tf2/base_glm/estimator.py @@ -274,12 +274,12 @@ def convergence_decision(num_converged, train_step): self._log_likelihood = results[0].numpy() self._jacobian = tf.reduce_sum(tf.abs(results[1] / n_obs), axis=1) self._hessian = - results[2].numpy() - # TODO: maybe report fisher inf here in the future instead of inverted hessian. 
+ fisher_inv = np.zeros_like(self._hessian) invertible = np.where(np.linalg.cond(self._hessian, p=None) < 1 / sys.float_info.epsilon)[0] num_non_invertible = n_features - len(invertible) if num_non_invertible > 0: - logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features.") + logger.warning(f"fisher_inv could not be calculated for {num_non_invertible} features!") fisher_inv[invertible] = np.linalg.inv(-self._hessian[invertible]) self._fisher_inv = fisher_inv.copy() self.model.hessian.compute_b = self.model.compute_b # reset if not self._train_scale From 699d26dfc7754d80362cfdae5eef1aa2849ebf7b Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 27 Sep 2021 10:09:51 +0200 Subject: [PATCH 123/124] simplify masking --- batchglm/train/tf2/base_glm/generator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/batchglm/train/tf2/base_glm/generator.py b/batchglm/train/tf2/base_glm/generator.py index 97349423..1d061fcd 100644 --- a/batchglm/train/tf2/base_glm/generator.py +++ b/batchglm/train/tf2/base_glm/generator.py @@ -74,7 +74,6 @@ def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): and returns the reduced element.""" not_converged = self.estimator.model.model_vars.remaining_features - """ if self.sparse: feature_columns = tf.sparse.split( x_tensor, @@ -84,8 +83,7 @@ def _featurewise_batch(self, x_tensor, dloc, dscale, size_factors): feature_columns = [feature_columns[i] for i in not_converged_idx] x_tensor = tf.sparse.concat(axis=1, sp_inputs=feature_columns) else: - """ - x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) + x_tensor = tf.boolean_mask(tensor=x_tensor, mask=not_converged, axis=1) return x_tensor, dloc, dscale, size_factors def new_epoch_set(self, batch_features: bool = False): @@ -93,11 +91,11 @@ def new_epoch_set(self, batch_features: bool = False): dataset_to_return = self.dataset.take(self.num_batches) if self.sparse: - if batch_features: - dataset_to_return = dataset_to_return.map(self._featurewise_batch_sparse) dataset_to_return = dataset_to_return.map( lambda ivs_tuple, loc, scale, sf: (tf.SparseTensor(*ivs_tuple), loc, scale, sf) ) + if batch_features: + dataset_to_return = dataset_to_return.map(self._featurewise_batch) else: if batch_features: dataset_to_return = dataset_to_return.map(self._featurewise_batch) From e85a881425a20c8a6b32c2ce99bb3ad48fd71899 Mon Sep 17 00:00:00 2001 From: picciama Date: Mon, 27 Sep 2021 10:18:41 +0200 Subject: [PATCH 124/124] cleanup --- batchglm/train/tf2/base_glm/convergence.py | 1 - batchglm/train/tf2/base_glm/optim.py | 1 - batchglm/train/tf2/glm_nb/training_strategies.py | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/batchglm/train/tf2/base_glm/convergence.py b/batchglm/train/tf2/base_glm/convergence.py index 0dd04c79..aedd8d64 100644 --- a/batchglm/train/tf2/base_glm/convergence.py +++ b/batchglm/train/tf2/base_glm/convergence.py @@ -126,7 +126,6 @@ def calculate_convergence(self, results, jac_normalization, optimizer_object, ba # For now it must not be below the threshold for the X step of the loc model. 
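Aside on the trust-region shortcut handled here: a feature whose trust-region radius has collapsed below the per-feature tolerance cannot make further progress, so it is flagged as converged even if the gradient criterion has not fired yet. A compact numpy sketch of that masking, with made-up radii and a stand-in tolerance value (the real thresholds live in pkg_constants):

import numpy as np

# illustration only: radius-based convergence masking as in calculate_convergence
TRTOL_BY_FEATURE_LOC = 1e-10                        # stand-in for the pkg_constants value
tr_radius = np.array([1e-12, 0.5, 3e-11, 0.2])      # per-feature trust-region radii (loc model)
not_converged_a = np.array([True, True, False, True])
epoch_step_converged_a = np.array([False, False, False, False])

converged_tr = tr_radius < TRTOL_BY_FEATURE_LOC     # radius collapsed -> no further loc progress
epoch_tr_converged = not_converged_a & converged_tr
epoch_step_converged_a |= epoch_tr_converged
print(epoch_step_converged_a)                       # [ True False False False]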
if hasattr(optimizer_object, 'tr_mode') and optimizer_object.tr_mode: converged_tr = optimizer_object.tr_radius.numpy() < pkg_constants.TRTOL_BY_FEATURE_LOC - print(converged_tr[self.estimator.model.model_vars.remaining_features]) epoch_tr_converged = not_converged_a & converged_tr epoch_step_converged_a |= epoch_tr_converged if hasattr(optimizer_object, 'tr_mode_b') and optimizer_object.tr_mode_b and self.estimator._train_scale: diff --git a/batchglm/train/tf2/base_glm/optim.py b/batchglm/train/tf2/base_glm/optim.py index 2bd5670c..7903b372 100644 --- a/batchglm/train/tf2/base_glm/optim.py +++ b/batchglm/train/tf2/base_glm/optim.py @@ -495,7 +495,6 @@ def perform_parameter_update(self, inputs, compute_a=True, compute_b=True, batch batch_features=batch_features, update_theta=update_theta ) - print(self.tr_radius[self.model.model_vars.remaining_features]) def calc_delta_f_actual(self, current_likelihood, new_likelihood, jacobian): eta1 = tf.constant(pkg_constants.TRUST_REGION_ETA1, dtype=self._dtype) diff --git a/batchglm/train/tf2/glm_nb/training_strategies.py b/batchglm/train/tf2/glm_nb/training_strategies.py index 858ec8a6..18c35302 100644 --- a/batchglm/train/tf2/glm_nb/training_strategies.py +++ b/batchglm/train/tf2/glm_nb/training_strategies.py @@ -14,7 +14,7 @@ class TrainingStrategies(Enum): { "convergence_criteria": "all_converged", "use_batching": False, - "optim_algo": "irls_gd_tr", + "optim_algo": "irls_tr_gd_tr", }, ] IRLS_BATCHED = [